diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4566 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 2907, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.7182130584192438e-09, + "logits/chosen": -2.7645790576934814, + "logits/rejected": -2.8125059604644775, + "logps/chosen": -113.67314910888672, + "logps/rejected": -132.0498504638672, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -2.9990971088409424, + "logits/rejected": -3.0227837562561035, + "logps/chosen": -281.044921875, + "logps/rejected": -247.3936309814453, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004584211856126785, + "rewards/margins": 0.00850469246506691, + "rewards/rejected": -0.0039204806089401245, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.88598895072937, + "logits/rejected": -2.917177200317383, + "logps/chosen": -359.26177978515625, + "logps/rejected": -298.42877197265625, + "loss": 0.6845, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.05283154919743538, + "rewards/margins": 0.018162177875638008, + "rewards/rejected": 0.03466937318444252, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -2.9570868015289307, + "logits/rejected": -2.9609949588775635, + "logps/chosen": -326.2544860839844, + "logps/rejected": -289.9393615722656, + "loss": 0.6564, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1515582799911499, + "rewards/margins": 0.05830109864473343, + "rewards/rejected": 0.09325718879699707, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -2.9531846046447754, + "logits/rejected": -2.955566883087158, + "logps/chosen": -376.5739440917969, + "logps/rejected": -331.3490295410156, + "loss": 0.6444, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.3529122769832611, + "rewards/margins": 0.1296483278274536, + "rewards/rejected": 0.2232639044523239, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -2.8878796100616455, + "logits/rejected": -2.9229512214660645, + "logps/chosen": -427.5284118652344, + "logps/rejected": -266.94415283203125, + "loss": 0.6204, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.5041142702102661, + "rewards/margins": 0.29676300287246704, + "rewards/rejected": 0.20735123753547668, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -2.92777943611145, + "logits/rejected": -2.9371728897094727, + "logps/chosen": -318.0414123535156, + "logps/rejected": -265.1334228515625, + "loss": 0.6105, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38802462816238403, + "rewards/margins": 0.31535086035728455, + "rewards/rejected": 0.0726737454533577, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -2.9851737022399902, + "logits/rejected": -3.0005269050598145, + "logps/chosen": -404.5143127441406, + "logps/rejected": -300.8736572265625, + "loss": 0.6069, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5573440790176392, + "rewards/margins": 0.41277560591697693, + "rewards/rejected": 0.14456847310066223, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -2.96441912651062, + "logits/rejected": -2.968987464904785, + "logps/chosen": -314.7437438964844, + "logps/rejected": -254.7586669921875, + "loss": 0.5569, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.45461219549179077, + "rewards/margins": 0.5670984983444214, + "rewards/rejected": -0.1124863252043724, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -3.021219491958618, + "logits/rejected": -3.0178027153015137, + "logps/chosen": -308.31585693359375, + "logps/rejected": -257.63250732421875, + "loss": 0.5296, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.5257282257080078, + "rewards/margins": 0.7057730555534363, + "rewards/rejected": -0.18004484474658966, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -2.8890886306762695, + "logits/rejected": -2.896449327468872, + "logps/chosen": -375.84564208984375, + "logps/rejected": -241.34219360351562, + "loss": 0.5504, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.6213200092315674, + "rewards/margins": 0.8095133900642395, + "rewards/rejected": -0.18819323182106018, + "step": 100 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.936641216278076, + "eval_logits/rejected": -2.935973882675171, + "eval_logps/chosen": -361.9043884277344, + "eval_logps/rejected": -293.7761535644531, + "eval_loss": 0.5406630635261536, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": 0.5287383794784546, + "eval_rewards/margins": 0.7097563743591309, + "eval_rewards/rejected": -0.18101799488067627, + "eval_runtime": 163.7175, + "eval_samples_per_second": 12.216, + "eval_steps_per_second": 0.385, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -2.917739152908325, + "logits/rejected": -2.8890061378479004, + "logps/chosen": -334.1250305175781, + "logps/rejected": -331.29571533203125, + "loss": 0.5741, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4206802248954773, + "rewards/margins": 0.6734089851379395, + "rewards/rejected": -0.25272876024246216, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -2.9665865898132324, + "logits/rejected": -2.970818519592285, + "logps/chosen": -386.2568664550781, + "logps/rejected": -280.7279357910156, + "loss": 0.5533, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.3308308720588684, + "rewards/margins": 0.6611676216125488, + "rewards/rejected": -0.33033671975135803, + "step": 120 + }, + { + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -3.012545347213745, + "logits/rejected": -2.9925591945648193, + "logps/chosen": -353.75469970703125, + "logps/rejected": -290.1478576660156, + "loss": 0.5447, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.47041910886764526, + "rewards/margins": 0.7254467010498047, + "rewards/rejected": -0.2550275921821594, + "step": 130 + }, + { + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.9241251945495605, + "logits/rejected": -2.9576869010925293, + "logps/chosen": -329.7611389160156, + "logps/rejected": -265.63006591796875, + "loss": 0.5113, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.4418914318084717, + "rewards/margins": 0.7908871173858643, + "rewards/rejected": -0.3489956259727478, + "step": 140 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -2.9687321186065674, + "logits/rejected": -2.9832406044006348, + "logps/chosen": -331.42669677734375, + "logps/rejected": -269.3779296875, + "loss": 0.5387, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.38882407546043396, + "rewards/margins": 0.8327142000198364, + "rewards/rejected": -0.4438902735710144, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -2.9920172691345215, + "logits/rejected": -3.013425827026367, + "logps/chosen": -383.51934814453125, + "logps/rejected": -297.9476318359375, + "loss": 0.5083, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.6878620982170105, + "rewards/margins": 1.0685365200042725, + "rewards/rejected": -0.3806745111942291, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -2.9233837127685547, + "logits/rejected": -2.9321510791778564, + "logps/chosen": -339.95745849609375, + "logps/rejected": -280.793701171875, + "loss": 0.5131, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5272036194801331, + "rewards/margins": 0.8315987586975098, + "rewards/rejected": -0.3043951690196991, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -2.9920477867126465, + "logits/rejected": -2.9811954498291016, + "logps/chosen": -265.2094421386719, + "logps/rejected": -254.6926727294922, + "loss": 0.504, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.177840456366539, + "rewards/margins": 0.8774341344833374, + "rewards/rejected": -0.699593722820282, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -3.0027570724487305, + "logits/rejected": -2.987896203994751, + "logps/chosen": -330.7102966308594, + "logps/rejected": -239.6572723388672, + "loss": 0.5611, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.5821124315261841, + "rewards/margins": 1.141722559928894, + "rewards/rejected": -0.5596100687980652, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -3.048879384994507, + "logits/rejected": -2.9993340969085693, + "logps/chosen": -266.72430419921875, + "logps/rejected": -187.27467346191406, + "loss": 0.541, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.2961394786834717, + "rewards/margins": 0.8773609399795532, + "rewards/rejected": -0.5812214612960815, + "step": 200 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.980220317840576, + "eval_logits/rejected": -2.9785656929016113, + "eval_logps/chosen": -360.50030517578125, + "eval_logps/rejected": -297.53515625, + "eval_loss": 0.5220658779144287, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": 0.6691505908966064, + "eval_rewards/margins": 1.2260682582855225, + "eval_rewards/rejected": -0.5569177269935608, + "eval_runtime": 163.6147, + "eval_samples_per_second": 12.224, + "eval_steps_per_second": 0.385, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -2.9797844886779785, + "logits/rejected": -2.9449918270111084, + "logps/chosen": -364.26287841796875, + "logps/rejected": -251.58901977539062, + "loss": 0.4772, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5583639740943909, + "rewards/margins": 1.3936102390289307, + "rewards/rejected": -0.835246205329895, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -2.9557044506073, + "logits/rejected": -2.9637341499328613, + "logps/chosen": -261.53216552734375, + "logps/rejected": -271.5208740234375, + "loss": 0.5707, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.367598295211792, + "rewards/margins": 1.1545054912567139, + "rewards/rejected": -0.7869071960449219, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -3.0034899711608887, + "logits/rejected": -2.991698980331421, + "logps/chosen": -308.8106689453125, + "logps/rejected": -278.55950927734375, + "loss": 0.5827, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.4888441562652588, + "rewards/margins": 1.2474777698516846, + "rewards/rejected": -0.7586336731910706, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -3.0844597816467285, + "logits/rejected": -3.0464837551116943, + "logps/chosen": -385.8021545410156, + "logps/rejected": -253.19869995117188, + "loss": 0.4898, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5026682019233704, + "rewards/margins": 1.1020526885986328, + "rewards/rejected": -0.5993844270706177, + "step": 240 + }, + { + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -3.0472395420074463, + "logits/rejected": -3.0599236488342285, + "logps/chosen": -341.8814697265625, + "logps/rejected": -295.29437255859375, + "loss": 0.5395, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.24311120808124542, + "rewards/margins": 0.9385444521903992, + "rewards/rejected": -0.6954333186149597, + "step": 250 + }, + { + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -3.0285518169403076, + "logits/rejected": -3.0690500736236572, + "logps/chosen": -353.20074462890625, + "logps/rejected": -244.77041625976562, + "loss": 0.6312, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.22555696964263916, + "rewards/margins": 0.8821722269058228, + "rewards/rejected": -1.107729196548462, + "step": 260 + }, + { + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -3.0879526138305664, + "logits/rejected": -3.0506978034973145, + "logps/chosen": -354.5426025390625, + "logps/rejected": -279.86773681640625, + "loss": 0.571, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.26413029432296753, + "rewards/margins": 0.9725528955459595, + "rewards/rejected": -0.7084226012229919, + "step": 270 + }, + { + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -3.0588438510894775, + "logits/rejected": -3.0481762886047363, + "logps/chosen": -339.85675048828125, + "logps/rejected": -285.8063049316406, + "loss": 0.6383, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.30532822012901306, + "rewards/margins": 1.10079026222229, + "rewards/rejected": -0.7954620122909546, + "step": 280 + }, + { + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -3.0889270305633545, + "logits/rejected": -3.0666940212249756, + "logps/chosen": -335.5870666503906, + "logps/rejected": -258.51641845703125, + "loss": 0.5611, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.7204562425613403, + "rewards/margins": 1.0493910312652588, + "rewards/rejected": -0.32893460988998413, + "step": 290 + }, + { + "epoch": 0.31, + "learning_rate": 4.982798165137615e-07, + "logits/chosen": -3.0340983867645264, + "logits/rejected": -3.0090713500976562, + "logps/chosen": -281.38751220703125, + "logps/rejected": -289.4985656738281, + "loss": 0.6034, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.268043577671051, + "rewards/margins": 0.5827728509902954, + "rewards/rejected": -0.3147292733192444, + "step": 300 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -3.035973072052002, + "eval_logits/rejected": -3.0234200954437256, + "eval_logps/chosen": -359.8170166015625, + "eval_logps/rejected": -296.5441589355469, + "eval_loss": 0.5459412932395935, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": 0.737476110458374, + "eval_rewards/margins": 1.1952924728393555, + "eval_rewards/rejected": -0.45781639218330383, + "eval_runtime": 164.3219, + "eval_samples_per_second": 12.171, + "eval_steps_per_second": 0.383, + "step": 300 + }, + { + "epoch": 0.32, + "learning_rate": 4.963685015290519e-07, + "logits/chosen": -3.1333563327789307, + "logits/rejected": -3.0529465675354004, + "logps/chosen": -394.2475280761719, + "logps/rejected": -328.84796142578125, + "loss": 0.5995, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5251134634017944, + "rewards/margins": 0.8031543493270874, + "rewards/rejected": -0.27804094552993774, + "step": 310 + }, + { + "epoch": 0.33, + "learning_rate": 4.944571865443424e-07, + "logits/chosen": -3.115387201309204, + "logits/rejected": -3.104794502258301, + "logps/chosen": -299.5379943847656, + "logps/rejected": -227.14413452148438, + "loss": 0.5504, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5498681664466858, + "rewards/margins": 1.550806999206543, + "rewards/rejected": -1.0009387731552124, + "step": 320 + }, + { + "epoch": 0.34, + "learning_rate": 4.92545871559633e-07, + "logits/chosen": -3.1059436798095703, + "logits/rejected": -3.110661029815674, + "logps/chosen": -405.8400573730469, + "logps/rejected": -290.01934814453125, + "loss": 0.5355, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5642995834350586, + "rewards/margins": 1.1246757507324219, + "rewards/rejected": -0.5603762269020081, + "step": 330 + }, + { + "epoch": 0.35, + "learning_rate": 4.906345565749235e-07, + "logits/chosen": -3.0694103240966797, + "logits/rejected": -3.075610876083374, + "logps/chosen": -301.7900695800781, + "logps/rejected": -273.09100341796875, + "loss": 0.6496, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.26571425795555115, + "rewards/margins": 0.967176079750061, + "rewards/rejected": -0.701461672782898, + "step": 340 + }, + { + "epoch": 0.36, + "learning_rate": 4.88723241590214e-07, + "logits/chosen": -3.078815460205078, + "logits/rejected": -3.097691059112549, + "logps/chosen": -373.6755065917969, + "logps/rejected": -278.1918640136719, + "loss": 0.5251, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5501624941825867, + "rewards/margins": 1.1470292806625366, + "rewards/rejected": -0.5968667268753052, + "step": 350 + }, + { + "epoch": 0.37, + "learning_rate": 4.868119266055046e-07, + "logits/chosen": -3.101353168487549, + "logits/rejected": -3.1290316581726074, + "logps/chosen": -370.21112060546875, + "logps/rejected": -328.2227783203125, + "loss": 0.5218, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.4693407118320465, + "rewards/margins": 1.2798802852630615, + "rewards/rejected": -0.8105396032333374, + "step": 360 + }, + { + "epoch": 0.38, + "learning_rate": 4.849006116207951e-07, + "logits/chosen": -3.108405113220215, + "logits/rejected": -3.108668804168701, + "logps/chosen": -357.5787048339844, + "logps/rejected": -308.5846252441406, + "loss": 0.5781, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.5667105317115784, + "rewards/margins": 1.561586618423462, + "rewards/rejected": -0.9948760271072388, + "step": 370 + }, + { + "epoch": 0.39, + "learning_rate": 4.829892966360856e-07, + "logits/chosen": -3.1076834201812744, + "logits/rejected": -3.139901638031006, + "logps/chosen": -372.7229919433594, + "logps/rejected": -321.50347900390625, + "loss": 0.5748, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.595112681388855, + "rewards/margins": 1.7188622951507568, + "rewards/rejected": -1.1237497329711914, + "step": 380 + }, + { + "epoch": 0.4, + "learning_rate": 4.810779816513762e-07, + "logits/chosen": -3.0231597423553467, + "logits/rejected": -3.055475950241089, + "logps/chosen": -308.81109619140625, + "logps/rejected": -280.67572021484375, + "loss": 0.594, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.022154245525598526, + "rewards/margins": 1.1715147495269775, + "rewards/rejected": -1.1493604183197021, + "step": 390 + }, + { + "epoch": 0.41, + "learning_rate": 4.791666666666667e-07, + "logits/chosen": -2.9821434020996094, + "logits/rejected": -2.990657329559326, + "logps/chosen": -350.4073791503906, + "logps/rejected": -234.08291625976562, + "loss": 0.5944, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.6456303000450134, + "rewards/margins": 1.8281257152557373, + "rewards/rejected": -1.182495355606079, + "step": 400 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.96209979057312, + "eval_logits/rejected": -2.963911294937134, + "eval_logps/chosen": -362.2125549316406, + "eval_logps/rejected": -300.90362548828125, + "eval_loss": 0.5573462247848511, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": 0.49792128801345825, + "eval_rewards/margins": 1.391687273979187, + "eval_rewards/rejected": -0.8937660455703735, + "eval_runtime": 163.7646, + "eval_samples_per_second": 12.213, + "eval_steps_per_second": 0.385, + "step": 400 + }, + { + "epoch": 0.42, + "learning_rate": 4.772553516819572e-07, + "logits/chosen": -2.9685988426208496, + "logits/rejected": -2.9469170570373535, + "logps/chosen": -359.9443054199219, + "logps/rejected": -339.13482666015625, + "loss": 0.7753, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5800348520278931, + "rewards/margins": 1.4965015649795532, + "rewards/rejected": -0.9164667129516602, + "step": 410 + }, + { + "epoch": 0.43, + "learning_rate": 4.753440366972477e-07, + "logits/chosen": -3.039097785949707, + "logits/rejected": -3.0352489948272705, + "logps/chosen": -279.19451904296875, + "logps/rejected": -275.61077880859375, + "loss": 0.5719, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.08171078562736511, + "rewards/margins": 0.8535135388374329, + "rewards/rejected": -0.7718027234077454, + "step": 420 + }, + { + "epoch": 0.44, + "learning_rate": 4.7343272171253825e-07, + "logits/chosen": -3.0542099475860596, + "logits/rejected": -3.048107624053955, + "logps/chosen": -304.2041015625, + "logps/rejected": -275.24664306640625, + "loss": 0.5521, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.6358417272567749, + "rewards/margins": 1.4337527751922607, + "rewards/rejected": -0.7979112863540649, + "step": 430 + }, + { + "epoch": 0.45, + "learning_rate": 4.715214067278288e-07, + "logits/chosen": -2.9832911491394043, + "logits/rejected": -2.9696083068847656, + "logps/chosen": -351.0896911621094, + "logps/rejected": -278.2879333496094, + "loss": 0.5257, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.4420256018638611, + "rewards/margins": 1.773047685623169, + "rewards/rejected": -1.331022024154663, + "step": 440 + }, + { + "epoch": 0.46, + "learning_rate": 4.696100917431192e-07, + "logits/chosen": -3.115874767303467, + "logits/rejected": -3.0773837566375732, + "logps/chosen": -392.2452392578125, + "logps/rejected": -324.62640380859375, + "loss": 0.5536, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2861310839653015, + "rewards/margins": 0.8551927804946899, + "rewards/rejected": -0.5690616369247437, + "step": 450 + }, + { + "epoch": 0.47, + "learning_rate": 4.6769877675840974e-07, + "logits/chosen": -3.0585522651672363, + "logits/rejected": -3.089534282684326, + "logps/chosen": -310.84967041015625, + "logps/rejected": -287.9058532714844, + "loss": 0.5614, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.2514137625694275, + "rewards/margins": 1.2147700786590576, + "rewards/rejected": -0.9633563160896301, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 4.6578746177370027e-07, + "logits/chosen": -3.0050368309020996, + "logits/rejected": -3.0113613605499268, + "logps/chosen": -243.838623046875, + "logps/rejected": -224.61404418945312, + "loss": 0.5769, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.10031839460134506, + "rewards/margins": 1.2319433689117432, + "rewards/rejected": -1.1316249370574951, + "step": 470 + }, + { + "epoch": 0.5, + "learning_rate": 4.638761467889908e-07, + "logits/chosen": -3.01200795173645, + "logits/rejected": -2.9829325675964355, + "logps/chosen": -353.6679992675781, + "logps/rejected": -299.7701416015625, + "loss": 0.5141, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.6005850434303284, + "rewards/margins": 2.408433437347412, + "rewards/rejected": -1.807848334312439, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 4.6196483180428133e-07, + "logits/chosen": -3.038440227508545, + "logits/rejected": -3.0429458618164062, + "logps/chosen": -330.0135192871094, + "logps/rejected": -262.1318359375, + "loss": 0.5292, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.4290197491645813, + "rewards/margins": 1.5280876159667969, + "rewards/rejected": -1.0990678071975708, + "step": 490 + }, + { + "epoch": 0.52, + "learning_rate": 4.600535168195718e-07, + "logits/chosen": -3.0223565101623535, + "logits/rejected": -3.0170624256134033, + "logps/chosen": -259.1560363769531, + "logps/rejected": -268.68365478515625, + "loss": 0.5512, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08852599561214447, + "rewards/margins": 1.076027750968933, + "rewards/rejected": -0.9875017404556274, + "step": 500 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -3.0406343936920166, + "eval_logits/rejected": -3.0485074520111084, + "eval_logps/chosen": -362.83642578125, + "eval_logps/rejected": -302.1329650878906, + "eval_loss": 0.5256651043891907, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": 0.4355368912220001, + "eval_rewards/margins": 1.452234148979187, + "eval_rewards/rejected": -1.0166972875595093, + "eval_runtime": 164.1914, + "eval_samples_per_second": 12.181, + "eval_steps_per_second": 0.384, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 4.5814220183486234e-07, + "logits/chosen": -2.971991777420044, + "logits/rejected": -2.9626731872558594, + "logps/chosen": -387.75872802734375, + "logps/rejected": -341.24224853515625, + "loss": 0.5611, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.0966944545507431, + "rewards/margins": 0.9770743250846863, + "rewards/rejected": -1.0737688541412354, + "step": 510 + }, + { + "epoch": 0.54, + "learning_rate": 4.562308868501529e-07, + "logits/chosen": -2.97809100151062, + "logits/rejected": -3.0156943798065186, + "logps/chosen": -325.83837890625, + "logps/rejected": -321.0384826660156, + "loss": 0.5693, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.27487924695014954, + "rewards/margins": 1.646512746810913, + "rewards/rejected": -1.371633529663086, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 4.543195718654434e-07, + "logits/chosen": -3.0082881450653076, + "logits/rejected": -3.003408193588257, + "logps/chosen": -274.6020812988281, + "logps/rejected": -240.13998413085938, + "loss": 0.5953, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0502743124961853, + "rewards/margins": 0.5772665739059448, + "rewards/rejected": -0.6275408864021301, + "step": 530 + }, + { + "epoch": 0.56, + "learning_rate": 4.5240825688073394e-07, + "logits/chosen": -3.0475857257843018, + "logits/rejected": -3.0587058067321777, + "logps/chosen": -345.28802490234375, + "logps/rejected": -276.25018310546875, + "loss": 0.559, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6526178121566772, + "rewards/margins": 1.6864182949066162, + "rewards/rejected": -1.033800482749939, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 4.504969418960244e-07, + "logits/chosen": -2.9780993461608887, + "logits/rejected": -3.0339550971984863, + "logps/chosen": -318.60699462890625, + "logps/rejected": -363.83966064453125, + "loss": 0.5182, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3249278664588928, + "rewards/margins": 1.6138547658920288, + "rewards/rejected": -1.2889269590377808, + "step": 550 + }, + { + "epoch": 0.58, + "learning_rate": 4.4858562691131495e-07, + "logits/chosen": -3.0293617248535156, + "logits/rejected": -3.0541815757751465, + "logps/chosen": -355.3965759277344, + "logps/rejected": -341.19097900390625, + "loss": 0.5655, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15244658291339874, + "rewards/margins": 1.1955846548080444, + "rewards/rejected": -1.043138027191162, + "step": 560 + }, + { + "epoch": 0.59, + "learning_rate": 4.466743119266055e-07, + "logits/chosen": -2.955909252166748, + "logits/rejected": -2.966557502746582, + "logps/chosen": -339.918701171875, + "logps/rejected": -312.85992431640625, + "loss": 0.5342, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24385514855384827, + "rewards/margins": 1.2561490535736084, + "rewards/rejected": -1.5000044107437134, + "step": 570 + }, + { + "epoch": 0.6, + "learning_rate": 4.44762996941896e-07, + "logits/chosen": -2.9457859992980957, + "logits/rejected": -2.921659231185913, + "logps/chosen": -361.46905517578125, + "logps/rejected": -314.6666259765625, + "loss": 0.5347, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.31199535727500916, + "rewards/margins": 1.5148388147354126, + "rewards/rejected": -1.202843427658081, + "step": 580 + }, + { + "epoch": 0.61, + "learning_rate": 4.4285168195718655e-07, + "logits/chosen": -2.9674103260040283, + "logits/rejected": -2.9832658767700195, + "logps/chosen": -279.5147705078125, + "logps/rejected": -283.4952697753906, + "loss": 0.5475, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.18109655380249023, + "rewards/margins": 1.081386685371399, + "rewards/rejected": -1.2624832391738892, + "step": 590 + }, + { + "epoch": 0.62, + "learning_rate": 4.40940366972477e-07, + "logits/chosen": -3.038327932357788, + "logits/rejected": -3.081512928009033, + "logps/chosen": -282.9052429199219, + "logps/rejected": -260.5687255859375, + "loss": 0.5879, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0032925487030297518, + "rewards/margins": 0.903986930847168, + "rewards/rejected": -0.9006943702697754, + "step": 600 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.9869041442871094, + "eval_logits/rejected": -2.991122007369995, + "eval_logps/chosen": -362.4848327636719, + "eval_logps/rejected": -301.2572021484375, + "eval_loss": 0.5287741422653198, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": 0.47069627046585083, + "eval_rewards/margins": 1.3998188972473145, + "eval_rewards/rejected": -0.9291225075721741, + "eval_runtime": 164.0279, + "eval_samples_per_second": 12.193, + "eval_steps_per_second": 0.384, + "step": 600 + }, + { + "epoch": 0.63, + "learning_rate": 4.3902905198776756e-07, + "logits/chosen": -3.0266683101654053, + "logits/rejected": -3.0426414012908936, + "logps/chosen": -345.2246398925781, + "logps/rejected": -280.60711669921875, + "loss": 0.5172, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.3867380917072296, + "rewards/margins": 1.5311682224273682, + "rewards/rejected": -1.14443039894104, + "step": 610 + }, + { + "epoch": 0.64, + "learning_rate": 4.371177370030581e-07, + "logits/chosen": -3.0024008750915527, + "logits/rejected": -3.0336501598358154, + "logps/chosen": -340.01483154296875, + "logps/rejected": -288.4037170410156, + "loss": 0.5674, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.09759467095136642, + "rewards/margins": 1.4280248880386353, + "rewards/rejected": -1.330430030822754, + "step": 620 + }, + { + "epoch": 0.65, + "learning_rate": 4.352064220183486e-07, + "logits/chosen": -3.073171377182007, + "logits/rejected": -3.0693984031677246, + "logps/chosen": -268.47442626953125, + "logps/rejected": -253.87173461914062, + "loss": 0.6129, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.08323542028665543, + "rewards/margins": 1.263319969177246, + "rewards/rejected": -1.180084466934204, + "step": 630 + }, + { + "epoch": 0.66, + "learning_rate": 4.3329510703363915e-07, + "logits/chosen": -3.1394124031066895, + "logits/rejected": -3.147449493408203, + "logps/chosen": -316.50323486328125, + "logps/rejected": -256.6443786621094, + "loss": 0.5405, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.0077073900029063225, + "rewards/margins": 1.4150127172470093, + "rewards/rejected": -1.407305359840393, + "step": 640 + }, + { + "epoch": 0.67, + "learning_rate": 4.313837920489297e-07, + "logits/chosen": -3.015110731124878, + "logits/rejected": -3.0439746379852295, + "logps/chosen": -309.4215087890625, + "logps/rejected": -278.88934326171875, + "loss": 0.5239, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.3628634512424469, + "rewards/margins": 1.4821045398712158, + "rewards/rejected": -1.1192409992218018, + "step": 650 + }, + { + "epoch": 0.68, + "learning_rate": 4.2947247706422016e-07, + "logits/chosen": -3.0345845222473145, + "logits/rejected": -2.997607469558716, + "logps/chosen": -311.18719482421875, + "logps/rejected": -289.7060852050781, + "loss": 0.5288, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4103531241416931, + "rewards/margins": 1.0822376012802124, + "rewards/rejected": -1.4925907850265503, + "step": 660 + }, + { + "epoch": 0.69, + "learning_rate": 4.275611620795107e-07, + "logits/chosen": -3.0080935955047607, + "logits/rejected": -3.015535593032837, + "logps/chosen": -377.9685974121094, + "logps/rejected": -297.92169189453125, + "loss": 0.5683, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14489376544952393, + "rewards/margins": 1.1130046844482422, + "rewards/rejected": -0.9681110382080078, + "step": 670 + }, + { + "epoch": 0.7, + "learning_rate": 4.2564984709480123e-07, + "logits/chosen": -3.030597448348999, + "logits/rejected": -3.059508800506592, + "logps/chosen": -368.32635498046875, + "logps/rejected": -274.53619384765625, + "loss": 0.6557, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3344075083732605, + "rewards/margins": 1.3792588710784912, + "rewards/rejected": -1.0448510646820068, + "step": 680 + }, + { + "epoch": 0.71, + "learning_rate": 4.2373853211009176e-07, + "logits/chosen": -3.0355846881866455, + "logits/rejected": -3.0617101192474365, + "logps/chosen": -310.24530029296875, + "logps/rejected": -280.7437438964844, + "loss": 0.5629, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.31564414501190186, + "rewards/margins": 1.532622218132019, + "rewards/rejected": -1.2169779539108276, + "step": 690 + }, + { + "epoch": 0.72, + "learning_rate": 4.2182721712538224e-07, + "logits/chosen": -3.029533863067627, + "logits/rejected": -3.05369234085083, + "logps/chosen": -370.49945068359375, + "logps/rejected": -285.1793212890625, + "loss": 0.6773, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.3230968117713928, + "rewards/margins": 1.0617311000823975, + "rewards/rejected": -0.7386342287063599, + "step": 700 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -3.041776180267334, + "eval_logits/rejected": -3.0563852787017822, + "eval_logps/chosen": -366.7193603515625, + "eval_logps/rejected": -301.1505432128906, + "eval_loss": 0.585310697555542, + "eval_rewards/accuracies": 0.7460317611694336, + "eval_rewards/chosen": 0.047242674976587296, + "eval_rewards/margins": 0.9657005667686462, + "eval_rewards/rejected": -0.9184578657150269, + "eval_runtime": 164.258, + "eval_samples_per_second": 12.176, + "eval_steps_per_second": 0.384, + "step": 700 + }, + { + "epoch": 0.73, + "learning_rate": 4.199159021406727e-07, + "logits/chosen": -2.963630199432373, + "logits/rejected": -3.031212329864502, + "logps/chosen": -297.58990478515625, + "logps/rejected": -283.17572021484375, + "loss": 0.6067, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.13306304812431335, + "rewards/margins": 1.0036863088607788, + "rewards/rejected": -0.8706234097480774, + "step": 710 + }, + { + "epoch": 0.74, + "learning_rate": 4.1800458715596325e-07, + "logits/chosen": -3.0382869243621826, + "logits/rejected": -3.0224924087524414, + "logps/chosen": -373.01947021484375, + "logps/rejected": -315.932861328125, + "loss": 0.6166, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.028559958562254906, + "rewards/margins": 0.9609702825546265, + "rewards/rejected": -0.9324103593826294, + "step": 720 + }, + { + "epoch": 0.75, + "learning_rate": 4.160932721712538e-07, + "logits/chosen": -3.0072388648986816, + "logits/rejected": -3.0005228519439697, + "logps/chosen": -340.4766540527344, + "logps/rejected": -306.3741149902344, + "loss": 0.6079, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.09662823379039764, + "rewards/margins": 1.1530828475952148, + "rewards/rejected": -1.0564546585083008, + "step": 730 + }, + { + "epoch": 0.76, + "learning_rate": 4.141819571865443e-07, + "logits/chosen": -2.9518847465515137, + "logits/rejected": -2.9550204277038574, + "logps/chosen": -325.9070739746094, + "logps/rejected": -244.12588500976562, + "loss": 0.564, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.1386883705854416, + "rewards/margins": 1.7188549041748047, + "rewards/rejected": -1.5801665782928467, + "step": 740 + }, + { + "epoch": 0.77, + "learning_rate": 4.1227064220183485e-07, + "logits/chosen": -2.9738943576812744, + "logits/rejected": -3.009288787841797, + "logps/chosen": -306.73614501953125, + "logps/rejected": -284.35089111328125, + "loss": 0.5213, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.16552898287773132, + "rewards/margins": 1.8094953298568726, + "rewards/rejected": -1.6439664363861084, + "step": 750 + }, + { + "epoch": 0.78, + "learning_rate": 4.103593272171253e-07, + "logits/chosen": -2.9576098918914795, + "logits/rejected": -2.9751369953155518, + "logps/chosen": -336.5853576660156, + "logps/rejected": -326.5455017089844, + "loss": 0.5703, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07825515419244766, + "rewards/margins": 1.1398742198944092, + "rewards/rejected": -1.0616191625595093, + "step": 760 + }, + { + "epoch": 0.79, + "learning_rate": 4.0844801223241586e-07, + "logits/chosen": -3.0122196674346924, + "logits/rejected": -2.9879307746887207, + "logps/chosen": -350.8817138671875, + "logps/rejected": -298.84307861328125, + "loss": 0.5197, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03767753392457962, + "rewards/margins": 1.09770929813385, + "rewards/rejected": -1.0600318908691406, + "step": 770 + }, + { + "epoch": 0.8, + "learning_rate": 4.065366972477064e-07, + "logits/chosen": -2.9043805599212646, + "logits/rejected": -2.9711837768554688, + "logps/chosen": -379.1385803222656, + "logps/rejected": -296.9505920410156, + "loss": 0.5669, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3532111942768097, + "rewards/margins": 1.7610466480255127, + "rewards/rejected": -1.407835602760315, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 4.046253822629969e-07, + "logits/chosen": -2.9510416984558105, + "logits/rejected": -2.961275100708008, + "logps/chosen": -330.33673095703125, + "logps/rejected": -288.71173095703125, + "loss": 0.5177, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.301142156124115, + "rewards/margins": 1.2936238050460815, + "rewards/rejected": -0.9924817085266113, + "step": 790 + }, + { + "epoch": 0.83, + "learning_rate": 4.0271406727828745e-07, + "logits/chosen": -2.991361141204834, + "logits/rejected": -2.974353790283203, + "logps/chosen": -322.8855895996094, + "logps/rejected": -271.7654113769531, + "loss": 0.5263, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06158037111163139, + "rewards/margins": 1.1184431314468384, + "rewards/rejected": -1.056862711906433, + "step": 800 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.963681936264038, + "eval_logits/rejected": -2.966184139251709, + "eval_logps/chosen": -364.9457702636719, + "eval_logps/rejected": -303.87957763671875, + "eval_loss": 0.5150811076164246, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": 0.22460374236106873, + "eval_rewards/margins": 1.4159626960754395, + "eval_rewards/rejected": -1.191359043121338, + "eval_runtime": 163.931, + "eval_samples_per_second": 12.2, + "eval_steps_per_second": 0.384, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 4.00802752293578e-07, + "logits/chosen": -2.974116802215576, + "logits/rejected": -2.9998645782470703, + "logps/chosen": -330.64910888671875, + "logps/rejected": -294.6690368652344, + "loss": 0.5031, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.18794824182987213, + "rewards/margins": 1.2928552627563477, + "rewards/rejected": -1.1049071550369263, + "step": 810 + }, + { + "epoch": 0.85, + "learning_rate": 3.9889143730886847e-07, + "logits/chosen": -2.9930388927459717, + "logits/rejected": -2.983773946762085, + "logps/chosen": -376.36212158203125, + "logps/rejected": -305.14111328125, + "loss": 0.5357, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27564454078674316, + "rewards/margins": 1.5530188083648682, + "rewards/rejected": -1.277374267578125, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 3.96980122324159e-07, + "logits/chosen": -3.0382747650146484, + "logits/rejected": -3.0700857639312744, + "logps/chosen": -313.2106018066406, + "logps/rejected": -256.5130310058594, + "loss": 0.556, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.03291673585772514, + "rewards/margins": 1.2974836826324463, + "rewards/rejected": -1.3304002285003662, + "step": 830 + }, + { + "epoch": 0.87, + "learning_rate": 3.9506880733944953e-07, + "logits/chosen": -3.0589098930358887, + "logits/rejected": -3.058842897415161, + "logps/chosen": -304.68658447265625, + "logps/rejected": -276.25177001953125, + "loss": 0.5578, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.22373457252979279, + "rewards/margins": 1.489611268043518, + "rewards/rejected": -1.7133458852767944, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 3.9315749235474006e-07, + "logits/chosen": -3.037079334259033, + "logits/rejected": -3.0386836528778076, + "logps/chosen": -347.38897705078125, + "logps/rejected": -334.3331298828125, + "loss": 0.5433, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.15273378789424896, + "rewards/margins": 1.5688612461090088, + "rewards/rejected": -1.7215951681137085, + "step": 850 + }, + { + "epoch": 0.89, + "learning_rate": 3.912461773700306e-07, + "logits/chosen": -2.9914333820343018, + "logits/rejected": -3.013286828994751, + "logps/chosen": -361.6410217285156, + "logps/rejected": -342.3985900878906, + "loss": 0.5464, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.03575097769498825, + "rewards/margins": 1.2531265020370483, + "rewards/rejected": -1.2888776063919067, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 3.8933486238532107e-07, + "logits/chosen": -2.9541945457458496, + "logits/rejected": -2.979830265045166, + "logps/chosen": -424.258544921875, + "logps/rejected": -299.7648620605469, + "loss": 0.581, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.013498688116669655, + "rewards/margins": 1.3451616764068604, + "rewards/rejected": -1.331662893295288, + "step": 870 + }, + { + "epoch": 0.91, + "learning_rate": 3.874235474006116e-07, + "logits/chosen": -2.9790916442871094, + "logits/rejected": -2.987037181854248, + "logps/chosen": -364.68048095703125, + "logps/rejected": -290.4891052246094, + "loss": 0.58, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.007329714484512806, + "rewards/margins": 1.4278209209442139, + "rewards/rejected": -1.4351506233215332, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 3.8551223241590214e-07, + "logits/chosen": -2.986210823059082, + "logits/rejected": -2.9739222526550293, + "logps/chosen": -300.7494812011719, + "logps/rejected": -278.2732849121094, + "loss": 0.5741, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04843021556735039, + "rewards/margins": 1.3019744157791138, + "rewards/rejected": -1.3504045009613037, + "step": 890 + }, + { + "epoch": 0.93, + "learning_rate": 3.8360091743119267e-07, + "logits/chosen": -2.9838929176330566, + "logits/rejected": -2.9902117252349854, + "logps/chosen": -306.20025634765625, + "logps/rejected": -289.5735168457031, + "loss": 0.5366, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.058762937784194946, + "rewards/margins": 1.2602014541625977, + "rewards/rejected": -1.2014386653900146, + "step": 900 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.9907381534576416, + "eval_logits/rejected": -2.982360363006592, + "eval_logps/chosen": -364.6807556152344, + "eval_logps/rejected": -302.8385314941406, + "eval_loss": 0.5133689641952515, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": 0.25110283493995667, + "eval_rewards/margins": 1.338356614112854, + "eval_rewards/rejected": -1.0872538089752197, + "eval_runtime": 164.3114, + "eval_samples_per_second": 12.172, + "eval_steps_per_second": 0.383, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 3.816896024464832e-07, + "logits/chosen": -3.0253748893737793, + "logits/rejected": -2.9562289714813232, + "logps/chosen": -281.73016357421875, + "logps/rejected": -248.2506866455078, + "loss": 0.5377, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.28038138151168823, + "rewards/margins": 1.607208251953125, + "rewards/rejected": -1.326826810836792, + "step": 910 + }, + { + "epoch": 0.95, + "learning_rate": 3.797782874617737e-07, + "logits/chosen": -2.9797048568725586, + "logits/rejected": -2.932326555252075, + "logps/chosen": -333.2131042480469, + "logps/rejected": -267.63128662109375, + "loss": 0.4959, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.26507893204689026, + "rewards/margins": 1.226858139038086, + "rewards/rejected": -0.9617794156074524, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 3.778669724770642e-07, + "logits/chosen": -2.9677836894989014, + "logits/rejected": -2.9711012840270996, + "logps/chosen": -301.1932067871094, + "logps/rejected": -239.915771484375, + "loss": 0.5646, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.06536471843719482, + "rewards/margins": 1.3896596431732178, + "rewards/rejected": -1.3242948055267334, + "step": 930 + }, + { + "epoch": 0.97, + "learning_rate": 3.7595565749235474e-07, + "logits/chosen": -3.003399133682251, + "logits/rejected": -2.9879281520843506, + "logps/chosen": -351.9979553222656, + "logps/rejected": -264.519775390625, + "loss": 0.585, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.3593365252017975, + "rewards/margins": 1.1577335596084595, + "rewards/rejected": -0.7983969449996948, + "step": 940 + }, + { + "epoch": 0.98, + "learning_rate": 3.740443425076452e-07, + "logits/chosen": -2.9658942222595215, + "logits/rejected": -2.982341766357422, + "logps/chosen": -336.6238708496094, + "logps/rejected": -299.588134765625, + "loss": 0.5176, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.2032889872789383, + "rewards/margins": 1.3534172773361206, + "rewards/rejected": -1.1501282453536987, + "step": 950 + }, + { + "epoch": 0.99, + "learning_rate": 3.7213302752293575e-07, + "logits/chosen": -2.951019763946533, + "logits/rejected": -2.985151767730713, + "logps/chosen": -338.8179931640625, + "logps/rejected": -283.80328369140625, + "loss": 0.538, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.049173761159181595, + "rewards/margins": 0.8662222623825073, + "rewards/rejected": -0.8170484304428101, + "step": 960 + }, + { + "epoch": 1.0, + "learning_rate": 3.702217125382263e-07, + "logits/chosen": -2.9252991676330566, + "logits/rejected": -2.937505006790161, + "logps/chosen": -354.7286682128906, + "logps/rejected": -312.35333251953125, + "loss": 0.4865, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.2835424542427063, + "rewards/margins": 1.7692314386367798, + "rewards/rejected": -1.4856891632080078, + "step": 970 + }, + { + "epoch": 1.01, + "learning_rate": 3.6831039755351677e-07, + "logits/chosen": -2.975984573364258, + "logits/rejected": -2.9734318256378174, + "logps/chosen": -319.2844543457031, + "logps/rejected": -303.3651428222656, + "loss": 0.1133, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7135050296783447, + "rewards/margins": 4.7504682540893555, + "rewards/rejected": -3.0369625091552734, + "step": 980 + }, + { + "epoch": 1.02, + "learning_rate": 3.663990825688073e-07, + "logits/chosen": -2.899402141571045, + "logits/rejected": -2.89802885055542, + "logps/chosen": -288.45123291015625, + "logps/rejected": -316.5885314941406, + "loss": 0.1405, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7678325176239014, + "rewards/margins": 5.483719348907471, + "rewards/rejected": -3.715886354446411, + "step": 990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6448776758409783e-07, + "logits/chosen": -2.8675971031188965, + "logits/rejected": -2.89615797996521, + "logps/chosen": -325.03863525390625, + "logps/rejected": -333.436767578125, + "loss": 0.1034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.79555344581604, + "rewards/margins": 4.759924411773682, + "rewards/rejected": -2.9643709659576416, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -2.92020583152771, + "eval_logits/rejected": -2.9095799922943115, + "eval_logps/chosen": -364.1185302734375, + "eval_logps/rejected": -306.2866516113281, + "eval_loss": 0.5107486248016357, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": 0.307327002286911, + "eval_rewards/margins": 1.739391803741455, + "eval_rewards/rejected": -1.4320647716522217, + "eval_runtime": 164.3142, + "eval_samples_per_second": 12.172, + "eval_steps_per_second": 0.383, + "step": 1000 + }, + { + "epoch": 1.04, + "learning_rate": 3.6257645259938836e-07, + "logits/chosen": -2.8848228454589844, + "logits/rejected": -2.9434664249420166, + "logps/chosen": -304.1281433105469, + "logps/rejected": -323.9388732910156, + "loss": 0.0912, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5237512588500977, + "rewards/margins": 5.300021171569824, + "rewards/rejected": -3.7762699127197266, + "step": 1010 + }, + { + "epoch": 1.05, + "learning_rate": 3.606651376146789e-07, + "logits/chosen": -2.818145275115967, + "logits/rejected": -2.773864269256592, + "logps/chosen": -315.73687744140625, + "logps/rejected": -252.3991241455078, + "loss": 0.1072, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2391575574874878, + "rewards/margins": 4.052863121032715, + "rewards/rejected": -2.8137052059173584, + "step": 1020 + }, + { + "epoch": 1.06, + "learning_rate": 3.5875382262996937e-07, + "logits/chosen": -2.8956587314605713, + "logits/rejected": -2.88509202003479, + "logps/chosen": -332.889404296875, + "logps/rejected": -375.0550231933594, + "loss": 0.0854, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7886276245117188, + "rewards/margins": 4.777144432067871, + "rewards/rejected": -2.9885172843933105, + "step": 1030 + }, + { + "epoch": 1.07, + "learning_rate": 3.568425076452599e-07, + "logits/chosen": -2.9382426738739014, + "logits/rejected": -2.9390716552734375, + "logps/chosen": -339.12451171875, + "logps/rejected": -315.15625, + "loss": 0.099, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 1.3939841985702515, + "rewards/margins": 4.764640808105469, + "rewards/rejected": -3.370656967163086, + "step": 1040 + }, + { + "epoch": 1.08, + "learning_rate": 3.5493119266055044e-07, + "logits/chosen": -2.8407671451568604, + "logits/rejected": -2.821763753890991, + "logps/chosen": -336.37298583984375, + "logps/rejected": -257.6861267089844, + "loss": 0.1132, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3384716510772705, + "rewards/margins": 4.965681076049805, + "rewards/rejected": -3.627209424972534, + "step": 1050 + }, + { + "epoch": 1.09, + "learning_rate": 3.5301987767584097e-07, + "logits/chosen": -2.8167824745178223, + "logits/rejected": -2.810854434967041, + "logps/chosen": -323.439208984375, + "logps/rejected": -342.47991943359375, + "loss": 0.2041, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7954254150390625, + "rewards/margins": 4.722014427185059, + "rewards/rejected": -3.9265894889831543, + "step": 1060 + }, + { + "epoch": 1.1, + "learning_rate": 3.511085626911315e-07, + "logits/chosen": -2.940957546234131, + "logits/rejected": -3.0021321773529053, + "logps/chosen": -366.2899475097656, + "logps/rejected": -343.2218933105469, + "loss": 0.3299, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.4886529445648193, + "rewards/margins": 5.359461307525635, + "rewards/rejected": -3.870807647705078, + "step": 1070 + }, + { + "epoch": 1.11, + "learning_rate": 3.49197247706422e-07, + "logits/chosen": -2.933786392211914, + "logits/rejected": -2.906247615814209, + "logps/chosen": -261.5579833984375, + "logps/rejected": -276.83026123046875, + "loss": 0.149, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.48736995458602905, + "rewards/margins": 4.608451843261719, + "rewards/rejected": -4.121081829071045, + "step": 1080 + }, + { + "epoch": 1.12, + "learning_rate": 3.472859327217125e-07, + "logits/chosen": -2.8349316120147705, + "logits/rejected": -2.9043667316436768, + "logps/chosen": -364.6941833496094, + "logps/rejected": -376.5315856933594, + "loss": 0.1092, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2117881774902344, + "rewards/margins": 5.32895565032959, + "rewards/rejected": -4.117166996002197, + "step": 1090 + }, + { + "epoch": 1.14, + "learning_rate": 3.4537461773700304e-07, + "logits/chosen": -2.985729694366455, + "logits/rejected": -2.8761606216430664, + "logps/chosen": -258.05841064453125, + "logps/rejected": -241.39053344726562, + "loss": 0.1114, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6233962774276733, + "rewards/margins": 4.293813228607178, + "rewards/rejected": -3.6704165935516357, + "step": 1100 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -2.9666378498077393, + "eval_logits/rejected": -2.95609450340271, + "eval_logps/chosen": -365.8598327636719, + "eval_logps/rejected": -310.414794921875, + "eval_loss": 0.534447431564331, + "eval_rewards/accuracies": 0.7460317611694336, + "eval_rewards/chosen": 0.1331927627325058, + "eval_rewards/margins": 1.9780747890472412, + "eval_rewards/rejected": -1.8448821306228638, + "eval_runtime": 164.1399, + "eval_samples_per_second": 12.185, + "eval_steps_per_second": 0.384, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 3.434633027522936e-07, + "logits/chosen": -2.9507124423980713, + "logits/rejected": -2.9483211040496826, + "logps/chosen": -338.0868835449219, + "logps/rejected": -325.01483154296875, + "loss": 0.1007, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6764055490493774, + "rewards/margins": 5.684920310974121, + "rewards/rejected": -4.008514404296875, + "step": 1110 + }, + { + "epoch": 1.16, + "learning_rate": 3.415519877675841e-07, + "logits/chosen": -2.976590156555176, + "logits/rejected": -3.025784730911255, + "logps/chosen": -277.34710693359375, + "logps/rejected": -323.576171875, + "loss": 0.1131, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.841607928276062, + "rewards/margins": 4.658609867095947, + "rewards/rejected": -3.8170018196105957, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 3.3964067278287464e-07, + "logits/chosen": -3.0445570945739746, + "logits/rejected": -3.0413312911987305, + "logps/chosen": -337.9605407714844, + "logps/rejected": -288.26666259765625, + "loss": 0.1463, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5894230604171753, + "rewards/margins": 5.3310723304748535, + "rewards/rejected": -3.7416489124298096, + "step": 1130 + }, + { + "epoch": 1.18, + "learning_rate": 3.377293577981651e-07, + "logits/chosen": -2.9471421241760254, + "logits/rejected": -2.9865708351135254, + "logps/chosen": -288.2189025878906, + "logps/rejected": -309.2388610839844, + "loss": 0.0934, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.4304211139678955, + "rewards/margins": 5.832246780395508, + "rewards/rejected": -4.401825428009033, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 3.3581804281345565e-07, + "logits/chosen": -2.9803059101104736, + "logits/rejected": -2.9711978435516357, + "logps/chosen": -337.70697021484375, + "logps/rejected": -298.4077453613281, + "loss": 0.0967, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4117494821548462, + "rewards/margins": 5.474527359008789, + "rewards/rejected": -4.062777519226074, + "step": 1150 + }, + { + "epoch": 1.2, + "learning_rate": 3.339067278287462e-07, + "logits/chosen": -2.8603241443634033, + "logits/rejected": -2.8709046840667725, + "logps/chosen": -312.73504638671875, + "logps/rejected": -306.9026794433594, + "loss": 0.0785, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3839662075042725, + "rewards/margins": 5.900813102722168, + "rewards/rejected": -4.516847133636475, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 3.319954128440367e-07, + "logits/chosen": -2.975525379180908, + "logits/rejected": -2.9611260890960693, + "logps/chosen": -325.86163330078125, + "logps/rejected": -285.2755432128906, + "loss": 0.0992, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0602104663848877, + "rewards/margins": 4.849926948547363, + "rewards/rejected": -3.7897167205810547, + "step": 1170 + }, + { + "epoch": 1.22, + "learning_rate": 3.3008409785932725e-07, + "logits/chosen": -2.877586841583252, + "logits/rejected": -2.821748971939087, + "logps/chosen": -324.6281433105469, + "logps/rejected": -323.02301025390625, + "loss": 0.0892, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1957648992538452, + "rewards/margins": 5.952631950378418, + "rewards/rejected": -4.756867408752441, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 3.2817278287461773e-07, + "logits/chosen": -2.9451098442077637, + "logits/rejected": -2.9684863090515137, + "logps/chosen": -279.90216064453125, + "logps/rejected": -338.3842468261719, + "loss": 0.1045, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2665460109710693, + "rewards/margins": 5.084068298339844, + "rewards/rejected": -3.8175220489501953, + "step": 1190 + }, + { + "epoch": 1.24, + "learning_rate": 3.262614678899082e-07, + "logits/chosen": -2.889819383621216, + "logits/rejected": -2.9235751628875732, + "logps/chosen": -303.02838134765625, + "logps/rejected": -356.177734375, + "loss": 0.1338, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8252049684524536, + "rewards/margins": 5.507418155670166, + "rewards/rejected": -4.6822123527526855, + "step": 1200 + }, + { + "epoch": 1.24, + "eval_logits/chosen": -2.9508416652679443, + "eval_logits/rejected": -2.9459922313690186, + "eval_logps/chosen": -368.0057678222656, + "eval_logps/rejected": -313.3835144042969, + "eval_loss": 0.534950315952301, + "eval_rewards/accuracies": 0.773809552192688, + "eval_rewards/chosen": -0.0813969075679779, + "eval_rewards/margins": 2.0603599548339844, + "eval_rewards/rejected": -2.141756772994995, + "eval_runtime": 164.0736, + "eval_samples_per_second": 12.19, + "eval_steps_per_second": 0.384, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 3.2435015290519874e-07, + "logits/chosen": -2.916611671447754, + "logits/rejected": -2.927777051925659, + "logps/chosen": -283.2217712402344, + "logps/rejected": -297.02850341796875, + "loss": 0.0893, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0804212093353271, + "rewards/margins": 5.549715518951416, + "rewards/rejected": -4.469293594360352, + "step": 1210 + }, + { + "epoch": 1.26, + "learning_rate": 3.2243883792048927e-07, + "logits/chosen": -2.977875232696533, + "logits/rejected": -2.986704111099243, + "logps/chosen": -335.274658203125, + "logps/rejected": -380.4412536621094, + "loss": 0.1303, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.4366910457611084, + "rewards/margins": 5.666425704956055, + "rewards/rejected": -4.229735374450684, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 3.205275229357798e-07, + "logits/chosen": -2.868638753890991, + "logits/rejected": -2.8948395252227783, + "logps/chosen": -387.9947204589844, + "logps/rejected": -389.3511657714844, + "loss": 0.1117, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3278411626815796, + "rewards/margins": 5.698910236358643, + "rewards/rejected": -4.371068954467773, + "step": 1230 + }, + { + "epoch": 1.28, + "learning_rate": 3.186162079510703e-07, + "logits/chosen": -2.9128642082214355, + "logits/rejected": -2.91692852973938, + "logps/chosen": -351.5616149902344, + "logps/rejected": -373.9852600097656, + "loss": 0.1466, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3745101690292358, + "rewards/margins": 5.025930404663086, + "rewards/rejected": -3.6514201164245605, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 3.167048929663608e-07, + "logits/chosen": -2.955967426300049, + "logits/rejected": -2.923954486846924, + "logps/chosen": -278.7707824707031, + "logps/rejected": -281.9942321777344, + "loss": 0.1003, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 1.0149275064468384, + "rewards/margins": 5.51505184173584, + "rewards/rejected": -4.500124931335449, + "step": 1250 + }, + { + "epoch": 1.3, + "learning_rate": 3.1479357798165134e-07, + "logits/chosen": -2.9661002159118652, + "logits/rejected": -2.948564052581787, + "logps/chosen": -339.5476989746094, + "logps/rejected": -321.3616638183594, + "loss": 0.0984, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.038356065750122, + "rewards/margins": 5.16934061050415, + "rewards/rejected": -4.130984306335449, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 3.128822629969419e-07, + "logits/chosen": -2.862750291824341, + "logits/rejected": -2.8853306770324707, + "logps/chosen": -350.9757995605469, + "logps/rejected": -333.6067199707031, + "loss": 0.1195, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.147101640701294, + "rewards/margins": 5.550149917602539, + "rewards/rejected": -4.403048038482666, + "step": 1270 + }, + { + "epoch": 1.32, + "learning_rate": 3.109709480122324e-07, + "logits/chosen": -2.9103734493255615, + "logits/rejected": -2.9115426540374756, + "logps/chosen": -286.4703063964844, + "logps/rejected": -298.028076171875, + "loss": 0.1039, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1303044557571411, + "rewards/margins": 5.467093467712402, + "rewards/rejected": -4.336789131164551, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 3.0905963302752294e-07, + "logits/chosen": -2.7934536933898926, + "logits/rejected": -2.880432605743408, + "logps/chosen": -312.3811340332031, + "logps/rejected": -316.52215576171875, + "loss": 0.1168, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9917739629745483, + "rewards/margins": 5.529503345489502, + "rewards/rejected": -4.537729263305664, + "step": 1290 + }, + { + "epoch": 1.34, + "learning_rate": 3.071483180428134e-07, + "logits/chosen": -2.9836788177490234, + "logits/rejected": -2.9340128898620605, + "logps/chosen": -382.5011291503906, + "logps/rejected": -295.3705749511719, + "loss": 0.0979, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5801069736480713, + "rewards/margins": 5.381975173950195, + "rewards/rejected": -3.801867723464966, + "step": 1300 + }, + { + "epoch": 1.34, + "eval_logits/chosen": -2.9200918674468994, + "eval_logits/rejected": -2.9171833992004395, + "eval_logps/chosen": -368.1370849609375, + "eval_logps/rejected": -314.4656982421875, + "eval_loss": 0.5474238991737366, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -0.09453116357326508, + "eval_rewards/margins": 2.1554412841796875, + "eval_rewards/rejected": -2.249972343444824, + "eval_runtime": 164.7724, + "eval_samples_per_second": 12.138, + "eval_steps_per_second": 0.382, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 3.0523700305810395e-07, + "logits/chosen": -2.9245269298553467, + "logits/rejected": -2.9436841011047363, + "logps/chosen": -330.574951171875, + "logps/rejected": -342.8641662597656, + "loss": 0.1043, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3053131103515625, + "rewards/margins": 5.542339324951172, + "rewards/rejected": -4.237026214599609, + "step": 1310 + }, + { + "epoch": 1.36, + "learning_rate": 3.033256880733945e-07, + "logits/chosen": -2.9248242378234863, + "logits/rejected": -2.935176372528076, + "logps/chosen": -286.57171630859375, + "logps/rejected": -278.96746826171875, + "loss": 0.104, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7352786660194397, + "rewards/margins": 5.138430118560791, + "rewards/rejected": -4.403151035308838, + "step": 1320 + }, + { + "epoch": 1.37, + "learning_rate": 3.01414373088685e-07, + "logits/chosen": -2.8516454696655273, + "logits/rejected": -2.7985987663269043, + "logps/chosen": -344.3554382324219, + "logps/rejected": -341.85986328125, + "loss": 0.1138, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1177431344985962, + "rewards/margins": 5.806307792663574, + "rewards/rejected": -4.688565254211426, + "step": 1330 + }, + { + "epoch": 1.38, + "learning_rate": 2.9950305810397555e-07, + "logits/chosen": -2.906580686569214, + "logits/rejected": -2.97481369972229, + "logps/chosen": -335.2439880371094, + "logps/rejected": -320.96929931640625, + "loss": 0.1256, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8055500984191895, + "rewards/margins": 6.211544990539551, + "rewards/rejected": -5.405994892120361, + "step": 1340 + }, + { + "epoch": 1.39, + "learning_rate": 2.9759174311926603e-07, + "logits/chosen": -2.9623026847839355, + "logits/rejected": -2.9445879459381104, + "logps/chosen": -323.3135070800781, + "logps/rejected": -329.90496826171875, + "loss": 0.1101, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9566561579704285, + "rewards/margins": 7.040016174316406, + "rewards/rejected": -6.083359718322754, + "step": 1350 + }, + { + "epoch": 1.4, + "learning_rate": 2.9568042813455656e-07, + "logits/chosen": -2.8847999572753906, + "logits/rejected": -2.8803889751434326, + "logps/chosen": -342.5070495605469, + "logps/rejected": -269.67431640625, + "loss": 0.1087, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4027617573738098, + "rewards/margins": 4.224934101104736, + "rewards/rejected": -3.8221726417541504, + "step": 1360 + }, + { + "epoch": 1.41, + "learning_rate": 2.937691131498471e-07, + "logits/chosen": -2.949441909790039, + "logits/rejected": -2.9045028686523438, + "logps/chosen": -338.4786376953125, + "logps/rejected": -334.69189453125, + "loss": 0.1096, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9296592473983765, + "rewards/margins": 5.884530067443848, + "rewards/rejected": -4.954870700836182, + "step": 1370 + }, + { + "epoch": 1.42, + "learning_rate": 2.918577981651376e-07, + "logits/chosen": -2.932290554046631, + "logits/rejected": -2.9427378177642822, + "logps/chosen": -280.7291564941406, + "logps/rejected": -314.51953125, + "loss": 0.1177, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.7406275868415833, + "rewards/margins": 5.4294962882995605, + "rewards/rejected": -4.688868522644043, + "step": 1380 + }, + { + "epoch": 1.43, + "learning_rate": 2.8994648318042816e-07, + "logits/chosen": -2.9206976890563965, + "logits/rejected": -2.9712460041046143, + "logps/chosen": -358.7654113769531, + "logps/rejected": -313.7131652832031, + "loss": 0.112, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8611236810684204, + "rewards/margins": 5.0603203773498535, + "rewards/rejected": -4.199196815490723, + "step": 1390 + }, + { + "epoch": 1.44, + "learning_rate": 2.8803516819571863e-07, + "logits/chosen": -2.979775905609131, + "logits/rejected": -2.9886953830718994, + "logps/chosen": -379.1478576660156, + "logps/rejected": -358.28179931640625, + "loss": 0.1366, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.4315125942230225, + "rewards/margins": 6.2576189041137695, + "rewards/rejected": -4.826106071472168, + "step": 1400 + }, + { + "epoch": 1.44, + "eval_logits/chosen": -2.9143617153167725, + "eval_logits/rejected": -2.9134304523468018, + "eval_logps/chosen": -371.9402770996094, + "eval_logps/rejected": -315.933837890625, + "eval_loss": 0.5439518094062805, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": -0.47485068440437317, + "eval_rewards/margins": 1.921934962272644, + "eval_rewards/rejected": -2.3967857360839844, + "eval_runtime": 165.1605, + "eval_samples_per_second": 12.109, + "eval_steps_per_second": 0.381, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 2.8612385321100917e-07, + "logits/chosen": -2.8306632041931152, + "logits/rejected": -2.9071240425109863, + "logps/chosen": -294.634033203125, + "logps/rejected": -327.87896728515625, + "loss": 0.1281, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.9478427171707153, + "rewards/margins": 5.661940097808838, + "rewards/rejected": -4.714097499847412, + "step": 1410 + }, + { + "epoch": 1.47, + "learning_rate": 2.842125382262997e-07, + "logits/chosen": -2.9503073692321777, + "logits/rejected": -2.9379420280456543, + "logps/chosen": -308.3216247558594, + "logps/rejected": -308.57574462890625, + "loss": 0.1361, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2659790515899658, + "rewards/margins": 5.583965301513672, + "rewards/rejected": -4.317985534667969, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 2.8230122324159023e-07, + "logits/chosen": -2.8611526489257812, + "logits/rejected": -2.9008944034576416, + "logps/chosen": -375.9707946777344, + "logps/rejected": -374.29913330078125, + "loss": 0.1194, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3691354990005493, + "rewards/margins": 6.1049299240112305, + "rewards/rejected": -4.735795021057129, + "step": 1430 + }, + { + "epoch": 1.49, + "learning_rate": 2.8038990825688076e-07, + "logits/chosen": -2.994868516921997, + "logits/rejected": -2.9603443145751953, + "logps/chosen": -263.36474609375, + "logps/rejected": -250.1201934814453, + "loss": 0.1098, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.8430646657943726, + "rewards/margins": 4.541081428527832, + "rewards/rejected": -3.698017120361328, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 2.784785932721712e-07, + "logits/chosen": -2.9225330352783203, + "logits/rejected": -2.925787925720215, + "logps/chosen": -319.07574462890625, + "logps/rejected": -327.4895324707031, + "loss": 0.1336, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1656725406646729, + "rewards/margins": 5.621832847595215, + "rewards/rejected": -4.456160068511963, + "step": 1450 + }, + { + "epoch": 1.51, + "learning_rate": 2.765672782874617e-07, + "logits/chosen": -2.896669864654541, + "logits/rejected": -2.895914316177368, + "logps/chosen": -331.0616760253906, + "logps/rejected": -286.6056213378906, + "loss": 0.1367, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3613170385360718, + "rewards/margins": 5.62969970703125, + "rewards/rejected": -4.268383026123047, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 2.7465596330275225e-07, + "logits/chosen": -2.955178737640381, + "logits/rejected": -2.9608724117279053, + "logps/chosen": -350.20703125, + "logps/rejected": -255.1401824951172, + "loss": 0.104, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.233802080154419, + "rewards/margins": 5.684638023376465, + "rewards/rejected": -4.450836658477783, + "step": 1470 + }, + { + "epoch": 1.53, + "learning_rate": 2.727446483180428e-07, + "logits/chosen": -2.9283223152160645, + "logits/rejected": -2.952641010284424, + "logps/chosen": -313.20306396484375, + "logps/rejected": -316.35333251953125, + "loss": 0.1125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9838566780090332, + "rewards/margins": 5.377806186676025, + "rewards/rejected": -4.393948554992676, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 2.708333333333333e-07, + "logits/chosen": -2.850559949874878, + "logits/rejected": -2.833322048187256, + "logps/chosen": -341.3831481933594, + "logps/rejected": -314.9398498535156, + "loss": 0.0943, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7250410914421082, + "rewards/margins": 5.533560276031494, + "rewards/rejected": -4.808518409729004, + "step": 1490 + }, + { + "epoch": 1.55, + "learning_rate": 2.6892201834862385e-07, + "logits/chosen": -2.937903642654419, + "logits/rejected": -2.9050183296203613, + "logps/chosen": -326.52691650390625, + "logps/rejected": -302.30694580078125, + "loss": 0.1042, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6643081903457642, + "rewards/margins": 5.094948768615723, + "rewards/rejected": -4.43064022064209, + "step": 1500 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.930583953857422, + "eval_logits/rejected": -2.936053991317749, + "eval_logps/chosen": -372.2054138183594, + "eval_logps/rejected": -318.7686462402344, + "eval_loss": 0.552377462387085, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -0.5013648867607117, + "eval_rewards/margins": 2.178898334503174, + "eval_rewards/rejected": -2.6802632808685303, + "eval_runtime": 167.7329, + "eval_samples_per_second": 11.924, + "eval_steps_per_second": 0.376, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 2.6701070336391433e-07, + "logits/chosen": -2.90950345993042, + "logits/rejected": -2.87695050239563, + "logps/chosen": -378.1885070800781, + "logps/rejected": -322.77337646484375, + "loss": 0.0924, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.4327830076217651, + "rewards/margins": 5.619394779205322, + "rewards/rejected": -4.186612129211426, + "step": 1510 + }, + { + "epoch": 1.57, + "learning_rate": 2.6509938837920486e-07, + "logits/chosen": -2.849907398223877, + "logits/rejected": -2.8833765983581543, + "logps/chosen": -289.51605224609375, + "logps/rejected": -320.0068054199219, + "loss": 0.0975, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8192381858825684, + "rewards/margins": 4.99267053604126, + "rewards/rejected": -4.173432350158691, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 2.631880733944954e-07, + "logits/chosen": -2.964118719100952, + "logits/rejected": -2.984459400177002, + "logps/chosen": -303.44866943359375, + "logps/rejected": -299.94482421875, + "loss": 0.105, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.37635737657546997, + "rewards/margins": 4.797235488891602, + "rewards/rejected": -4.420877933502197, + "step": 1530 + }, + { + "epoch": 1.59, + "learning_rate": 2.612767584097859e-07, + "logits/chosen": -2.9242002964019775, + "logits/rejected": -2.9575366973876953, + "logps/chosen": -308.75616455078125, + "logps/rejected": -282.21380615234375, + "loss": 0.1016, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.4492263793945312, + "rewards/margins": 5.21218204498291, + "rewards/rejected": -3.7629554271698, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 2.5936544342507646e-07, + "logits/chosen": -2.902669668197632, + "logits/rejected": -2.932953357696533, + "logps/chosen": -306.1797790527344, + "logps/rejected": -315.36700439453125, + "loss": 0.1412, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0359132289886475, + "rewards/margins": 5.480694770812988, + "rewards/rejected": -4.444781303405762, + "step": 1550 + }, + { + "epoch": 1.61, + "learning_rate": 2.5745412844036693e-07, + "logits/chosen": -2.862687110900879, + "logits/rejected": -2.9322876930236816, + "logps/chosen": -361.41583251953125, + "logps/rejected": -309.0520935058594, + "loss": 0.1228, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3139979839324951, + "rewards/margins": 5.389029026031494, + "rewards/rejected": -4.07503080368042, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 2.5554281345565747e-07, + "logits/chosen": -2.937886953353882, + "logits/rejected": -2.9431166648864746, + "logps/chosen": -312.0148620605469, + "logps/rejected": -329.90863037109375, + "loss": 0.1005, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.7804427146911621, + "rewards/margins": 5.692571640014648, + "rewards/rejected": -4.912128925323486, + "step": 1570 + }, + { + "epoch": 1.63, + "learning_rate": 2.53631498470948e-07, + "logits/chosen": -2.972658634185791, + "logits/rejected": -2.9699690341949463, + "logps/chosen": -347.6422424316406, + "logps/rejected": -312.10858154296875, + "loss": 0.0857, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3729878664016724, + "rewards/margins": 5.7400736808776855, + "rewards/rejected": -4.367085933685303, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 2.5172018348623853e-07, + "logits/chosen": -2.8885016441345215, + "logits/rejected": -2.9170756340026855, + "logps/chosen": -355.39813232421875, + "logps/rejected": -336.2825927734375, + "loss": 0.0852, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1573688983917236, + "rewards/margins": 5.676226615905762, + "rewards/rejected": -4.518857479095459, + "step": 1590 + }, + { + "epoch": 1.65, + "learning_rate": 2.4980886850152906e-07, + "logits/chosen": -2.895519256591797, + "logits/rejected": -2.851107120513916, + "logps/chosen": -341.70904541015625, + "logps/rejected": -305.86480712890625, + "loss": 0.1313, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0455483198165894, + "rewards/margins": 5.084899425506592, + "rewards/rejected": -4.039351463317871, + "step": 1600 + }, + { + "epoch": 1.65, + "eval_logits/chosen": -2.8998661041259766, + "eval_logits/rejected": -2.9059910774230957, + "eval_logps/chosen": -369.42547607421875, + "eval_logps/rejected": -313.8333435058594, + "eval_loss": 0.5333446264266968, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -0.22337232530117035, + "eval_rewards/margins": 1.9633642435073853, + "eval_rewards/rejected": -2.1867363452911377, + "eval_runtime": 167.9925, + "eval_samples_per_second": 11.905, + "eval_steps_per_second": 0.375, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 2.478975535168196e-07, + "logits/chosen": -2.892448663711548, + "logits/rejected": -2.8929343223571777, + "logps/chosen": -341.67431640625, + "logps/rejected": -287.61383056640625, + "loss": 0.1044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8163628578186035, + "rewards/margins": 4.475451469421387, + "rewards/rejected": -3.659088611602783, + "step": 1610 + }, + { + "epoch": 1.67, + "learning_rate": 2.459862385321101e-07, + "logits/chosen": -2.9265084266662598, + "logits/rejected": -2.9548892974853516, + "logps/chosen": -382.5340576171875, + "logps/rejected": -347.4888916015625, + "loss": 0.1018, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.6915686130523682, + "rewards/margins": 6.067580223083496, + "rewards/rejected": -4.376010894775391, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 2.440749235474006e-07, + "logits/chosen": -2.8851680755615234, + "logits/rejected": -2.903552532196045, + "logps/chosen": -342.8496398925781, + "logps/rejected": -317.72845458984375, + "loss": 0.1005, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0220366716384888, + "rewards/margins": 5.634666442871094, + "rewards/rejected": -4.6126298904418945, + "step": 1630 + }, + { + "epoch": 1.69, + "learning_rate": 2.421636085626911e-07, + "logits/chosen": -2.8142755031585693, + "logits/rejected": -2.8399770259857178, + "logps/chosen": -344.5146484375, + "logps/rejected": -301.95928955078125, + "loss": 0.0981, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.6624820232391357, + "rewards/margins": 6.2719035148620605, + "rewards/rejected": -4.6094207763671875, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 2.402522935779816e-07, + "logits/chosen": -2.8734793663024902, + "logits/rejected": -2.876209259033203, + "logps/chosen": -339.1289367675781, + "logps/rejected": -351.3002014160156, + "loss": 0.1207, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1800963878631592, + "rewards/margins": 5.746790409088135, + "rewards/rejected": -4.566694736480713, + "step": 1650 + }, + { + "epoch": 1.71, + "learning_rate": 2.3834097859327215e-07, + "logits/chosen": -2.922632932662964, + "logits/rejected": -2.973679780960083, + "logps/chosen": -285.7434997558594, + "logps/rejected": -304.81536865234375, + "loss": 0.1184, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9965537786483765, + "rewards/margins": 5.077877998352051, + "rewards/rejected": -4.081325054168701, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 2.3642966360856268e-07, + "logits/chosen": -2.9541144371032715, + "logits/rejected": -2.929344654083252, + "logps/chosen": -326.902587890625, + "logps/rejected": -306.6372985839844, + "loss": 0.0972, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.7346267700195312, + "rewards/margins": 5.666862964630127, + "rewards/rejected": -3.9322357177734375, + "step": 1670 + }, + { + "epoch": 1.73, + "learning_rate": 2.345183486238532e-07, + "logits/chosen": -2.857109785079956, + "logits/rejected": -2.8801960945129395, + "logps/chosen": -308.47369384765625, + "logps/rejected": -375.78692626953125, + "loss": 0.0899, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.088503122329712, + "rewards/margins": 5.432967185974121, + "rewards/rejected": -4.344464302062988, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 2.3260703363914372e-07, + "logits/chosen": -2.9087703227996826, + "logits/rejected": -2.9551265239715576, + "logps/chosen": -381.01959228515625, + "logps/rejected": -338.1856994628906, + "loss": 0.105, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9901115894317627, + "rewards/margins": 5.308182716369629, + "rewards/rejected": -3.318070888519287, + "step": 1690 + }, + { + "epoch": 1.75, + "learning_rate": 2.3069571865443425e-07, + "logits/chosen": -2.9682905673980713, + "logits/rejected": -2.9819796085357666, + "logps/chosen": -277.3031921386719, + "logps/rejected": -325.71649169921875, + "loss": 0.1629, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5376863479614258, + "rewards/margins": 5.757152557373047, + "rewards/rejected": -5.219466209411621, + "step": 1700 + }, + { + "epoch": 1.75, + "eval_logits/chosen": -2.909576654434204, + "eval_logits/rejected": -2.9181904792785645, + "eval_logps/chosen": -371.09588623046875, + "eval_logps/rejected": -319.5571594238281, + "eval_loss": 0.5655122399330139, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -0.3904118835926056, + "eval_rewards/margins": 2.3687071800231934, + "eval_rewards/rejected": -2.7591187953948975, + "eval_runtime": 164.0305, + "eval_samples_per_second": 12.193, + "eval_steps_per_second": 0.384, + "step": 1700 + }, + { + "epoch": 1.76, + "learning_rate": 2.2878440366972476e-07, + "logits/chosen": -2.9808902740478516, + "logits/rejected": -2.9869067668914795, + "logps/chosen": -326.5906677246094, + "logps/rejected": -384.11944580078125, + "loss": 0.0867, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5069032311439514, + "rewards/margins": 5.799986362457275, + "rewards/rejected": -5.2930827140808105, + "step": 1710 + }, + { + "epoch": 1.78, + "learning_rate": 2.268730886850153e-07, + "logits/chosen": -2.8385868072509766, + "logits/rejected": -2.9057114124298096, + "logps/chosen": -325.4120178222656, + "logps/rejected": -309.59136962890625, + "loss": 0.0989, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1605224609375, + "rewards/margins": 5.591654300689697, + "rewards/rejected": -4.431131362915039, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 2.249617737003058e-07, + "logits/chosen": -2.8839237689971924, + "logits/rejected": -2.8796088695526123, + "logps/chosen": -352.14886474609375, + "logps/rejected": -371.3978576660156, + "loss": 0.1089, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0480362176895142, + "rewards/margins": 5.741724967956543, + "rewards/rejected": -4.693687915802002, + "step": 1730 + }, + { + "epoch": 1.8, + "learning_rate": 2.2305045871559633e-07, + "logits/chosen": -2.883430242538452, + "logits/rejected": -2.8605690002441406, + "logps/chosen": -345.23272705078125, + "logps/rejected": -331.52325439453125, + "loss": 0.1311, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.180057168006897, + "rewards/margins": 5.705449104309082, + "rewards/rejected": -4.525391578674316, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 2.2113914373088686e-07, + "logits/chosen": -2.962857484817505, + "logits/rejected": -2.9696333408355713, + "logps/chosen": -377.6351623535156, + "logps/rejected": -362.8825378417969, + "loss": 0.1067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8095831871032715, + "rewards/margins": 5.408170700073242, + "rewards/rejected": -4.598587512969971, + "step": 1750 + }, + { + "epoch": 1.82, + "learning_rate": 2.1922782874617736e-07, + "logits/chosen": -2.9029316902160645, + "logits/rejected": -2.9539952278137207, + "logps/chosen": -289.267822265625, + "logps/rejected": -366.2077941894531, + "loss": 0.0929, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.47237950563430786, + "rewards/margins": 6.199611186981201, + "rewards/rejected": -5.727231502532959, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 2.1731651376146787e-07, + "logits/chosen": -2.8582608699798584, + "logits/rejected": -2.8988289833068848, + "logps/chosen": -277.98406982421875, + "logps/rejected": -328.0066833496094, + "loss": 0.1048, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7530109286308289, + "rewards/margins": 5.7497334480285645, + "rewards/rejected": -4.99672269821167, + "step": 1770 + }, + { + "epoch": 1.84, + "learning_rate": 2.154051987767584e-07, + "logits/chosen": -2.9023542404174805, + "logits/rejected": -2.9242827892303467, + "logps/chosen": -314.0538330078125, + "logps/rejected": -299.74420166015625, + "loss": 0.12, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.7526110410690308, + "rewards/margins": 5.213059425354004, + "rewards/rejected": -4.460447311401367, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 2.134938837920489e-07, + "logits/chosen": -2.8436591625213623, + "logits/rejected": -2.8463809490203857, + "logps/chosen": -255.3565673828125, + "logps/rejected": -273.94464111328125, + "loss": 0.0818, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.06976697593927383, + "rewards/margins": 4.849638938903809, + "rewards/rejected": -4.919405937194824, + "step": 1790 + }, + { + "epoch": 1.86, + "learning_rate": 2.1158256880733944e-07, + "logits/chosen": -2.9030632972717285, + "logits/rejected": -2.8941729068756104, + "logps/chosen": -367.6248779296875, + "logps/rejected": -302.12249755859375, + "loss": 0.0993, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.4400938153266907, + "rewards/margins": 5.627293586730957, + "rewards/rejected": -5.187199592590332, + "step": 1800 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -2.8477160930633545, + "eval_logits/rejected": -2.8601999282836914, + "eval_logps/chosen": -374.3083801269531, + "eval_logps/rejected": -321.6667785644531, + "eval_loss": 0.5605445504188538, + "eval_rewards/accuracies": 0.7460317611694336, + "eval_rewards/chosen": -0.711660623550415, + "eval_rewards/margins": 2.2584221363067627, + "eval_rewards/rejected": -2.9700827598571777, + "eval_runtime": 164.7388, + "eval_samples_per_second": 12.14, + "eval_steps_per_second": 0.382, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 2.0967125382262994e-07, + "logits/chosen": -2.8688273429870605, + "logits/rejected": -2.868739366531372, + "logps/chosen": -337.7546081542969, + "logps/rejected": -312.27569580078125, + "loss": 0.1163, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7751233577728271, + "rewards/margins": 5.873146057128906, + "rewards/rejected": -5.098022937774658, + "step": 1810 + }, + { + "epoch": 1.88, + "learning_rate": 2.0775993883792048e-07, + "logits/chosen": -2.8374381065368652, + "logits/rejected": -2.8085215091705322, + "logps/chosen": -352.53192138671875, + "logps/rejected": -316.3230895996094, + "loss": 0.0933, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.9547752141952515, + "rewards/margins": 5.203994274139404, + "rewards/rejected": -4.249218940734863, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 2.05848623853211e-07, + "logits/chosen": -2.874891757965088, + "logits/rejected": -2.839573621749878, + "logps/chosen": -366.4833679199219, + "logps/rejected": -319.9959411621094, + "loss": 0.0966, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.212294101715088, + "rewards/margins": 5.914790630340576, + "rewards/rejected": -4.702496528625488, + "step": 1830 + }, + { + "epoch": 1.9, + "learning_rate": 2.0393730886850151e-07, + "logits/chosen": -2.8277204036712646, + "logits/rejected": -2.878105640411377, + "logps/chosen": -378.3955383300781, + "logps/rejected": -314.2088623046875, + "loss": 0.0863, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7747803926467896, + "rewards/margins": 5.967954158782959, + "rewards/rejected": -5.193174362182617, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 2.0202599388379205e-07, + "logits/chosen": -2.8658251762390137, + "logits/rejected": -2.8985071182250977, + "logps/chosen": -339.0852355957031, + "logps/rejected": -384.46112060546875, + "loss": 0.0786, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9408707618713379, + "rewards/margins": 5.965841770172119, + "rewards/rejected": -5.024971008300781, + "step": 1850 + }, + { + "epoch": 1.92, + "learning_rate": 2.0011467889908258e-07, + "logits/chosen": -2.829246997833252, + "logits/rejected": -2.8732194900512695, + "logps/chosen": -381.65655517578125, + "logps/rejected": -284.0471496582031, + "loss": 0.0945, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5882245898246765, + "rewards/margins": 5.461816787719727, + "rewards/rejected": -4.873592376708984, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 1.9820336391437308e-07, + "logits/chosen": -2.8398656845092773, + "logits/rejected": -2.8620615005493164, + "logps/chosen": -309.2004089355469, + "logps/rejected": -296.1297302246094, + "loss": 0.093, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8088359832763672, + "rewards/margins": 5.4316887855529785, + "rewards/rejected": -4.6228532791137695, + "step": 1870 + }, + { + "epoch": 1.94, + "learning_rate": 1.9629204892966362e-07, + "logits/chosen": -2.941847324371338, + "logits/rejected": -2.950911283493042, + "logps/chosen": -329.76617431640625, + "logps/rejected": -295.0538635253906, + "loss": 0.1113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2022342681884766, + "rewards/margins": 5.600251197814941, + "rewards/rejected": -4.398016929626465, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 1.943807339449541e-07, + "logits/chosen": -2.8697052001953125, + "logits/rejected": -2.901094913482666, + "logps/chosen": -311.559326171875, + "logps/rejected": -333.4175720214844, + "loss": 0.0948, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.7903985977172852, + "rewards/margins": 5.597433090209961, + "rewards/rejected": -4.807034015655518, + "step": 1890 + }, + { + "epoch": 1.96, + "learning_rate": 1.9246941896024463e-07, + "logits/chosen": -2.939120054244995, + "logits/rejected": -2.9861233234405518, + "logps/chosen": -320.5481262207031, + "logps/rejected": -347.7875061035156, + "loss": 0.1116, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6647524833679199, + "rewards/margins": 6.036587238311768, + "rewards/rejected": -5.3718342781066895, + "step": 1900 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -2.914954900741577, + "eval_logits/rejected": -2.927724599838257, + "eval_logps/chosen": -373.57073974609375, + "eval_logps/rejected": -319.2250061035156, + "eval_loss": 0.5649252533912659, + "eval_rewards/accuracies": 0.7539682388305664, + "eval_rewards/chosen": -0.6378985047340393, + "eval_rewards/margins": 2.088006019592285, + "eval_rewards/rejected": -2.7259042263031006, + "eval_runtime": 164.2377, + "eval_samples_per_second": 12.177, + "eval_steps_per_second": 0.384, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 1.9055810397553516e-07, + "logits/chosen": -2.9238085746765137, + "logits/rejected": -2.9308090209960938, + "logps/chosen": -313.63665771484375, + "logps/rejected": -304.2153625488281, + "loss": 0.1214, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6549821496009827, + "rewards/margins": 4.487866401672363, + "rewards/rejected": -3.8328843116760254, + "step": 1910 + }, + { + "epoch": 1.98, + "learning_rate": 1.8864678899082566e-07, + "logits/chosen": -2.8484818935394287, + "logits/rejected": -2.866534948348999, + "logps/chosen": -347.75689697265625, + "logps/rejected": -279.4710693359375, + "loss": 0.1082, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.3496394753456116, + "rewards/margins": 4.312170505523682, + "rewards/rejected": -3.9625308513641357, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 1.867354740061162e-07, + "logits/chosen": -2.9492716789245605, + "logits/rejected": -2.956796169281006, + "logps/chosen": -307.85845947265625, + "logps/rejected": -332.1622619628906, + "loss": 0.1061, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4475575387477875, + "rewards/margins": 5.942025184631348, + "rewards/rejected": -5.494467735290527, + "step": 1930 + }, + { + "epoch": 2.0, + "learning_rate": 1.8482415902140673e-07, + "logits/chosen": -2.923053503036499, + "logits/rejected": -2.920959949493408, + "logps/chosen": -331.311767578125, + "logps/rejected": -320.19586181640625, + "loss": 0.0801, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0848143100738525, + "rewards/margins": 5.605216979980469, + "rewards/rejected": -4.520401954650879, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 1.8291284403669723e-07, + "logits/chosen": -2.881058692932129, + "logits/rejected": -2.93363618850708, + "logps/chosen": -309.11212158203125, + "logps/rejected": -322.50665283203125, + "loss": 0.0254, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7099040746688843, + "rewards/margins": 6.735787868499756, + "rewards/rejected": -5.025883674621582, + "step": 1950 + }, + { + "epoch": 2.02, + "learning_rate": 1.8100152905198777e-07, + "logits/chosen": -2.7668607234954834, + "logits/rejected": -2.7822773456573486, + "logps/chosen": -351.9031677246094, + "logps/rejected": -415.9180603027344, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.090543508529663, + "rewards/margins": 7.8234100341796875, + "rewards/rejected": -6.7328667640686035, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7909021406727827e-07, + "logits/chosen": -2.9600331783294678, + "logits/rejected": -2.8843834400177, + "logps/chosen": -309.39642333984375, + "logps/rejected": -306.4966735839844, + "loss": 0.0282, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9620206952095032, + "rewards/margins": 6.899697303771973, + "rewards/rejected": -5.937676429748535, + "step": 1970 + }, + { + "epoch": 2.04, + "learning_rate": 1.771788990825688e-07, + "logits/chosen": -2.923687696456909, + "logits/rejected": -2.9661598205566406, + "logps/chosen": -330.7653503417969, + "logps/rejected": -352.5653076171875, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7405881881713867, + "rewards/margins": 8.00406265258789, + "rewards/rejected": -6.263474941253662, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 1.7526758409785934e-07, + "logits/chosen": -2.9299581050872803, + "logits/rejected": -2.8949360847473145, + "logps/chosen": -362.6274719238281, + "logps/rejected": -363.09149169921875, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0967247486114502, + "rewards/margins": 7.466977119445801, + "rewards/rejected": -6.37025260925293, + "step": 1990 + }, + { + "epoch": 2.06, + "learning_rate": 1.7335626911314984e-07, + "logits/chosen": -2.845986843109131, + "logits/rejected": -2.8671188354492188, + "logps/chosen": -274.60870361328125, + "logps/rejected": -295.59478759765625, + "loss": 0.0193, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.014614641666412354, + "rewards/margins": 6.743406772613525, + "rewards/rejected": -6.758021354675293, + "step": 2000 + }, + { + "epoch": 2.06, + "eval_logits/chosen": -2.882474422454834, + "eval_logits/rejected": -2.8919453620910645, + "eval_logps/chosen": -376.60406494140625, + "eval_logps/rejected": -329.82745361328125, + "eval_loss": 0.6121558547019958, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": -0.941230058670044, + "eval_rewards/margins": 2.8449153900146484, + "eval_rewards/rejected": -3.7861454486846924, + "eval_runtime": 164.9655, + "eval_samples_per_second": 12.124, + "eval_steps_per_second": 0.382, + "step": 2000 + }, + { + "epoch": 2.07, + "learning_rate": 1.7144495412844037e-07, + "logits/chosen": -2.9446756839752197, + "logits/rejected": -2.953831911087036, + "logps/chosen": -353.67376708984375, + "logps/rejected": -347.7017822265625, + "loss": 0.0214, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4430414140224457, + "rewards/margins": 7.570870399475098, + "rewards/rejected": -7.127829074859619, + "step": 2010 + }, + { + "epoch": 2.08, + "learning_rate": 1.6953363914373088e-07, + "logits/chosen": -2.940734386444092, + "logits/rejected": -2.9746463298797607, + "logps/chosen": -348.05328369140625, + "logps/rejected": -333.2148742675781, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2261013984680176, + "rewards/margins": 8.234363555908203, + "rewards/rejected": -7.008261680603027, + "step": 2020 + }, + { + "epoch": 2.09, + "learning_rate": 1.6762232415902138e-07, + "logits/chosen": -2.875319719314575, + "logits/rejected": -2.855180263519287, + "logps/chosen": -306.70050048828125, + "logps/rejected": -349.5177917480469, + "loss": 0.0203, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3287349343299866, + "rewards/margins": 7.007230281829834, + "rewards/rejected": -6.678494930267334, + "step": 2030 + }, + { + "epoch": 2.11, + "learning_rate": 1.6571100917431192e-07, + "logits/chosen": -2.9315755367279053, + "logits/rejected": -2.930187702178955, + "logps/chosen": -306.041259765625, + "logps/rejected": -305.6824951171875, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5696347951889038, + "rewards/margins": 7.535808563232422, + "rewards/rejected": -6.9661736488342285, + "step": 2040 + }, + { + "epoch": 2.12, + "learning_rate": 1.6379969418960242e-07, + "logits/chosen": -2.8848538398742676, + "logits/rejected": -2.905867338180542, + "logps/chosen": -389.7286682128906, + "logps/rejected": -386.9409484863281, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3652772903442383, + "rewards/margins": 8.069811820983887, + "rewards/rejected": -6.704535484313965, + "step": 2050 + }, + { + "epoch": 2.13, + "learning_rate": 1.6188837920489295e-07, + "logits/chosen": -2.8828773498535156, + "logits/rejected": -2.8783280849456787, + "logps/chosen": -359.57666015625, + "logps/rejected": -339.35345458984375, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8870366811752319, + "rewards/margins": 8.021839141845703, + "rewards/rejected": -7.13480281829834, + "step": 2060 + }, + { + "epoch": 2.14, + "learning_rate": 1.5997706422018349e-07, + "logits/chosen": -2.88275408744812, + "logits/rejected": -2.929903984069824, + "logps/chosen": -322.4759216308594, + "logps/rejected": -423.30682373046875, + "loss": 0.0171, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5736116170883179, + "rewards/margins": 8.667892456054688, + "rewards/rejected": -7.0942816734313965, + "step": 2070 + }, + { + "epoch": 2.15, + "learning_rate": 1.58065749235474e-07, + "logits/chosen": -2.90217661857605, + "logits/rejected": -2.8910233974456787, + "logps/chosen": -413.54522705078125, + "logps/rejected": -385.43341064453125, + "loss": 0.0127, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9447764158248901, + "rewards/margins": 7.688788414001465, + "rewards/rejected": -6.744012355804443, + "step": 2080 + }, + { + "epoch": 2.16, + "learning_rate": 1.5615443425076452e-07, + "logits/chosen": -2.7480947971343994, + "logits/rejected": -2.747185230255127, + "logps/chosen": -354.87493896484375, + "logps/rejected": -351.3457946777344, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.513919472694397, + "rewards/margins": 8.175249099731445, + "rewards/rejected": -7.661329746246338, + "step": 2090 + }, + { + "epoch": 2.17, + "learning_rate": 1.5424311926605506e-07, + "logits/chosen": -2.8367042541503906, + "logits/rejected": -2.8408215045928955, + "logps/chosen": -298.4134826660156, + "logps/rejected": -300.4717712402344, + "loss": 0.0175, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2804573178291321, + "rewards/margins": 7.449028968811035, + "rewards/rejected": -7.729485511779785, + "step": 2100 + }, + { + "epoch": 2.17, + "eval_logits/chosen": -2.839301586151123, + "eval_logits/rejected": -2.8474462032318115, + "eval_logps/chosen": -383.2186279296875, + "eval_logps/rejected": -338.7977294921875, + "eval_loss": 0.6523212790489197, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -1.6026798486709595, + "eval_rewards/margins": 3.080495834350586, + "eval_rewards/rejected": -4.683175563812256, + "eval_runtime": 165.5125, + "eval_samples_per_second": 12.084, + "eval_steps_per_second": 0.381, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 1.5233180428134556e-07, + "logits/chosen": -2.9054439067840576, + "logits/rejected": -2.913278102874756, + "logps/chosen": -323.6388244628906, + "logps/rejected": -323.73419189453125, + "loss": 0.0138, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8671594858169556, + "rewards/margins": 7.387481689453125, + "rewards/rejected": -6.520320892333984, + "step": 2110 + }, + { + "epoch": 2.19, + "learning_rate": 1.504204892966361e-07, + "logits/chosen": -2.800830364227295, + "logits/rejected": -2.8197312355041504, + "logps/chosen": -359.3259582519531, + "logps/rejected": -394.8112487792969, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4949137568473816, + "rewards/margins": 9.693056106567383, + "rewards/rejected": -9.198141098022461, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 1.485091743119266e-07, + "logits/chosen": -2.890476942062378, + "logits/rejected": -2.925356388092041, + "logps/chosen": -315.17742919921875, + "logps/rejected": -378.8518371582031, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27211472392082214, + "rewards/margins": 8.301239967346191, + "rewards/rejected": -8.02912425994873, + "step": 2130 + }, + { + "epoch": 2.21, + "learning_rate": 1.465978593272171e-07, + "logits/chosen": -2.820862054824829, + "logits/rejected": -2.8192131519317627, + "logps/chosen": -233.12344360351562, + "logps/rejected": -238.68014526367188, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23832440376281738, + "rewards/margins": 7.234049320220947, + "rewards/rejected": -7.472373008728027, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 1.4468654434250764e-07, + "logits/chosen": -2.7812180519104004, + "logits/rejected": -2.839566946029663, + "logps/chosen": -400.56396484375, + "logps/rejected": -418.9078063964844, + "loss": 0.0183, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.306671380996704, + "rewards/margins": 9.309846878051758, + "rewards/rejected": -8.003175735473633, + "step": 2150 + }, + { + "epoch": 2.23, + "learning_rate": 1.4277522935779814e-07, + "logits/chosen": -2.857119083404541, + "logits/rejected": -2.8069121837615967, + "logps/chosen": -346.87091064453125, + "logps/rejected": -364.4837341308594, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42278409004211426, + "rewards/margins": 7.95212459564209, + "rewards/rejected": -7.5293402671813965, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 1.4086391437308867e-07, + "logits/chosen": -2.828322172164917, + "logits/rejected": -2.872556209564209, + "logps/chosen": -371.8916015625, + "logps/rejected": -404.73162841796875, + "loss": 0.0183, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5918793082237244, + "rewards/margins": 8.069284439086914, + "rewards/rejected": -7.477405548095703, + "step": 2170 + }, + { + "epoch": 2.25, + "learning_rate": 1.389525993883792e-07, + "logits/chosen": -2.8225388526916504, + "logits/rejected": -2.8491692543029785, + "logps/chosen": -293.30047607421875, + "logps/rejected": -313.1904296875, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3963487148284912, + "rewards/margins": 7.192727565765381, + "rewards/rejected": -7.589076042175293, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 1.370412844036697e-07, + "logits/chosen": -2.8727283477783203, + "logits/rejected": -2.850238561630249, + "logps/chosen": -306.93695068359375, + "logps/rejected": -345.2283020019531, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09160284698009491, + "rewards/margins": 7.823256492614746, + "rewards/rejected": -7.731653690338135, + "step": 2190 + }, + { + "epoch": 2.27, + "learning_rate": 1.3512996941896024e-07, + "logits/chosen": -2.8500583171844482, + "logits/rejected": -2.8594961166381836, + "logps/chosen": -283.15771484375, + "logps/rejected": -311.3097839355469, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03272407129406929, + "rewards/margins": 8.4783353805542, + "rewards/rejected": -8.445611953735352, + "step": 2200 + }, + { + "epoch": 2.27, + "eval_logits/chosen": -2.806851387023926, + "eval_logits/rejected": -2.812812328338623, + "eval_logps/chosen": -386.09039306640625, + "eval_logps/rejected": -342.2704162597656, + "eval_loss": 0.6702452898025513, + "eval_rewards/accuracies": 0.7420634627342224, + "eval_rewards/chosen": -1.8898613452911377, + "eval_rewards/margins": 3.1405844688415527, + "eval_rewards/rejected": -5.0304460525512695, + "eval_runtime": 165.1336, + "eval_samples_per_second": 12.111, + "eval_steps_per_second": 0.382, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 1.3321865443425075e-07, + "logits/chosen": -2.7931952476501465, + "logits/rejected": -2.8073198795318604, + "logps/chosen": -338.2393493652344, + "logps/rejected": -352.142333984375, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13923540711402893, + "rewards/margins": 8.090972900390625, + "rewards/rejected": -8.230208396911621, + "step": 2210 + }, + { + "epoch": 2.29, + "learning_rate": 1.3130733944954128e-07, + "logits/chosen": -2.74983811378479, + "logits/rejected": -2.748617649078369, + "logps/chosen": -358.42401123046875, + "logps/rejected": -402.30328369140625, + "loss": 0.0129, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6026802062988281, + "rewards/margins": 9.088810920715332, + "rewards/rejected": -7.4861297607421875, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 1.293960244648318e-07, + "logits/chosen": -2.8457603454589844, + "logits/rejected": -2.8344614505767822, + "logps/chosen": -365.7544860839844, + "logps/rejected": -347.2682189941406, + "loss": 0.0163, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.001628613448701799, + "rewards/margins": 8.202213287353516, + "rewards/rejected": -8.203841209411621, + "step": 2230 + }, + { + "epoch": 2.31, + "learning_rate": 1.2748470948012232e-07, + "logits/chosen": -2.8093724250793457, + "logits/rejected": -2.81803822517395, + "logps/chosen": -340.55352783203125, + "logps/rejected": -407.7304992675781, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47526517510414124, + "rewards/margins": 8.480849266052246, + "rewards/rejected": -8.005583763122559, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 1.2557339449541285e-07, + "logits/chosen": -2.8672242164611816, + "logits/rejected": -2.855675220489502, + "logps/chosen": -343.7786865234375, + "logps/rejected": -365.4543151855469, + "loss": 0.0212, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.35550713539123535, + "rewards/margins": 8.239429473876953, + "rewards/rejected": -7.8839240074157715, + "step": 2250 + }, + { + "epoch": 2.33, + "learning_rate": 1.2366207951070336e-07, + "logits/chosen": -2.8647074699401855, + "logits/rejected": -2.8598999977111816, + "logps/chosen": -335.4911193847656, + "logps/rejected": -369.7025146484375, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1035120040178299, + "rewards/margins": 8.297709465026855, + "rewards/rejected": -8.401222229003906, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 1.217507645259939e-07, + "logits/chosen": -2.8038744926452637, + "logits/rejected": -2.8534445762634277, + "logps/chosen": -327.49005126953125, + "logps/rejected": -348.63116455078125, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.261627733707428, + "rewards/margins": 8.056116104125977, + "rewards/rejected": -7.794488430023193, + "step": 2270 + }, + { + "epoch": 2.35, + "learning_rate": 1.198394495412844e-07, + "logits/chosen": -2.879183053970337, + "logits/rejected": -2.9233028888702393, + "logps/chosen": -337.91790771484375, + "logps/rejected": -346.1882019042969, + "loss": 0.0176, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.12253670394420624, + "rewards/margins": 7.606545925140381, + "rewards/rejected": -7.4840087890625, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 1.1792813455657493e-07, + "logits/chosen": -2.7606253623962402, + "logits/rejected": -2.8114898204803467, + "logps/chosen": -337.6861877441406, + "logps/rejected": -345.7854309082031, + "loss": 0.0258, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3526005446910858, + "rewards/margins": 8.258612632751465, + "rewards/rejected": -7.906012058258057, + "step": 2290 + }, + { + "epoch": 2.37, + "learning_rate": 1.1601681957186543e-07, + "logits/chosen": -2.8984854221343994, + "logits/rejected": -2.912468910217285, + "logps/chosen": -334.9092102050781, + "logps/rejected": -334.67669677734375, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7173303961753845, + "rewards/margins": 7.802558898925781, + "rewards/rejected": -7.085227966308594, + "step": 2300 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -2.8489737510681152, + "eval_logits/rejected": -2.854724407196045, + "eval_logps/chosen": -383.90655517578125, + "eval_logps/rejected": -339.3347473144531, + "eval_loss": 0.6559089422225952, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -1.6714773178100586, + "eval_rewards/margins": 3.0653984546661377, + "eval_rewards/rejected": -4.736875534057617, + "eval_runtime": 164.8339, + "eval_samples_per_second": 12.133, + "eval_steps_per_second": 0.382, + "step": 2300 + }, + { + "epoch": 2.38, + "learning_rate": 1.1410550458715595e-07, + "logits/chosen": -2.8347411155700684, + "logits/rejected": -2.851090908050537, + "logps/chosen": -329.1361999511719, + "logps/rejected": -359.9030456542969, + "loss": 0.0278, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.2210700958967209, + "rewards/margins": 7.436942100524902, + "rewards/rejected": -7.215872287750244, + "step": 2310 + }, + { + "epoch": 2.39, + "learning_rate": 1.1219418960244648e-07, + "logits/chosen": -2.8228423595428467, + "logits/rejected": -2.841404438018799, + "logps/chosen": -282.3636169433594, + "logps/rejected": -409.47979736328125, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2855607867240906, + "rewards/margins": 8.16025161743164, + "rewards/rejected": -8.445813179016113, + "step": 2320 + }, + { + "epoch": 2.4, + "learning_rate": 1.10282874617737e-07, + "logits/chosen": -2.8471851348876953, + "logits/rejected": -2.8798093795776367, + "logps/chosen": -295.41900634765625, + "logps/rejected": -340.5544738769531, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3259337544441223, + "rewards/margins": 8.084188461303711, + "rewards/rejected": -8.410122871398926, + "step": 2330 + }, + { + "epoch": 2.41, + "learning_rate": 1.0837155963302752e-07, + "logits/chosen": -2.636784076690674, + "logits/rejected": -2.740302562713623, + "logps/chosen": -287.13702392578125, + "logps/rejected": -391.1552429199219, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.528149425983429, + "rewards/margins": 9.060527801513672, + "rewards/rejected": -8.532378196716309, + "step": 2340 + }, + { + "epoch": 2.43, + "learning_rate": 1.0646024464831804e-07, + "logits/chosen": -2.753213882446289, + "logits/rejected": -2.822252035140991, + "logps/chosen": -369.473388671875, + "logps/rejected": -360.38983154296875, + "loss": 0.0221, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3261668086051941, + "rewards/margins": 8.502967834472656, + "rewards/rejected": -8.829133033752441, + "step": 2350 + }, + { + "epoch": 2.44, + "learning_rate": 1.0454892966360856e-07, + "logits/chosen": -2.782691717147827, + "logits/rejected": -2.868027448654175, + "logps/chosen": -333.0803527832031, + "logps/rejected": -355.0961608886719, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5834169983863831, + "rewards/margins": 8.312009811401367, + "rewards/rejected": -8.895425796508789, + "step": 2360 + }, + { + "epoch": 2.45, + "learning_rate": 1.0263761467889908e-07, + "logits/chosen": -2.770711898803711, + "logits/rejected": -2.796137809753418, + "logps/chosen": -336.739990234375, + "logps/rejected": -372.0965576171875, + "loss": 0.0151, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.35953274369239807, + "rewards/margins": 8.809865951538086, + "rewards/rejected": -8.450332641601562, + "step": 2370 + }, + { + "epoch": 2.46, + "learning_rate": 1.007262996941896e-07, + "logits/chosen": -2.7547390460968018, + "logits/rejected": -2.7793593406677246, + "logps/chosen": -335.936279296875, + "logps/rejected": -330.6647033691406, + "loss": 0.0167, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.02794322930276394, + "rewards/margins": 8.719170570373535, + "rewards/rejected": -8.747113227844238, + "step": 2380 + }, + { + "epoch": 2.47, + "learning_rate": 9.881498470948011e-08, + "logits/chosen": -2.846524477005005, + "logits/rejected": -2.799567222595215, + "logps/chosen": -343.198486328125, + "logps/rejected": -335.6533508300781, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013063406571745872, + "rewards/margins": 8.952108383178711, + "rewards/rejected": -8.965171813964844, + "step": 2390 + }, + { + "epoch": 2.48, + "learning_rate": 9.690366972477065e-08, + "logits/chosen": -2.85577392578125, + "logits/rejected": -2.8093135356903076, + "logps/chosen": -333.2208251953125, + "logps/rejected": -358.0810241699219, + "loss": 0.0142, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.41089487075805664, + "rewards/margins": 8.724878311157227, + "rewards/rejected": -9.135773658752441, + "step": 2400 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -2.835172653198242, + "eval_logits/rejected": -2.839359760284424, + "eval_logps/chosen": -386.6546936035156, + "eval_logps/rejected": -343.19000244140625, + "eval_loss": 0.6733575463294983, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": -1.946290373802185, + "eval_rewards/margins": 3.1761116981506348, + "eval_rewards/rejected": -5.122402191162109, + "eval_runtime": 165.3843, + "eval_samples_per_second": 12.093, + "eval_steps_per_second": 0.381, + "step": 2400 + }, + { + "epoch": 2.49, + "learning_rate": 9.499235474006116e-08, + "logits/chosen": -2.846043109893799, + "logits/rejected": -2.8555102348327637, + "logps/chosen": -376.3670349121094, + "logps/rejected": -341.2032470703125, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03427610173821449, + "rewards/margins": 8.654411315917969, + "rewards/rejected": -8.688688278198242, + "step": 2410 + }, + { + "epoch": 2.5, + "learning_rate": 9.308103975535168e-08, + "logits/chosen": -2.8411316871643066, + "logits/rejected": -2.8570432662963867, + "logps/chosen": -373.59844970703125, + "logps/rejected": -401.067138671875, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0549451112747192, + "rewards/margins": 8.842530250549316, + "rewards/rejected": -7.787585258483887, + "step": 2420 + }, + { + "epoch": 2.51, + "learning_rate": 9.116972477064219e-08, + "logits/chosen": -2.895292282104492, + "logits/rejected": -2.854443073272705, + "logps/chosen": -345.359375, + "logps/rejected": -408.4029846191406, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30679136514663696, + "rewards/margins": 7.920645713806152, + "rewards/rejected": -8.227437019348145, + "step": 2430 + }, + { + "epoch": 2.52, + "learning_rate": 8.925840978593272e-08, + "logits/chosen": -2.835501194000244, + "logits/rejected": -2.896915912628174, + "logps/chosen": -264.5487365722656, + "logps/rejected": -387.1824951171875, + "loss": 0.019, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3389735221862793, + "rewards/margins": 9.347002029418945, + "rewards/rejected": -9.008028030395508, + "step": 2440 + }, + { + "epoch": 2.53, + "learning_rate": 8.734709480122324e-08, + "logits/chosen": -2.806790828704834, + "logits/rejected": -2.8148555755615234, + "logps/chosen": -308.4158630371094, + "logps/rejected": -376.0751953125, + "loss": 0.0166, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.38033682107925415, + "rewards/margins": 8.450287818908691, + "rewards/rejected": -8.8306245803833, + "step": 2450 + }, + { + "epoch": 2.54, + "learning_rate": 8.543577981651376e-08, + "logits/chosen": -2.7967381477355957, + "logits/rejected": -2.792023181915283, + "logps/chosen": -455.0721740722656, + "logps/rejected": -405.89501953125, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7061554789543152, + "rewards/margins": 9.382705688476562, + "rewards/rejected": -8.676549911499023, + "step": 2460 + }, + { + "epoch": 2.55, + "learning_rate": 8.352446483180428e-08, + "logits/chosen": -2.8607754707336426, + "logits/rejected": -2.8268520832061768, + "logps/chosen": -331.96820068359375, + "logps/rejected": -321.39422607421875, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4299864172935486, + "rewards/margins": 8.559895515441895, + "rewards/rejected": -8.129908561706543, + "step": 2470 + }, + { + "epoch": 2.56, + "learning_rate": 8.161314984709481e-08, + "logits/chosen": -2.8822827339172363, + "logits/rejected": -2.893578052520752, + "logps/chosen": -339.42449951171875, + "logps/rejected": -356.1263427734375, + "loss": 0.0083, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2752775549888611, + "rewards/margins": 8.704290390014648, + "rewards/rejected": -8.4290132522583, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 7.970183486238531e-08, + "logits/chosen": -2.828721284866333, + "logits/rejected": -2.833087205886841, + "logps/chosen": -328.60418701171875, + "logps/rejected": -360.6470642089844, + "loss": 0.0176, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.30725544691085815, + "rewards/margins": 8.530462265014648, + "rewards/rejected": -8.223207473754883, + "step": 2490 + }, + { + "epoch": 2.58, + "learning_rate": 7.779051987767583e-08, + "logits/chosen": -2.8711142539978027, + "logits/rejected": -2.892519950866699, + "logps/chosen": -370.29339599609375, + "logps/rejected": -355.296875, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0712406188249588, + "rewards/margins": 8.155640602111816, + "rewards/rejected": -8.084399223327637, + "step": 2500 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -2.8333258628845215, + "eval_logits/rejected": -2.8368897438049316, + "eval_logps/chosen": -388.3058776855469, + "eval_logps/rejected": -347.57440185546875, + "eval_loss": 0.6890397667884827, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -2.1114044189453125, + "eval_rewards/margins": 3.4494407176971436, + "eval_rewards/rejected": -5.560845375061035, + "eval_runtime": 164.7492, + "eval_samples_per_second": 12.14, + "eval_steps_per_second": 0.382, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 7.587920489296635e-08, + "logits/chosen": -2.855881690979004, + "logits/rejected": -2.8854427337646484, + "logps/chosen": -351.69769287109375, + "logps/rejected": -358.4553527832031, + "loss": 0.0164, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2860868573188782, + "rewards/margins": 8.237478256225586, + "rewards/rejected": -8.523565292358398, + "step": 2510 + }, + { + "epoch": 2.6, + "learning_rate": 7.396788990825688e-08, + "logits/chosen": -2.8813681602478027, + "logits/rejected": -2.9079108238220215, + "logps/chosen": -322.7754821777344, + "logps/rejected": -327.5832824707031, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06775089353322983, + "rewards/margins": 8.004460334777832, + "rewards/rejected": -8.072211265563965, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 7.20565749235474e-08, + "logits/chosen": -2.810084819793701, + "logits/rejected": -2.815389394760132, + "logps/chosen": -325.9468688964844, + "logps/rejected": -330.6631164550781, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12425418943166733, + "rewards/margins": 8.769124984741211, + "rewards/rejected": -8.893379211425781, + "step": 2530 + }, + { + "epoch": 2.62, + "learning_rate": 7.014525993883792e-08, + "logits/chosen": -2.7919540405273438, + "logits/rejected": -2.7934675216674805, + "logps/chosen": -353.1927185058594, + "logps/rejected": -365.3847351074219, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2072305679321289, + "rewards/margins": 8.390886306762695, + "rewards/rejected": -8.18365478515625, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 6.823394495412843e-08, + "logits/chosen": -2.7678780555725098, + "logits/rejected": -2.765697479248047, + "logps/chosen": -358.8880615234375, + "logps/rejected": -376.55706787109375, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11311036348342896, + "rewards/margins": 9.944357872009277, + "rewards/rejected": -9.83124828338623, + "step": 2550 + }, + { + "epoch": 2.64, + "learning_rate": 6.632262996941895e-08, + "logits/chosen": -2.834345817565918, + "logits/rejected": -2.7858288288116455, + "logps/chosen": -336.33648681640625, + "logps/rejected": -366.84991455078125, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12319626659154892, + "rewards/margins": 9.35567569732666, + "rewards/rejected": -9.478872299194336, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 6.441131498470948e-08, + "logits/chosen": -2.847996950149536, + "logits/rejected": -2.863615036010742, + "logps/chosen": -304.58502197265625, + "logps/rejected": -352.5277404785156, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37131237983703613, + "rewards/margins": 8.316202163696289, + "rewards/rejected": -8.687514305114746, + "step": 2570 + }, + { + "epoch": 2.66, + "learning_rate": 6.25e-08, + "logits/chosen": -2.8685457706451416, + "logits/rejected": -2.876739501953125, + "logps/chosen": -394.1883850097656, + "logps/rejected": -382.19287109375, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5528956055641174, + "rewards/margins": 9.068865776062012, + "rewards/rejected": -8.515970230102539, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 6.058868501529052e-08, + "logits/chosen": -2.9075653553009033, + "logits/rejected": -2.8715763092041016, + "logps/chosen": -366.0291442871094, + "logps/rejected": -358.59381103515625, + "loss": 0.0202, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.14853370189666748, + "rewards/margins": 8.612794876098633, + "rewards/rejected": -8.464262008666992, + "step": 2590 + }, + { + "epoch": 2.68, + "learning_rate": 5.8677370030581035e-08, + "logits/chosen": -2.797910213470459, + "logits/rejected": -2.840148687362671, + "logps/chosen": -331.3750305175781, + "logps/rejected": -344.34332275390625, + "loss": 0.011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.22872698307037354, + "rewards/margins": 8.751152038574219, + "rewards/rejected": -8.979879379272461, + "step": 2600 + }, + { + "epoch": 2.68, + "eval_logits/chosen": -2.8258047103881836, + "eval_logits/rejected": -2.8298983573913574, + "eval_logps/chosen": -390.2113952636719, + "eval_logps/rejected": -350.03887939453125, + "eval_loss": 0.6998910307884216, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -2.301961660385132, + "eval_rewards/margins": 3.5053274631500244, + "eval_rewards/rejected": -5.807290077209473, + "eval_runtime": 164.7101, + "eval_samples_per_second": 12.143, + "eval_steps_per_second": 0.382, + "step": 2600 + }, + { + "epoch": 2.69, + "learning_rate": 5.6766055045871554e-08, + "logits/chosen": -2.837218761444092, + "logits/rejected": -2.8603646755218506, + "logps/chosen": -325.1515197753906, + "logps/rejected": -377.93707275390625, + "loss": 0.0122, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7425957918167114, + "rewards/margins": 9.372137069702148, + "rewards/rejected": -8.629541397094727, + "step": 2610 + }, + { + "epoch": 2.7, + "learning_rate": 5.485474006116208e-08, + "logits/chosen": -2.859614372253418, + "logits/rejected": -2.907731294631958, + "logps/chosen": -323.7126159667969, + "logps/rejected": -337.7955627441406, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9508829116821289, + "rewards/margins": 9.375367164611816, + "rewards/rejected": -8.424482345581055, + "step": 2620 + }, + { + "epoch": 2.71, + "learning_rate": 5.294342507645259e-08, + "logits/chosen": -2.8355846405029297, + "logits/rejected": -2.8445563316345215, + "logps/chosen": -387.78021240234375, + "logps/rejected": -341.34332275390625, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06057599186897278, + "rewards/margins": 8.586808204650879, + "rewards/rejected": -8.647383689880371, + "step": 2630 + }, + { + "epoch": 2.72, + "learning_rate": 5.1032110091743117e-08, + "logits/chosen": -2.8416004180908203, + "logits/rejected": -2.8135132789611816, + "logps/chosen": -294.2474670410156, + "logps/rejected": -342.1490173339844, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0905003547668457, + "rewards/margins": 7.913638114929199, + "rewards/rejected": -9.004137992858887, + "step": 2640 + }, + { + "epoch": 2.73, + "learning_rate": 4.9120795107033635e-08, + "logits/chosen": -2.808621406555176, + "logits/rejected": -2.8129184246063232, + "logps/chosen": -361.94146728515625, + "logps/rejected": -372.2251892089844, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.09845595061779022, + "rewards/margins": 9.0397367477417, + "rewards/rejected": -9.138191223144531, + "step": 2650 + }, + { + "epoch": 2.75, + "learning_rate": 4.7209480122324154e-08, + "logits/chosen": -2.857626438140869, + "logits/rejected": -2.854701042175293, + "logps/chosen": -334.3382568359375, + "logps/rejected": -397.60504150390625, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4237436354160309, + "rewards/margins": 8.734308242797852, + "rewards/rejected": -9.158050537109375, + "step": 2660 + }, + { + "epoch": 2.76, + "learning_rate": 4.529816513761467e-08, + "logits/chosen": -2.833742618560791, + "logits/rejected": -2.848910093307495, + "logps/chosen": -376.8042297363281, + "logps/rejected": -431.098388671875, + "loss": 0.0185, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.09423612058162689, + "rewards/margins": 8.842924118041992, + "rewards/rejected": -8.937159538269043, + "step": 2670 + }, + { + "epoch": 2.77, + "learning_rate": 4.33868501529052e-08, + "logits/chosen": -2.8585665225982666, + "logits/rejected": -2.888023614883423, + "logps/chosen": -299.4255065917969, + "logps/rejected": -347.65032958984375, + "loss": 0.0227, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.39737778902053833, + "rewards/margins": 8.897387504577637, + "rewards/rejected": -9.294764518737793, + "step": 2680 + }, + { + "epoch": 2.78, + "learning_rate": 4.147553516819572e-08, + "logits/chosen": -2.7752485275268555, + "logits/rejected": -2.7679455280303955, + "logps/chosen": -288.93524169921875, + "logps/rejected": -362.3262634277344, + "loss": 0.0138, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.21344709396362305, + "rewards/margins": 8.969237327575684, + "rewards/rejected": -9.182684898376465, + "step": 2690 + }, + { + "epoch": 2.79, + "learning_rate": 3.9564220183486236e-08, + "logits/chosen": -2.771638870239258, + "logits/rejected": -2.7894372940063477, + "logps/chosen": -358.27276611328125, + "logps/rejected": -337.4700012207031, + "loss": 0.0114, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.08803452551364899, + "rewards/margins": 8.622003555297852, + "rewards/rejected": -8.710036277770996, + "step": 2700 + }, + { + "epoch": 2.79, + "eval_logits/chosen": -2.817159414291382, + "eval_logits/rejected": -2.820690393447876, + "eval_logps/chosen": -389.5738525390625, + "eval_logps/rejected": -348.8511962890625, + "eval_loss": 0.6951248645782471, + "eval_rewards/accuracies": 0.7698412537574768, + "eval_rewards/chosen": -2.238208055496216, + "eval_rewards/margins": 3.4503118991851807, + "eval_rewards/rejected": -5.6885199546813965, + "eval_runtime": 164.1407, + "eval_samples_per_second": 12.185, + "eval_steps_per_second": 0.384, + "step": 2700 + }, + { + "epoch": 2.8, + "learning_rate": 3.7652905198776755e-08, + "logits/chosen": -2.8291115760803223, + "logits/rejected": -2.812997817993164, + "logps/chosen": -361.16973876953125, + "logps/rejected": -371.3473205566406, + "loss": 0.0109, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2673804759979248, + "rewards/margins": 8.214799880981445, + "rewards/rejected": -8.482179641723633, + "step": 2710 + }, + { + "epoch": 2.81, + "learning_rate": 3.574159021406728e-08, + "logits/chosen": -2.8169431686401367, + "logits/rejected": -2.780579090118408, + "logps/chosen": -340.25567626953125, + "logps/rejected": -452.1532287597656, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4266482889652252, + "rewards/margins": 8.901152610778809, + "rewards/rejected": -9.327801704406738, + "step": 2720 + }, + { + "epoch": 2.82, + "learning_rate": 3.383027522935779e-08, + "logits/chosen": -2.852733850479126, + "logits/rejected": -2.8627407550811768, + "logps/chosen": -345.10504150390625, + "logps/rejected": -381.2701416015625, + "loss": 0.0144, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6709401607513428, + "rewards/margins": 9.368196487426758, + "rewards/rejected": -8.697256088256836, + "step": 2730 + }, + { + "epoch": 2.83, + "learning_rate": 3.191896024464832e-08, + "logits/chosen": -2.847033977508545, + "logits/rejected": -2.880303382873535, + "logps/chosen": -344.7592468261719, + "logps/rejected": -366.505615234375, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2561042606830597, + "rewards/margins": 9.626019477844238, + "rewards/rejected": -9.369915008544922, + "step": 2740 + }, + { + "epoch": 2.84, + "learning_rate": 3.0007645259938836e-08, + "logits/chosen": -2.829150676727295, + "logits/rejected": -2.8306522369384766, + "logps/chosen": -299.6269836425781, + "logps/rejected": -362.5341796875, + "loss": 0.0191, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.312942773103714, + "rewards/margins": 10.283103942871094, + "rewards/rejected": -9.970161437988281, + "step": 2750 + }, + { + "epoch": 2.85, + "learning_rate": 2.809633027522936e-08, + "logits/chosen": -2.815882444381714, + "logits/rejected": -2.7824299335479736, + "logps/chosen": -315.24993896484375, + "logps/rejected": -347.3058776855469, + "loss": 0.0207, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3008100688457489, + "rewards/margins": 8.677629470825195, + "rewards/rejected": -8.978440284729004, + "step": 2760 + }, + { + "epoch": 2.86, + "learning_rate": 2.6185015290519877e-08, + "logits/chosen": -2.8008246421813965, + "logits/rejected": -2.7953882217407227, + "logps/chosen": -333.69329833984375, + "logps/rejected": -373.4231872558594, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08695399761199951, + "rewards/margins": 7.9680304527282715, + "rewards/rejected": -8.054986000061035, + "step": 2770 + }, + { + "epoch": 2.87, + "learning_rate": 2.4273700305810396e-08, + "logits/chosen": -2.790097951889038, + "logits/rejected": -2.827036142349243, + "logps/chosen": -378.98236083984375, + "logps/rejected": -420.4288635253906, + "loss": 0.0135, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.11630038917064667, + "rewards/margins": 10.843367576599121, + "rewards/rejected": -10.727069854736328, + "step": 2780 + }, + { + "epoch": 2.88, + "learning_rate": 2.2362385321100918e-08, + "logits/chosen": -2.7879481315612793, + "logits/rejected": -2.7845988273620605, + "logps/chosen": -350.80572509765625, + "logps/rejected": -345.9007873535156, + "loss": 0.018, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.001116895698942244, + "rewards/margins": 8.821355819702148, + "rewards/rejected": -8.822473526000977, + "step": 2790 + }, + { + "epoch": 2.89, + "learning_rate": 2.0451070336391437e-08, + "logits/chosen": -2.7627055644989014, + "logits/rejected": -2.7335832118988037, + "logps/chosen": -341.35662841796875, + "logps/rejected": -318.8877258300781, + "loss": 0.0437, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.1377846747636795, + "rewards/margins": 8.672611236572266, + "rewards/rejected": -8.534826278686523, + "step": 2800 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -2.8117332458496094, + "eval_logits/rejected": -2.8151025772094727, + "eval_logps/chosen": -389.4859924316406, + "eval_logps/rejected": -348.1217346191406, + "eval_loss": 0.6910788416862488, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -2.229426622390747, + "eval_rewards/margins": 3.3861491680145264, + "eval_rewards/rejected": -5.615575313568115, + "eval_runtime": 165.138, + "eval_samples_per_second": 12.111, + "eval_steps_per_second": 0.381, + "step": 2800 + }, + { + "epoch": 2.9, + "learning_rate": 1.8539755351681956e-08, + "logits/chosen": -2.7365012168884277, + "logits/rejected": -2.788407325744629, + "logps/chosen": -330.33197021484375, + "logps/rejected": -380.91168212890625, + "loss": 0.0163, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.0645025223493576, + "rewards/margins": 8.317387580871582, + "rewards/rejected": -8.381890296936035, + "step": 2810 + }, + { + "epoch": 2.91, + "learning_rate": 1.6628440366972478e-08, + "logits/chosen": -2.8033618927001953, + "logits/rejected": -2.8255763053894043, + "logps/chosen": -373.3817443847656, + "logps/rejected": -359.75421142578125, + "loss": 0.0142, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.008309101685881615, + "rewards/margins": 8.045055389404297, + "rewards/rejected": -8.03674602508545, + "step": 2820 + }, + { + "epoch": 2.92, + "learning_rate": 1.4717125382262997e-08, + "logits/chosen": -2.854548692703247, + "logits/rejected": -2.8665812015533447, + "logps/chosen": -339.0101318359375, + "logps/rejected": -377.62847900390625, + "loss": 0.0123, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6132253408432007, + "rewards/margins": 8.178349494934082, + "rewards/rejected": -8.791574478149414, + "step": 2830 + }, + { + "epoch": 2.93, + "learning_rate": 1.2805810397553517e-08, + "logits/chosen": -2.8801310062408447, + "logits/rejected": -2.826385021209717, + "logps/chosen": -346.5010681152344, + "logps/rejected": -360.3970031738281, + "loss": 0.026, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.022508572787046432, + "rewards/margins": 8.316872596740723, + "rewards/rejected": -8.33938217163086, + "step": 2840 + }, + { + "epoch": 2.94, + "learning_rate": 1.0894495412844038e-08, + "logits/chosen": -2.758545160293579, + "logits/rejected": -2.7856967449188232, + "logps/chosen": -326.37896728515625, + "logps/rejected": -359.3761291503906, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21567471325397491, + "rewards/margins": 8.950045585632324, + "rewards/rejected": -8.734369277954102, + "step": 2850 + }, + { + "epoch": 2.95, + "learning_rate": 8.983180428134555e-09, + "logits/chosen": -2.8466389179229736, + "logits/rejected": -2.8278822898864746, + "logps/chosen": -327.270751953125, + "logps/rejected": -307.3416748046875, + "loss": 0.0158, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5247852802276611, + "rewards/margins": 8.214715957641602, + "rewards/rejected": -8.739501953125, + "step": 2860 + }, + { + "epoch": 2.96, + "learning_rate": 7.071865443425076e-09, + "logits/chosen": -2.7788777351379395, + "logits/rejected": -2.7975292205810547, + "logps/chosen": -361.37384033203125, + "logps/rejected": -391.8774719238281, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35571426153182983, + "rewards/margins": 8.981501579284668, + "rewards/rejected": -9.337217330932617, + "step": 2870 + }, + { + "epoch": 2.97, + "learning_rate": 5.1605504587155965e-09, + "logits/chosen": -2.8489837646484375, + "logits/rejected": -2.7892398834228516, + "logps/chosen": -342.7001953125, + "logps/rejected": -358.37469482421875, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04198342561721802, + "rewards/margins": 8.752424240112305, + "rewards/rejected": -8.710439682006836, + "step": 2880 + }, + { + "epoch": 2.98, + "learning_rate": 3.249235474006116e-09, + "logits/chosen": -2.8391404151916504, + "logits/rejected": -2.862032175064087, + "logps/chosen": -332.72100830078125, + "logps/rejected": -362.6523132324219, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023767167702317238, + "rewards/margins": 8.220497131347656, + "rewards/rejected": -8.19672966003418, + "step": 2890 + }, + { + "epoch": 2.99, + "learning_rate": 1.3379204892966359e-09, + "logits/chosen": -2.854654550552368, + "logits/rejected": -2.8116354942321777, + "logps/chosen": -330.1148986816406, + "logps/rejected": -358.40155029296875, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3439770042896271, + "rewards/margins": 8.63255500793457, + "rewards/rejected": -8.976531028747559, + "step": 2900 + }, + { + "epoch": 2.99, + "eval_logits/chosen": -2.814802408218384, + "eval_logits/rejected": -2.818735122680664, + "eval_logps/chosen": -389.9677429199219, + "eval_logps/rejected": -348.89801025390625, + "eval_loss": 0.6909257769584656, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -2.2775967121124268, + "eval_rewards/margins": 3.415607452392578, + "eval_rewards/rejected": -5.693204402923584, + "eval_runtime": 164.8452, + "eval_samples_per_second": 12.133, + "eval_steps_per_second": 0.382, + "step": 2900 + }, + { + "epoch": 3.0, + "step": 2907, + "total_flos": 0.0, + "train_loss": 0.23139607249449978, + "train_runtime": 34004.0578, + "train_samples_per_second": 5.467, + "train_steps_per_second": 0.085 + } + ], + "logging_steps": 10, + "max_steps": 2907, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}