{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 8826, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.662514156285391e-10, "logits/chosen": -2.4580037593841553, "logits/rejected": -2.5939767360687256, "logps/chosen": -243.83958435058594, "logps/rejected": -714.9505004882812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 5.66251415628539e-09, "logits/chosen": -2.6363182067871094, "logits/rejected": -2.650538444519043, "logps/chosen": -474.0602111816406, "logps/rejected": -478.1452331542969, "loss": 0.6937, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": -0.0010112549643963575, "rewards/margins": -0.0012499317526817322, "rewards/rejected": 0.0002386771811870858, "step": 10 }, { "epoch": 0.01, "learning_rate": 1.132502831257078e-08, "logits/chosen": -2.624431610107422, "logits/rejected": -2.706930637359619, "logps/chosen": -245.6259307861328, "logps/rejected": -396.68994140625, "loss": 0.6818, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.013883018866181374, "rewards/margins": 0.02570372261106968, "rewards/rejected": -0.011820705607533455, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.698754246885617e-08, "logits/chosen": -2.504279613494873, "logits/rejected": -2.553950786590576, "logps/chosen": -387.2401428222656, "logps/rejected": -394.9368591308594, "loss": 0.6402, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.048846740275621414, "rewards/margins": 0.09468663483858109, "rewards/rejected": -0.04583989828824997, "step": 30 }, { "epoch": 0.01, "learning_rate": 2.265005662514156e-08, "logits/chosen": -2.597482204437256, "logits/rejected": -2.7208497524261475, "logps/chosen": -259.1339416503906, "logps/rejected": -430.7806701660156, "loss": 0.5622, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.13973958790302277, "rewards/margins": 0.28847968578338623, "rewards/rejected": -0.14874012768268585, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.8312570781426952e-08, "logits/chosen": -2.720853328704834, "logits/rejected": -2.554872751235962, "logps/chosen": -207.9132537841797, "logps/rejected": -521.3519897460938, "loss": 0.464, "rewards/accuracies": 0.9375, "rewards/chosen": 0.352109432220459, "rewards/margins": 0.6585723757743835, "rewards/rejected": -0.3064630329608917, "step": 50 }, { "epoch": 0.02, "learning_rate": 3.397508493771234e-08, "logits/chosen": -2.685779094696045, "logits/rejected": -2.7349629402160645, "logps/chosen": -276.85235595703125, "logps/rejected": -298.54736328125, "loss": 0.3658, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5488882064819336, "rewards/margins": 1.0533435344696045, "rewards/rejected": -0.5044553279876709, "step": 60 }, { "epoch": 0.02, "learning_rate": 3.9637599093997736e-08, "logits/chosen": -2.684504508972168, "logits/rejected": -2.665677070617676, "logps/chosen": -268.5458679199219, "logps/rejected": -463.054443359375, "loss": 0.3165, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5929225087165833, "rewards/margins": 1.2806422710418701, "rewards/rejected": -0.6877198219299316, "step": 70 }, { "epoch": 0.03, "learning_rate": 4.530011325028312e-08, "logits/chosen": -2.6040701866149902, "logits/rejected": -2.630781888961792, "logps/chosen": -389.4125671386719, "logps/rejected": -363.0667419433594, "loss": 0.2849, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6518958210945129, "rewards/margins": 1.5321602821350098, "rewards/rejected": -0.8802644610404968, "step": 80 }, { "epoch": 0.03, "learning_rate": 5.096262740656852e-08, "logits/chosen": -2.667577028274536, "logits/rejected": -2.6753334999084473, "logps/chosen": -220.32681274414062, "logps/rejected": -263.08758544921875, "loss": 0.2186, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.9696056246757507, "rewards/margins": 2.209392547607422, "rewards/rejected": -1.2397868633270264, "step": 90 }, { "epoch": 0.03, "learning_rate": 5.6625141562853904e-08, "logits/chosen": -2.683596134185791, "logits/rejected": -2.736514091491699, "logps/chosen": -198.9838104248047, "logps/rejected": -397.607421875, "loss": 0.2048, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1258020401000977, "rewards/margins": 2.909505605697632, "rewards/rejected": -1.7837038040161133, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": -2.759521722793579, "eval_logits/rejected": -2.6882898807525635, "eval_logps/chosen": -252.78482055664062, "eval_logps/rejected": -418.130615234375, "eval_loss": 0.18157674372196198, "eval_rewards/accuracies": 0.9528619647026062, "eval_rewards/chosen": 1.0171663761138916, "eval_rewards/margins": 2.6126952171325684, "eval_rewards/rejected": -1.5955286026000977, "eval_runtime": 462.7267, "eval_samples_per_second": 20.53, "eval_steps_per_second": 0.642, "step": 100 }, { "epoch": 0.04, "learning_rate": 6.22876557191393e-08, "logits/chosen": -2.7251458168029785, "logits/rejected": -2.51692271232605, "logps/chosen": -204.46116638183594, "logps/rejected": -626.9728393554688, "loss": 0.1745, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3071792125701904, "rewards/margins": 2.7273447513580322, "rewards/rejected": -1.420165777206421, "step": 110 }, { "epoch": 0.04, "learning_rate": 6.795016987542468e-08, "logits/chosen": -2.6868886947631836, "logits/rejected": -2.667238712310791, "logps/chosen": -214.7142791748047, "logps/rejected": -410.58367919921875, "loss": 0.1537, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6009222269058228, "rewards/margins": 3.4552299976348877, "rewards/rejected": -1.8543075323104858, "step": 120 }, { "epoch": 0.04, "learning_rate": 7.361268403171007e-08, "logits/chosen": -2.599154472351074, "logits/rejected": -2.68690824508667, "logps/chosen": -258.9677429199219, "logps/rejected": -450.38922119140625, "loss": 0.1618, "rewards/accuracies": 1.0, "rewards/chosen": 1.4462578296661377, "rewards/margins": 3.623924732208252, "rewards/rejected": -2.1776671409606934, "step": 130 }, { "epoch": 0.05, "learning_rate": 7.927519818799547e-08, "logits/chosen": -2.663123369216919, "logits/rejected": -2.6354243755340576, "logps/chosen": -175.37716674804688, "logps/rejected": -480.14642333984375, "loss": 0.1375, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2520294189453125, "rewards/margins": 3.5804004669189453, "rewards/rejected": -2.3283705711364746, "step": 140 }, { "epoch": 0.05, "learning_rate": 8.493771234428086e-08, "logits/chosen": -2.598686695098877, "logits/rejected": -2.6283838748931885, "logps/chosen": -207.43179321289062, "logps/rejected": -430.9234313964844, "loss": 0.1526, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5090426206588745, "rewards/margins": 3.692683696746826, "rewards/rejected": -2.183641195297241, "step": 150 }, { "epoch": 0.05, "learning_rate": 9.060022650056625e-08, "logits/chosen": -2.499403715133667, "logits/rejected": -2.6443779468536377, "logps/chosen": -196.49412536621094, "logps/rejected": -392.57635498046875, "loss": 0.1331, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.5876002311706543, "rewards/margins": 4.034597396850586, "rewards/rejected": -2.4469971656799316, "step": 160 }, { "epoch": 0.06, "learning_rate": 9.626274065685163e-08, "logits/chosen": -2.7243871688842773, "logits/rejected": -2.7112083435058594, "logps/chosen": -291.6788330078125, "logps/rejected": -375.7613220214844, "loss": 0.125, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.656733512878418, "rewards/margins": 4.375513076782227, "rewards/rejected": -2.7187793254852295, "step": 170 }, { "epoch": 0.06, "learning_rate": 1.0192525481313703e-07, "logits/chosen": -2.5340187549591064, "logits/rejected": -2.6408143043518066, "logps/chosen": -319.5456237792969, "logps/rejected": -510.899169921875, "loss": 0.1314, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.748708724975586, "rewards/margins": 4.384921073913574, "rewards/rejected": -2.63621187210083, "step": 180 }, { "epoch": 0.06, "learning_rate": 1.0758776896942241e-07, "logits/chosen": -2.609438896179199, "logits/rejected": -2.698153018951416, "logps/chosen": -199.8627471923828, "logps/rejected": -422.8956604003906, "loss": 0.0914, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.0498266220092773, "rewards/margins": 4.826152324676514, "rewards/rejected": -2.7763259410858154, "step": 190 }, { "epoch": 0.07, "learning_rate": 1.1325028312570781e-07, "logits/chosen": -2.59509539604187, "logits/rejected": -2.6665167808532715, "logps/chosen": -251.11514282226562, "logps/rejected": -342.45550537109375, "loss": 0.1279, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.6934995651245117, "rewards/margins": 4.567671298980713, "rewards/rejected": -2.8741719722747803, "step": 200 }, { "epoch": 0.07, "eval_logits/chosen": -2.7437760829925537, "eval_logits/rejected": -2.6659414768218994, "eval_logps/chosen": -244.02133178710938, "eval_logps/rejected": -431.390625, "eval_loss": 0.10992327332496643, "eval_rewards/accuracies": 0.9621211886405945, "eval_rewards/chosen": 1.8935197591781616, "eval_rewards/margins": 4.8150482177734375, "eval_rewards/rejected": -2.9215283393859863, "eval_runtime": 461.3671, "eval_samples_per_second": 20.591, "eval_steps_per_second": 0.644, "step": 200 }, { "epoch": 0.07, "learning_rate": 1.189127972819932e-07, "logits/chosen": -2.6131131649017334, "logits/rejected": -2.4965083599090576, "logps/chosen": -189.36961364746094, "logps/rejected": -564.0767822265625, "loss": 0.1368, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.1819887161254883, "rewards/margins": 4.7227911949157715, "rewards/rejected": -2.5408027172088623, "step": 210 }, { "epoch": 0.07, "learning_rate": 1.245753114382786e-07, "logits/chosen": -2.7222821712493896, "logits/rejected": -2.6807782649993896, "logps/chosen": -297.37371826171875, "logps/rejected": -364.626953125, "loss": 0.0674, "rewards/accuracies": 1.0, "rewards/chosen": 2.2767741680145264, "rewards/margins": 5.545619964599609, "rewards/rejected": -3.268845796585083, "step": 220 }, { "epoch": 0.08, "learning_rate": 1.3023782559456398e-07, "logits/chosen": -2.6594886779785156, "logits/rejected": -2.6263375282287598, "logps/chosen": -259.6681213378906, "logps/rejected": -450.45965576171875, "loss": 0.0799, "rewards/accuracies": 1.0, "rewards/chosen": 2.2504565715789795, "rewards/margins": 5.4989118576049805, "rewards/rejected": -3.248455762863159, "step": 230 }, { "epoch": 0.08, "learning_rate": 1.3590033975084937e-07, "logits/chosen": -2.6277785301208496, "logits/rejected": -2.5901219844818115, "logps/chosen": -242.6718292236328, "logps/rejected": -561.1946411132812, "loss": 0.0983, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.1083226203918457, "rewards/margins": 5.803457260131836, "rewards/rejected": -3.6951351165771484, "step": 240 }, { "epoch": 0.08, "learning_rate": 1.4156285390713476e-07, "logits/chosen": -2.675647020339966, "logits/rejected": -2.652252197265625, "logps/chosen": -192.68292236328125, "logps/rejected": -421.8887634277344, "loss": 0.2005, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.4014663696289062, "rewards/margins": 5.913949012756348, "rewards/rejected": -3.5124828815460205, "step": 250 }, { "epoch": 0.09, "learning_rate": 1.4722536806342014e-07, "logits/chosen": -2.621668577194214, "logits/rejected": -2.6676833629608154, "logps/chosen": -248.64083862304688, "logps/rejected": -435.6402893066406, "loss": 0.0788, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.5777688026428223, "rewards/margins": 5.418545722961426, "rewards/rejected": -2.8407769203186035, "step": 260 }, { "epoch": 0.09, "learning_rate": 1.5288788221970556e-07, "logits/chosen": -2.69730806350708, "logits/rejected": -2.615682363510132, "logps/chosen": -225.73910522460938, "logps/rejected": -314.1669921875, "loss": 0.0859, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.4198713302612305, "rewards/margins": 6.052767753601074, "rewards/rejected": -3.6328964233398438, "step": 270 }, { "epoch": 0.1, "learning_rate": 1.5855039637599094e-07, "logits/chosen": -2.541067123413086, "logits/rejected": -2.6672749519348145, "logps/chosen": -199.87933349609375, "logps/rejected": -307.9038391113281, "loss": 0.1187, "rewards/accuracies": 0.9375, "rewards/chosen": 2.375232696533203, "rewards/margins": 6.675690650939941, "rewards/rejected": -4.300457954406738, "step": 280 }, { "epoch": 0.1, "learning_rate": 1.642129105322763e-07, "logits/chosen": -2.646385669708252, "logits/rejected": -2.6828160285949707, "logps/chosen": -171.08670043945312, "logps/rejected": -426.2986755371094, "loss": 0.0693, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.3382785320281982, "rewards/margins": 5.279599666595459, "rewards/rejected": -2.9413208961486816, "step": 290 }, { "epoch": 0.1, "learning_rate": 1.6987542468856172e-07, "logits/chosen": -2.665996789932251, "logits/rejected": -2.704948902130127, "logps/chosen": -190.14559936523438, "logps/rejected": -515.036865234375, "loss": 0.075, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.58656644821167, "rewards/margins": 6.179848670959473, "rewards/rejected": -3.5932822227478027, "step": 300 }, { "epoch": 0.1, "eval_logits/chosen": -2.7254638671875, "eval_logits/rejected": -2.646328926086426, "eval_logps/chosen": -236.56301879882812, "eval_logps/rejected": -433.44207763671875, "eval_loss": 0.10838426649570465, "eval_rewards/accuracies": 0.9789562225341797, "eval_rewards/chosen": 2.6393473148345947, "eval_rewards/margins": 5.766019344329834, "eval_rewards/rejected": -3.12667179107666, "eval_runtime": 460.9004, "eval_samples_per_second": 20.612, "eval_steps_per_second": 0.644, "step": 300 }, { "epoch": 0.11, "learning_rate": 1.755379388448471e-07, "logits/chosen": -2.728576183319092, "logits/rejected": -2.600494384765625, "logps/chosen": -194.33456420898438, "logps/rejected": -412.9208068847656, "loss": 0.1312, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.8017306327819824, "rewards/margins": 5.901663303375244, "rewards/rejected": -3.099932909011841, "step": 310 }, { "epoch": 0.11, "learning_rate": 1.812004530011325e-07, "logits/chosen": -2.5152015686035156, "logits/rejected": -2.582810878753662, "logps/chosen": -360.67559814453125, "logps/rejected": -343.7029724121094, "loss": 0.0956, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.9696030616760254, "rewards/margins": 6.0275115966796875, "rewards/rejected": -3.057907819747925, "step": 320 }, { "epoch": 0.11, "learning_rate": 1.868629671574179e-07, "logits/chosen": -2.598024606704712, "logits/rejected": -2.5849690437316895, "logps/chosen": -242.7597198486328, "logps/rejected": -512.992919921875, "loss": 0.0468, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.047391414642334, "rewards/margins": 6.619978427886963, "rewards/rejected": -3.5725860595703125, "step": 330 }, { "epoch": 0.12, "learning_rate": 1.9252548131370327e-07, "logits/chosen": -2.6105878353118896, "logits/rejected": -2.6640186309814453, "logps/chosen": -176.71536254882812, "logps/rejected": -435.08355712890625, "loss": 0.0675, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.9948973655700684, "rewards/margins": 6.749991416931152, "rewards/rejected": -3.755094051361084, "step": 340 }, { "epoch": 0.12, "learning_rate": 1.9818799546998865e-07, "logits/chosen": -2.6800684928894043, "logits/rejected": -2.5577774047851562, "logps/chosen": -225.32882690429688, "logps/rejected": -496.7113342285156, "loss": 0.1855, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.026808977127075, "rewards/margins": 6.551639556884766, "rewards/rejected": -3.5248305797576904, "step": 350 }, { "epoch": 0.12, "learning_rate": 2.0385050962627407e-07, "logits/chosen": -2.588452100753784, "logits/rejected": -2.6757311820983887, "logps/chosen": -173.0467529296875, "logps/rejected": -351.0324401855469, "loss": 0.0457, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.891030788421631, "rewards/margins": 7.493356227874756, "rewards/rejected": -4.602325439453125, "step": 360 }, { "epoch": 0.13, "learning_rate": 2.0951302378255946e-07, "logits/chosen": -2.6006948947906494, "logits/rejected": -2.5470387935638428, "logps/chosen": -181.95779418945312, "logps/rejected": -672.4630126953125, "loss": 0.1084, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.722691774368286, "rewards/margins": 7.066168308258057, "rewards/rejected": -4.343477249145508, "step": 370 }, { "epoch": 0.13, "learning_rate": 2.1517553793884482e-07, "logits/chosen": -2.6569361686706543, "logits/rejected": -2.5520999431610107, "logps/chosen": -201.32608032226562, "logps/rejected": -579.266845703125, "loss": 0.0717, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.1833584308624268, "rewards/margins": 7.868722438812256, "rewards/rejected": -4.685364723205566, "step": 380 }, { "epoch": 0.13, "learning_rate": 2.2083805209513023e-07, "logits/chosen": -2.5609090328216553, "logits/rejected": -2.4733877182006836, "logps/chosen": -249.2960662841797, "logps/rejected": -369.2193603515625, "loss": 0.0722, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.6326074600219727, "rewards/margins": 7.0068511962890625, "rewards/rejected": -4.37424373626709, "step": 390 }, { "epoch": 0.14, "learning_rate": 2.2650056625141562e-07, "logits/chosen": -2.6253280639648438, "logits/rejected": -2.6117076873779297, "logps/chosen": -189.1066436767578, "logps/rejected": -559.0174560546875, "loss": 0.0656, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.2104220390319824, "rewards/margins": 8.037348747253418, "rewards/rejected": -4.826926231384277, "step": 400 }, { "epoch": 0.14, "eval_logits/chosen": -2.714289426803589, "eval_logits/rejected": -2.6321334838867188, "eval_logps/chosen": -236.2027587890625, "eval_logps/rejected": -445.30218505859375, "eval_loss": 0.06701350957155228, "eval_rewards/accuracies": 0.9840067625045776, "eval_rewards/chosen": 2.67537784576416, "eval_rewards/margins": 6.988061904907227, "eval_rewards/rejected": -4.31268310546875, "eval_runtime": 460.3046, "eval_samples_per_second": 20.639, "eval_steps_per_second": 0.645, "step": 400 }, { "epoch": 0.14, "learning_rate": 2.32163080407701e-07, "logits/chosen": -2.76120662689209, "logits/rejected": -2.458223342895508, "logps/chosen": -188.09536743164062, "logps/rejected": -448.3778381347656, "loss": 0.1071, "rewards/accuracies": 0.9375, "rewards/chosen": 2.92415189743042, "rewards/margins": 6.942471504211426, "rewards/rejected": -4.018319129943848, "step": 410 }, { "epoch": 0.14, "learning_rate": 2.378255945639864e-07, "logits/chosen": -2.5892369747161865, "logits/rejected": -2.530914783477783, "logps/chosen": -190.20925903320312, "logps/rejected": -599.5638427734375, "loss": 0.0734, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.8430845737457275, "rewards/margins": 7.680964469909668, "rewards/rejected": -4.8378801345825195, "step": 420 }, { "epoch": 0.15, "learning_rate": 2.434881087202718e-07, "logits/chosen": -2.6759955883026123, "logits/rejected": -2.5184383392333984, "logps/chosen": -177.0557861328125, "logps/rejected": -535.7196655273438, "loss": 0.0693, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.9040558338165283, "rewards/margins": 7.499063014984131, "rewards/rejected": -4.595007419586182, "step": 430 }, { "epoch": 0.15, "learning_rate": 2.491506228765572e-07, "logits/chosen": -2.6400017738342285, "logits/rejected": -2.5981903076171875, "logps/chosen": -182.1441650390625, "logps/rejected": -398.22869873046875, "loss": 0.0444, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.753174066543579, "rewards/margins": 7.7085418701171875, "rewards/rejected": -4.955367088317871, "step": 440 }, { "epoch": 0.15, "learning_rate": 2.548131370328426e-07, "logits/chosen": -2.5782999992370605, "logits/rejected": -2.769003391265869, "logps/chosen": -223.99887084960938, "logps/rejected": -348.1935119628906, "loss": 0.0826, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2360503673553467, "rewards/margins": 8.73168659210205, "rewards/rejected": -6.495635032653809, "step": 450 }, { "epoch": 0.16, "learning_rate": 2.6047565118912797e-07, "logits/chosen": -2.6025919914245605, "logits/rejected": -2.6407084465026855, "logps/chosen": -244.2267608642578, "logps/rejected": -374.7513732910156, "loss": 0.0566, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.885925769805908, "rewards/margins": 8.382081031799316, "rewards/rejected": -5.49615478515625, "step": 460 }, { "epoch": 0.16, "learning_rate": 2.6613816534541335e-07, "logits/chosen": -2.6487069129943848, "logits/rejected": -2.6397781372070312, "logps/chosen": -324.04547119140625, "logps/rejected": -247.3083038330078, "loss": 0.0507, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.842961072921753, "rewards/margins": 7.92880916595459, "rewards/rejected": -5.085848808288574, "step": 470 }, { "epoch": 0.16, "learning_rate": 2.7180067950169874e-07, "logits/chosen": -2.7063021659851074, "logits/rejected": -2.5292975902557373, "logps/chosen": -184.63059997558594, "logps/rejected": -526.4676513671875, "loss": 0.0409, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.8629398345947266, "rewards/margins": 7.78806209564209, "rewards/rejected": -4.925122261047363, "step": 480 }, { "epoch": 0.17, "learning_rate": 2.7746319365798413e-07, "logits/chosen": -2.7768285274505615, "logits/rejected": -2.610732316970825, "logps/chosen": -199.6244659423828, "logps/rejected": -457.6041564941406, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": 3.019763708114624, "rewards/margins": 10.251375198364258, "rewards/rejected": -7.2316107749938965, "step": 490 }, { "epoch": 0.17, "learning_rate": 2.831257078142695e-07, "logits/chosen": -2.6548938751220703, "logits/rejected": -2.64559006690979, "logps/chosen": -306.976318359375, "logps/rejected": -263.77191162109375, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 2.8983559608459473, "rewards/margins": 7.9603424072265625, "rewards/rejected": -5.061985969543457, "step": 500 }, { "epoch": 0.17, "eval_logits/chosen": -2.726097583770752, "eval_logits/rejected": -2.6533405780792236, "eval_logps/chosen": -235.3429412841797, "eval_logps/rejected": -459.71307373046875, "eval_loss": 0.044368669390678406, "eval_rewards/accuracies": 0.9856902360916138, "eval_rewards/chosen": 2.7613587379455566, "eval_rewards/margins": 8.515129089355469, "eval_rewards/rejected": -5.75377082824707, "eval_runtime": 460.3325, "eval_samples_per_second": 20.637, "eval_steps_per_second": 0.645, "step": 500 }, { "epoch": 0.17, "learning_rate": 2.887882219705549e-07, "logits/chosen": -2.4815495014190674, "logits/rejected": -2.5616536140441895, "logps/chosen": -283.038818359375, "logps/rejected": -568.7218017578125, "loss": 0.0512, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.881859540939331, "rewards/margins": 7.950438499450684, "rewards/rejected": -5.06857967376709, "step": 510 }, { "epoch": 0.18, "learning_rate": 2.944507361268403e-07, "logits/chosen": -2.548823833465576, "logits/rejected": -2.562615156173706, "logps/chosen": -195.1108856201172, "logps/rejected": -490.55877685546875, "loss": 0.0765, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.8928675651550293, "rewards/margins": 9.328428268432617, "rewards/rejected": -6.435560703277588, "step": 520 }, { "epoch": 0.18, "learning_rate": 3.001132502831257e-07, "logits/chosen": -2.6927952766418457, "logits/rejected": -2.509363889694214, "logps/chosen": -177.90225219726562, "logps/rejected": -541.0279541015625, "loss": 0.0532, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.3859009742736816, "rewards/margins": 9.745203971862793, "rewards/rejected": -6.3593034744262695, "step": 530 }, { "epoch": 0.18, "learning_rate": 3.057757644394111e-07, "logits/chosen": -2.6255252361297607, "logits/rejected": -2.613295078277588, "logps/chosen": -175.70750427246094, "logps/rejected": -369.9935302734375, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": 3.1336193084716797, "rewards/margins": 9.239606857299805, "rewards/rejected": -6.105988502502441, "step": 540 }, { "epoch": 0.19, "learning_rate": 3.114382785956965e-07, "logits/chosen": -2.690768241882324, "logits/rejected": -2.5799264907836914, "logps/chosen": -192.04698181152344, "logps/rejected": -441.42694091796875, "loss": 0.0368, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.363586664199829, "rewards/margins": 9.361661911010742, "rewards/rejected": -6.99807596206665, "step": 550 }, { "epoch": 0.19, "learning_rate": 3.171007927519819e-07, "logits/chosen": -2.480285167694092, "logits/rejected": -2.589327096939087, "logps/chosen": -296.5298767089844, "logps/rejected": -409.7449645996094, "loss": 0.0795, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.1403679847717285, "rewards/margins": 9.489206314086914, "rewards/rejected": -7.348838806152344, "step": 560 }, { "epoch": 0.19, "learning_rate": 3.227633069082673e-07, "logits/chosen": -2.6266415119171143, "logits/rejected": -2.6312308311462402, "logps/chosen": -339.8543395996094, "logps/rejected": -321.7017517089844, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 2.3285675048828125, "rewards/margins": 11.146139144897461, "rewards/rejected": -8.817571640014648, "step": 570 }, { "epoch": 0.2, "learning_rate": 3.284258210645526e-07, "logits/chosen": -2.588715076446533, "logits/rejected": -2.67547869682312, "logps/chosen": -306.3263244628906, "logps/rejected": -451.67547607421875, "loss": 0.0497, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.395595073699951, "rewards/margins": 9.868276596069336, "rewards/rejected": -7.472679138183594, "step": 580 }, { "epoch": 0.2, "learning_rate": 3.34088335220838e-07, "logits/chosen": -2.7353405952453613, "logits/rejected": -2.629629373550415, "logps/chosen": -190.70892333984375, "logps/rejected": -357.3830261230469, "loss": 0.1402, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.4867031574249268, "rewards/margins": 9.185391426086426, "rewards/rejected": -6.698688507080078, "step": 590 }, { "epoch": 0.2, "learning_rate": 3.3975084937712344e-07, "logits/chosen": -2.5762805938720703, "logits/rejected": -2.6193690299987793, "logps/chosen": -300.1703796386719, "logps/rejected": -602.3262939453125, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": 2.2027058601379395, "rewards/margins": 9.749842643737793, "rewards/rejected": -7.547135829925537, "step": 600 }, { "epoch": 0.2, "eval_logits/chosen": -2.7255735397338867, "eval_logits/rejected": -2.6261141300201416, "eval_logps/chosen": -244.9011688232422, "eval_logps/rejected": -493.4216613769531, "eval_loss": 0.08204595744609833, "eval_rewards/accuracies": 0.9781144857406616, "eval_rewards/chosen": 1.8055354356765747, "eval_rewards/margins": 10.930169105529785, "eval_rewards/rejected": -9.1246337890625, "eval_runtime": 461.0536, "eval_samples_per_second": 20.605, "eval_steps_per_second": 0.644, "step": 600 }, { "epoch": 0.21, "learning_rate": 3.454133635334088e-07, "logits/chosen": -2.4857780933380127, "logits/rejected": -2.63435697555542, "logps/chosen": -238.43844604492188, "logps/rejected": -440.7359313964844, "loss": 0.041, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3338594436645508, "rewards/margins": 11.27547550201416, "rewards/rejected": -9.941615104675293, "step": 610 }, { "epoch": 0.21, "learning_rate": 3.510758776896942e-07, "logits/chosen": -2.561501979827881, "logits/rejected": -2.515075206756592, "logps/chosen": -320.04742431640625, "logps/rejected": -547.2848510742188, "loss": 0.0384, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6276912689208984, "rewards/margins": 11.055011749267578, "rewards/rejected": -9.42732048034668, "step": 620 }, { "epoch": 0.21, "learning_rate": 3.567383918459796e-07, "logits/chosen": -2.484651803970337, "logits/rejected": -2.6119046211242676, "logps/chosen": -280.99884033203125, "logps/rejected": -301.88592529296875, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 1.9051545858383179, "rewards/margins": 11.183754920959473, "rewards/rejected": -9.278600692749023, "step": 630 }, { "epoch": 0.22, "learning_rate": 3.62400906002265e-07, "logits/chosen": -2.543846368789673, "logits/rejected": -2.6858391761779785, "logps/chosen": -258.9605407714844, "logps/rejected": -539.4953002929688, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": 1.868211030960083, "rewards/margins": 12.584554672241211, "rewards/rejected": -10.716344833374023, "step": 640 }, { "epoch": 0.22, "learning_rate": 3.6806342015855037e-07, "logits/chosen": -2.435859203338623, "logits/rejected": -2.689331293106079, "logps/chosen": -242.1764678955078, "logps/rejected": -373.56689453125, "loss": 0.0365, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9374281167984009, "rewards/margins": 11.124711036682129, "rewards/rejected": -9.187280654907227, "step": 650 }, { "epoch": 0.22, "learning_rate": 3.737259343148358e-07, "logits/chosen": -2.6309049129486084, "logits/rejected": -2.677889823913574, "logps/chosen": -200.17002868652344, "logps/rejected": -502.3214416503906, "loss": 0.0373, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.562139868736267, "rewards/margins": 10.998193740844727, "rewards/rejected": -9.436054229736328, "step": 660 }, { "epoch": 0.23, "learning_rate": 3.7938844847112115e-07, "logits/chosen": -2.5589663982391357, "logits/rejected": -2.686365842819214, "logps/chosen": -233.09439086914062, "logps/rejected": -569.119873046875, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": 1.2771753072738647, "rewards/margins": 10.450187683105469, "rewards/rejected": -9.173011779785156, "step": 670 }, { "epoch": 0.23, "learning_rate": 3.8505096262740653e-07, "logits/chosen": -2.6148438453674316, "logits/rejected": -2.643418073654175, "logps/chosen": -212.24697875976562, "logps/rejected": -534.6656494140625, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": 1.4529036283493042, "rewards/margins": 12.015807151794434, "rewards/rejected": -10.56290340423584, "step": 680 }, { "epoch": 0.23, "learning_rate": 3.907134767836919e-07, "logits/chosen": -2.568960666656494, "logits/rejected": -2.4547691345214844, "logps/chosen": -284.35589599609375, "logps/rejected": -562.1549072265625, "loss": 0.3001, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7375848293304443, "rewards/margins": 14.848283767700195, "rewards/rejected": -13.110699653625488, "step": 690 }, { "epoch": 0.24, "learning_rate": 3.963759909399773e-07, "logits/chosen": -2.599010944366455, "logits/rejected": -2.5265021324157715, "logps/chosen": -240.9492645263672, "logps/rejected": -586.3984985351562, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 1.9875065088272095, "rewards/margins": 14.380460739135742, "rewards/rejected": -12.39295482635498, "step": 700 }, { "epoch": 0.24, "eval_logits/chosen": -2.6688716411590576, "eval_logits/rejected": -2.579874277114868, "eval_logps/chosen": -247.6768798828125, "eval_logps/rejected": -534.2635498046875, "eval_loss": 0.07031755894422531, "eval_rewards/accuracies": 0.9856902360916138, "eval_rewards/chosen": 1.52796471118927, "eval_rewards/margins": 14.736777305603027, "eval_rewards/rejected": -13.208812713623047, "eval_runtime": 460.5957, "eval_samples_per_second": 20.625, "eval_steps_per_second": 0.645, "step": 700 }, { "epoch": 0.24, "learning_rate": 4.0203850509626275e-07, "logits/chosen": -2.4941275119781494, "logits/rejected": -2.519707441329956, "logps/chosen": -243.2341766357422, "logps/rejected": -432.4749450683594, "loss": 0.015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5471111536026, "rewards/margins": 13.800142288208008, "rewards/rejected": -12.253030776977539, "step": 710 }, { "epoch": 0.24, "learning_rate": 4.0770101925254814e-07, "logits/chosen": -2.591391086578369, "logits/rejected": -2.5847771167755127, "logps/chosen": -214.2767333984375, "logps/rejected": -429.0003967285156, "loss": 0.0563, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3923547267913818, "rewards/margins": 12.078883171081543, "rewards/rejected": -10.686529159545898, "step": 720 }, { "epoch": 0.25, "learning_rate": 4.133635334088335e-07, "logits/chosen": -2.597377300262451, "logits/rejected": -2.500725030899048, "logps/chosen": -268.75262451171875, "logps/rejected": -512.81640625, "loss": 0.0202, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.706974744796753, "rewards/margins": 11.331899642944336, "rewards/rejected": -9.62492561340332, "step": 730 }, { "epoch": 0.25, "learning_rate": 4.190260475651189e-07, "logits/chosen": -2.554598569869995, "logits/rejected": -2.544752597808838, "logps/chosen": -264.7027282714844, "logps/rejected": -437.59185791015625, "loss": 0.3689, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3912254571914673, "rewards/margins": 11.881486892700195, "rewards/rejected": -10.490262031555176, "step": 740 }, { "epoch": 0.25, "learning_rate": 4.2468856172140424e-07, "logits/chosen": -2.58223295211792, "logits/rejected": -2.4992735385894775, "logps/chosen": -189.06736755371094, "logps/rejected": -541.4937744140625, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 1.8746986389160156, "rewards/margins": 13.727132797241211, "rewards/rejected": -11.852434158325195, "step": 750 }, { "epoch": 0.26, "learning_rate": 4.3035107587768963e-07, "logits/chosen": -2.4738926887512207, "logits/rejected": -2.4859681129455566, "logps/chosen": -198.36935424804688, "logps/rejected": -406.86236572265625, "loss": 0.0688, "rewards/accuracies": 1.0, "rewards/chosen": 1.4896185398101807, "rewards/margins": 12.279550552368164, "rewards/rejected": -10.789932250976562, "step": 760 }, { "epoch": 0.26, "learning_rate": 4.3601359003397507e-07, "logits/chosen": -2.4080967903137207, "logits/rejected": -2.5352911949157715, "logps/chosen": -302.553466796875, "logps/rejected": -487.6495056152344, "loss": 0.0138, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.767062783241272, "rewards/margins": 11.648015022277832, "rewards/rejected": -9.880950927734375, "step": 770 }, { "epoch": 0.27, "learning_rate": 4.4167610419026046e-07, "logits/chosen": -2.549412488937378, "logits/rejected": -2.4838802814483643, "logps/chosen": -317.14654541015625, "logps/rejected": -609.986572265625, "loss": 0.1074, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9986612200737, "rewards/margins": 13.04059886932373, "rewards/rejected": -12.041936874389648, "step": 780 }, { "epoch": 0.27, "learning_rate": 4.4733861834654585e-07, "logits/chosen": -2.564711809158325, "logits/rejected": -2.5131685733795166, "logps/chosen": -247.36691284179688, "logps/rejected": -387.3332824707031, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": 1.24368155002594, "rewards/margins": 14.270515441894531, "rewards/rejected": -13.026835441589355, "step": 790 }, { "epoch": 0.27, "learning_rate": 4.5300113250283123e-07, "logits/chosen": -2.472017765045166, "logits/rejected": -2.5164694786071777, "logps/chosen": -253.9539794921875, "logps/rejected": -462.48126220703125, "loss": 0.032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2749547958374023, "rewards/margins": 11.451417922973633, "rewards/rejected": -10.176462173461914, "step": 800 }, { "epoch": 0.27, "eval_logits/chosen": -2.66702938079834, "eval_logits/rejected": -2.576138973236084, "eval_logps/chosen": -248.36875915527344, "eval_logps/rejected": -516.1622314453125, "eval_loss": 0.05833537131547928, "eval_rewards/accuracies": 0.9890572428703308, "eval_rewards/chosen": 1.4587738513946533, "eval_rewards/margins": 12.857465744018555, "eval_rewards/rejected": -11.398690223693848, "eval_runtime": 460.7085, "eval_samples_per_second": 20.62, "eval_steps_per_second": 0.645, "step": 800 }, { "epoch": 0.28, "learning_rate": 4.586636466591166e-07, "logits/chosen": -2.4158847332000732, "logits/rejected": -2.581696033477783, "logps/chosen": -260.9993591308594, "logps/rejected": -595.8202514648438, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1538710594177246, "rewards/margins": 14.070910453796387, "rewards/rejected": -12.917040824890137, "step": 810 }, { "epoch": 0.28, "learning_rate": 4.64326160815402e-07, "logits/chosen": -2.5027642250061035, "logits/rejected": -2.5394458770751953, "logps/chosen": -336.9600524902344, "logps/rejected": -407.3341369628906, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 1.807832956314087, "rewards/margins": 14.215736389160156, "rewards/rejected": -12.407903671264648, "step": 820 }, { "epoch": 0.28, "learning_rate": 4.6998867497168745e-07, "logits/chosen": -2.5967133045196533, "logits/rejected": -2.5023131370544434, "logps/chosen": -195.04090881347656, "logps/rejected": -579.935791015625, "loss": 0.0402, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5676156878471375, "rewards/margins": 14.109151840209961, "rewards/rejected": -13.541537284851074, "step": 830 }, { "epoch": 0.29, "learning_rate": 4.756511891279728e-07, "logits/chosen": -2.522873878479004, "logits/rejected": -2.577030897140503, "logps/chosen": -277.5176086425781, "logps/rejected": -483.15325927734375, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 0.7743971943855286, "rewards/margins": 15.622072219848633, "rewards/rejected": -14.847674369812012, "step": 840 }, { "epoch": 0.29, "learning_rate": 4.813137032842582e-07, "logits/chosen": -2.4601683616638184, "logits/rejected": -2.4073076248168945, "logps/chosen": -277.31201171875, "logps/rejected": -726.3077392578125, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 0.2516477108001709, "rewards/margins": 15.151922225952148, "rewards/rejected": -14.900274276733398, "step": 850 }, { "epoch": 0.29, "learning_rate": 4.869762174405436e-07, "logits/chosen": -2.4297313690185547, "logits/rejected": -2.506155490875244, "logps/chosen": -253.1144256591797, "logps/rejected": -554.284912109375, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": 1.0400097370147705, "rewards/margins": 15.567739486694336, "rewards/rejected": -14.527729988098145, "step": 860 }, { "epoch": 0.3, "learning_rate": 4.92638731596829e-07, "logits/chosen": -2.5881617069244385, "logits/rejected": -2.4507336616516113, "logps/chosen": -327.51812744140625, "logps/rejected": -567.8660888671875, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 1.8835452795028687, "rewards/margins": 12.908963203430176, "rewards/rejected": -11.025418281555176, "step": 870 }, { "epoch": 0.3, "learning_rate": 4.983012457531144e-07, "logits/chosen": -2.5466036796569824, "logits/rejected": -2.5872304439544678, "logps/chosen": -179.5204315185547, "logps/rejected": -480.2381286621094, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 1.8061977624893188, "rewards/margins": 16.413423538208008, "rewards/rejected": -14.60722541809082, "step": 880 }, { "epoch": 0.3, "learning_rate": 4.995593604431575e-07, "logits/chosen": -2.435133218765259, "logits/rejected": -2.501832962036133, "logps/chosen": -248.4174346923828, "logps/rejected": -390.3518371582031, "loss": 0.056, "rewards/accuracies": 1.0, "rewards/chosen": 1.445499062538147, "rewards/margins": 14.839500427246094, "rewards/rejected": -13.394000053405762, "step": 890 }, { "epoch": 0.31, "learning_rate": 4.989298753619539e-07, "logits/chosen": -2.4923131465911865, "logits/rejected": -2.5103683471679688, "logps/chosen": -213.6688232421875, "logps/rejected": -492.6978454589844, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": 1.1568156480789185, "rewards/margins": 17.00368881225586, "rewards/rejected": -15.84687328338623, "step": 900 }, { "epoch": 0.31, "eval_logits/chosen": -2.6552929878234863, "eval_logits/rejected": -2.5744776725769043, "eval_logps/chosen": -254.1800537109375, "eval_logps/rejected": -559.970458984375, "eval_loss": 0.043986935168504715, "eval_rewards/accuracies": 0.9924242496490479, "eval_rewards/chosen": 0.8776453733444214, "eval_rewards/margins": 16.657155990600586, "eval_rewards/rejected": -15.779512405395508, "eval_runtime": 461.2142, "eval_samples_per_second": 20.598, "eval_steps_per_second": 0.644, "step": 900 }, { "epoch": 0.31, "learning_rate": 4.983003902807503e-07, "logits/chosen": -2.5980477333068848, "logits/rejected": -2.396554470062256, "logps/chosen": -300.1101989746094, "logps/rejected": -404.8836669921875, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 0.6338903903961182, "rewards/margins": 15.027383804321289, "rewards/rejected": -14.393491744995117, "step": 910 }, { "epoch": 0.31, "learning_rate": 4.976709051995467e-07, "logits/chosen": -2.632249593734741, "logits/rejected": -2.5720303058624268, "logps/chosen": -214.367919921875, "logps/rejected": -509.8678283691406, "loss": 0.0697, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5685518383979797, "rewards/margins": 17.672880172729492, "rewards/rejected": -17.104328155517578, "step": 920 }, { "epoch": 0.32, "learning_rate": 4.970414201183432e-07, "logits/chosen": -2.6090967655181885, "logits/rejected": -2.6884734630584717, "logps/chosen": -253.2736053466797, "logps/rejected": -594.8142700195312, "loss": 0.0299, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.013087066821753979, "rewards/margins": 18.214691162109375, "rewards/rejected": -18.2277774810791, "step": 930 }, { "epoch": 0.32, "learning_rate": 4.964119350371396e-07, "logits/chosen": -2.5714800357818604, "logits/rejected": -2.578979969024658, "logps/chosen": -215.2510986328125, "logps/rejected": -544.3941650390625, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -0.18770122528076172, "rewards/margins": 15.900426864624023, "rewards/rejected": -16.08812713623047, "step": 940 }, { "epoch": 0.32, "learning_rate": 4.95782449955936e-07, "logits/chosen": -2.672684907913208, "logits/rejected": -2.565187931060791, "logps/chosen": -251.68008422851562, "logps/rejected": -495.5555114746094, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 0.7551397681236267, "rewards/margins": 15.990818977355957, "rewards/rejected": -15.235677719116211, "step": 950 }, { "epoch": 0.33, "learning_rate": 4.951529648747325e-07, "logits/chosen": -2.5399954319000244, "logits/rejected": -2.520082473754883, "logps/chosen": -299.60333251953125, "logps/rejected": -527.4031982421875, "loss": 0.0712, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4932546615600586, "rewards/margins": 15.310656547546387, "rewards/rejected": -13.817400932312012, "step": 960 }, { "epoch": 0.33, "learning_rate": 4.945234797935289e-07, "logits/chosen": -2.686828136444092, "logits/rejected": -2.490656614303589, "logps/chosen": -187.8560028076172, "logps/rejected": -701.8441162109375, "loss": 0.0216, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5330561399459839, "rewards/margins": 19.208683013916016, "rewards/rejected": -17.675630569458008, "step": 970 }, { "epoch": 0.33, "learning_rate": 4.938939947123252e-07, "logits/chosen": -2.6566481590270996, "logits/rejected": -2.5806772708892822, "logps/chosen": -309.18402099609375, "logps/rejected": -643.8297119140625, "loss": 0.0407, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1829935312271118, "rewards/margins": 17.061037063598633, "rewards/rejected": -15.878042221069336, "step": 980 }, { "epoch": 0.34, "learning_rate": 4.932645096311217e-07, "logits/chosen": -2.618457317352295, "logits/rejected": -2.6536054611206055, "logps/chosen": -183.1858367919922, "logps/rejected": -635.5850830078125, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": 1.677351713180542, "rewards/margins": 18.316219329833984, "rewards/rejected": -16.638866424560547, "step": 990 }, { "epoch": 0.34, "learning_rate": 4.926350245499181e-07, "logits/chosen": -2.704935312271118, "logits/rejected": -2.6507019996643066, "logps/chosen": -271.97747802734375, "logps/rejected": -555.6414794921875, "loss": 0.1198, "rewards/accuracies": 1.0, "rewards/chosen": 1.1873575448989868, "rewards/margins": 23.497821807861328, "rewards/rejected": -22.310462951660156, "step": 1000 }, { "epoch": 0.34, "eval_logits/chosen": -2.7356741428375244, "eval_logits/rejected": -2.630770683288574, "eval_logps/chosen": -255.3415985107422, "eval_logps/rejected": -627.3003540039062, "eval_loss": 0.04599127918481827, "eval_rewards/accuracies": 0.9932659864425659, "eval_rewards/chosen": 0.7614928483963013, "eval_rewards/margins": 23.273990631103516, "eval_rewards/rejected": -22.512495040893555, "eval_runtime": 461.0901, "eval_samples_per_second": 20.603, "eval_steps_per_second": 0.644, "step": 1000 }, { "epoch": 0.34, "learning_rate": 4.920055394687146e-07, "logits/chosen": -2.682918071746826, "logits/rejected": -2.6014046669006348, "logps/chosen": -281.0010681152344, "logps/rejected": -471.473388671875, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": 0.7654463648796082, "rewards/margins": 15.174552917480469, "rewards/rejected": -14.409106254577637, "step": 1010 }, { "epoch": 0.35, "learning_rate": 4.91376054387511e-07, "logits/chosen": -2.7156500816345215, "logits/rejected": -2.5619990825653076, "logps/chosen": -191.9119873046875, "logps/rejected": -520.0499267578125, "loss": 0.2928, "rewards/accuracies": 1.0, "rewards/chosen": 0.3990475535392761, "rewards/margins": 15.932522773742676, "rewards/rejected": -15.533473014831543, "step": 1020 }, { "epoch": 0.35, "learning_rate": 4.907465693063074e-07, "logits/chosen": -2.6783156394958496, "logits/rejected": -2.6145482063293457, "logps/chosen": -197.7878875732422, "logps/rejected": -497.7518615722656, "loss": 0.0224, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7534411549568176, "rewards/margins": 17.54076385498047, "rewards/rejected": -16.787322998046875, "step": 1030 }, { "epoch": 0.35, "learning_rate": 4.901170842251039e-07, "logits/chosen": -2.574402332305908, "logits/rejected": -2.5649852752685547, "logps/chosen": -358.0187683105469, "logps/rejected": -674.7064208984375, "loss": 0.1089, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1317404806613922, "rewards/margins": 17.84585952758789, "rewards/rejected": -17.71411895751953, "step": 1040 }, { "epoch": 0.36, "learning_rate": 4.894875991439003e-07, "logits/chosen": -2.6966941356658936, "logits/rejected": -2.694880485534668, "logps/chosen": -340.9144592285156, "logps/rejected": -546.0291748046875, "loss": 0.0797, "rewards/accuracies": 1.0, "rewards/chosen": 0.10131274163722992, "rewards/margins": 20.4351749420166, "rewards/rejected": -20.3338623046875, "step": 1050 }, { "epoch": 0.36, "learning_rate": 4.888581140626966e-07, "logits/chosen": -2.7477335929870605, "logits/rejected": -2.7159507274627686, "logps/chosen": -277.1622314453125, "logps/rejected": -597.2868041992188, "loss": 0.0762, "rewards/accuracies": 1.0, "rewards/chosen": -0.48684263229370117, "rewards/margins": 21.359424591064453, "rewards/rejected": -21.84626579284668, "step": 1060 }, { "epoch": 0.36, "learning_rate": 4.882286289814931e-07, "logits/chosen": -2.7206077575683594, "logits/rejected": -2.630331516265869, "logps/chosen": -311.9520263671875, "logps/rejected": -433.80926513671875, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 0.7353372573852539, "rewards/margins": 12.982867240905762, "rewards/rejected": -12.247530937194824, "step": 1070 }, { "epoch": 0.37, "learning_rate": 4.875991439002896e-07, "logits/chosen": -2.7460532188415527, "logits/rejected": -2.731285810470581, "logps/chosen": -262.0158996582031, "logps/rejected": -425.81939697265625, "loss": 0.0723, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6683778166770935, "rewards/margins": 13.73039722442627, "rewards/rejected": -13.062019348144531, "step": 1080 }, { "epoch": 0.37, "learning_rate": 4.869696588190859e-07, "logits/chosen": -2.5941104888916016, "logits/rejected": -2.7093076705932617, "logps/chosen": -255.13900756835938, "logps/rejected": -361.2904968261719, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": 1.3036772012710571, "rewards/margins": 16.27226448059082, "rewards/rejected": -14.968586921691895, "step": 1090 }, { "epoch": 0.37, "learning_rate": 4.863401737378824e-07, "logits/chosen": -2.7411069869995117, "logits/rejected": -2.776799440383911, "logps/chosen": -256.292236328125, "logps/rejected": -566.8067016601562, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": 1.4670058488845825, "rewards/margins": 17.41312026977539, "rewards/rejected": -15.946111679077148, "step": 1100 }, { "epoch": 0.37, "eval_logits/chosen": -2.874436378479004, "eval_logits/rejected": -2.772772789001465, "eval_logps/chosen": -251.75767517089844, "eval_logps/rejected": -548.8187866210938, "eval_loss": 0.02930893376469612, "eval_rewards/accuracies": 0.9949495196342468, "eval_rewards/chosen": 1.1198850870132446, "eval_rewards/margins": 15.784234046936035, "eval_rewards/rejected": -14.664350509643555, "eval_runtime": 460.939, "eval_samples_per_second": 20.61, "eval_steps_per_second": 0.644, "step": 1100 }, { "epoch": 0.38, "learning_rate": 4.857106886566788e-07, "logits/chosen": -2.799093723297119, "logits/rejected": -2.7331230640411377, "logps/chosen": -181.97793579101562, "logps/rejected": -634.3790283203125, "loss": 0.053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6756753921508789, "rewards/margins": 17.25900650024414, "rewards/rejected": -16.583332061767578, "step": 1110 }, { "epoch": 0.38, "learning_rate": 4.850812035754753e-07, "logits/chosen": -2.8031764030456543, "logits/rejected": -2.7442429065704346, "logps/chosen": -211.8267364501953, "logps/rejected": -553.5987548828125, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": 0.5598838329315186, "rewards/margins": 16.774169921875, "rewards/rejected": -16.214284896850586, "step": 1120 }, { "epoch": 0.38, "learning_rate": 4.844517184942716e-07, "logits/chosen": -2.6647393703460693, "logits/rejected": -2.6610209941864014, "logps/chosen": -290.4703063964844, "logps/rejected": -401.39349365234375, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 0.01184847392141819, "rewards/margins": 16.037403106689453, "rewards/rejected": -16.025554656982422, "step": 1130 }, { "epoch": 0.39, "learning_rate": 4.838222334130681e-07, "logits/chosen": -2.7264962196350098, "logits/rejected": -2.714993476867676, "logps/chosen": -271.96337890625, "logps/rejected": -535.1102294921875, "loss": 0.0425, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4735424518585205, "rewards/margins": 20.10916519165039, "rewards/rejected": -20.582706451416016, "step": 1140 }, { "epoch": 0.39, "learning_rate": 4.831927483318645e-07, "logits/chosen": -2.7110633850097656, "logits/rejected": -2.6243748664855957, "logps/chosen": -304.00103759765625, "logps/rejected": -440.99365234375, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": -0.3342191278934479, "rewards/margins": 19.742460250854492, "rewards/rejected": -20.076679229736328, "step": 1150 }, { "epoch": 0.39, "learning_rate": 4.82563263250661e-07, "logits/chosen": -2.600151777267456, "logits/rejected": -2.7741997241973877, "logps/chosen": -329.1345520019531, "logps/rejected": -517.6507568359375, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 0.19879575073719025, "rewards/margins": 19.641027450561523, "rewards/rejected": -19.442230224609375, "step": 1160 }, { "epoch": 0.4, "learning_rate": 4.819337781694573e-07, "logits/chosen": -2.5683507919311523, "logits/rejected": -2.7539186477661133, "logps/chosen": -195.9901580810547, "logps/rejected": -545.2691650390625, "loss": 0.0343, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7748135924339294, "rewards/margins": 16.869482040405273, "rewards/rejected": -16.09467124938965, "step": 1170 }, { "epoch": 0.4, "learning_rate": 4.813042930882538e-07, "logits/chosen": -2.6182196140289307, "logits/rejected": -2.640855312347412, "logps/chosen": -242.75076293945312, "logps/rejected": -717.111083984375, "loss": 0.0227, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9914628863334656, "rewards/margins": 20.088098526000977, "rewards/rejected": -19.096635818481445, "step": 1180 }, { "epoch": 0.4, "learning_rate": 4.806748080070503e-07, "logits/chosen": -2.63869047164917, "logits/rejected": -2.6670069694519043, "logps/chosen": -269.4376525878906, "logps/rejected": -426.5965881347656, "loss": 0.039, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.399887204170227, "rewards/margins": 15.331335067749023, "rewards/rejected": -13.931447982788086, "step": 1190 }, { "epoch": 0.41, "learning_rate": 4.800453229258466e-07, "logits/chosen": -2.645261287689209, "logits/rejected": -2.6549782752990723, "logps/chosen": -328.2383728027344, "logps/rejected": -529.7254638671875, "loss": 0.0368, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1934082508087158, "rewards/margins": 18.597471237182617, "rewards/rejected": -17.404064178466797, "step": 1200 }, { "epoch": 0.41, "eval_logits/chosen": -2.7833943367004395, "eval_logits/rejected": -2.6827495098114014, "eval_logps/chosen": -247.96859741210938, "eval_logps/rejected": -589.0680541992188, "eval_loss": 0.03494969382882118, "eval_rewards/accuracies": 0.9924242496490479, "eval_rewards/chosen": 1.4987932443618774, "eval_rewards/margins": 20.18805694580078, "eval_rewards/rejected": -18.68926429748535, "eval_runtime": 460.738, "eval_samples_per_second": 20.619, "eval_steps_per_second": 0.645, "step": 1200 }, { "epoch": 0.41, "learning_rate": 4.79415837844643e-07, "logits/chosen": -2.574401378631592, "logits/rejected": -2.600925922393799, "logps/chosen": -265.2054443359375, "logps/rejected": -410.8392639160156, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": 0.7319608926773071, "rewards/margins": 19.564794540405273, "rewards/rejected": -18.832834243774414, "step": 1210 }, { "epoch": 0.41, "learning_rate": 4.787863527634395e-07, "logits/chosen": -2.506033420562744, "logits/rejected": -2.5704429149627686, "logps/chosen": -239.96054077148438, "logps/rejected": -672.642333984375, "loss": 0.022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1644793450832367, "rewards/margins": 22.234477996826172, "rewards/rejected": -22.069997787475586, "step": 1220 }, { "epoch": 0.42, "learning_rate": 4.781568676822359e-07, "logits/chosen": -2.5792417526245117, "logits/rejected": -2.6120190620422363, "logps/chosen": -189.03421020507812, "logps/rejected": -789.7057495117188, "loss": 0.0174, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3116579055786133, "rewards/margins": 20.205209732055664, "rewards/rejected": -18.893550872802734, "step": 1230 }, { "epoch": 0.42, "learning_rate": 4.775273826010323e-07, "logits/chosen": -2.4609837532043457, "logits/rejected": -2.542442798614502, "logps/chosen": -269.1730041503906, "logps/rejected": -603.2425537109375, "loss": 0.0273, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9308624267578125, "rewards/margins": 18.17339515686035, "rewards/rejected": -17.242530822753906, "step": 1240 }, { "epoch": 0.42, "learning_rate": 4.768978975198288e-07, "logits/chosen": -2.4921116828918457, "logits/rejected": -2.4880805015563965, "logps/chosen": -262.24713134765625, "logps/rejected": -761.4155883789062, "loss": 0.018, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21104009449481964, "rewards/margins": 17.740604400634766, "rewards/rejected": -17.529565811157227, "step": 1250 }, { "epoch": 0.43, "learning_rate": 4.762684124386252e-07, "logits/chosen": -2.443260669708252, "logits/rejected": -2.4482874870300293, "logps/chosen": -271.37835693359375, "logps/rejected": -726.6851806640625, "loss": 0.0865, "rewards/accuracies": 1.0, "rewards/chosen": 1.1155122518539429, "rewards/margins": 20.962247848510742, "rewards/rejected": -19.84673500061035, "step": 1260 }, { "epoch": 0.43, "learning_rate": 4.756389273574216e-07, "logits/chosen": -2.4249587059020996, "logits/rejected": -2.5270984172821045, "logps/chosen": -301.5065002441406, "logps/rejected": -624.126708984375, "loss": 0.0148, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.309164047241211, "rewards/margins": 19.4692325592041, "rewards/rejected": -18.16006851196289, "step": 1270 }, { "epoch": 0.44, "learning_rate": 4.7500944227621803e-07, "logits/chosen": -2.5309841632843018, "logits/rejected": -2.5276074409484863, "logps/chosen": -218.25552368164062, "logps/rejected": -468.033935546875, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 1.360708475112915, "rewards/margins": 19.552738189697266, "rewards/rejected": -18.19202995300293, "step": 1280 }, { "epoch": 0.44, "learning_rate": 4.7437995719501445e-07, "logits/chosen": -2.6075873374938965, "logits/rejected": -2.562488079071045, "logps/chosen": -206.3004150390625, "logps/rejected": -595.8050537109375, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": 0.7710903882980347, "rewards/margins": 19.54395866394043, "rewards/rejected": -18.77286720275879, "step": 1290 }, { "epoch": 0.44, "learning_rate": 4.737504721138109e-07, "logits/chosen": -2.4752111434936523, "logits/rejected": -2.4962985515594482, "logps/chosen": -252.0557861328125, "logps/rejected": -608.6836547851562, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 0.24169978499412537, "rewards/margins": 18.718734741210938, "rewards/rejected": -18.477031707763672, "step": 1300 }, { "epoch": 0.44, "eval_logits/chosen": -2.545536518096924, "eval_logits/rejected": -2.4356236457824707, "eval_logps/chosen": -243.78854370117188, "eval_logps/rejected": -537.1611328125, "eval_loss": 0.14055109024047852, "eval_rewards/accuracies": 0.9739057421684265, "eval_rewards/chosen": 1.9167977571487427, "eval_rewards/margins": 15.415374755859375, "eval_rewards/rejected": -13.498576164245605, "eval_runtime": 460.7251, "eval_samples_per_second": 20.62, "eval_steps_per_second": 0.645, "step": 1300 }, { "epoch": 0.45, "learning_rate": 4.7312098703260735e-07, "logits/chosen": -2.4747557640075684, "logits/rejected": -2.474287509918213, "logps/chosen": -194.56134033203125, "logps/rejected": -481.0470275878906, "loss": 0.1716, "rewards/accuracies": 1.0, "rewards/chosen": 1.2244038581848145, "rewards/margins": 17.13334083557129, "rewards/rejected": -15.908937454223633, "step": 1310 }, { "epoch": 0.45, "learning_rate": 4.724915019514038e-07, "logits/chosen": -2.484149694442749, "logits/rejected": -2.5438666343688965, "logps/chosen": -247.08676147460938, "logps/rejected": -675.510009765625, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": 1.2810490131378174, "rewards/margins": 19.48736572265625, "rewards/rejected": -18.206314086914062, "step": 1320 }, { "epoch": 0.45, "learning_rate": 4.7186201687020014e-07, "logits/chosen": -2.4696590900421143, "logits/rejected": -2.599529266357422, "logps/chosen": -242.32260131835938, "logps/rejected": -474.51910400390625, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": 1.1106736660003662, "rewards/margins": 21.690330505371094, "rewards/rejected": -20.57965660095215, "step": 1330 }, { "epoch": 0.46, "learning_rate": 4.7123253178899657e-07, "logits/chosen": -2.592541217803955, "logits/rejected": -2.549347400665283, "logps/chosen": -192.3375244140625, "logps/rejected": -531.9471435546875, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 1.2463042736053467, "rewards/margins": 19.711788177490234, "rewards/rejected": -18.465482711791992, "step": 1340 }, { "epoch": 0.46, "learning_rate": 4.70603046707793e-07, "logits/chosen": -2.477987051010132, "logits/rejected": -2.43994140625, "logps/chosen": -194.67831420898438, "logps/rejected": -693.862548828125, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 1.6343986988067627, "rewards/margins": 22.992265701293945, "rewards/rejected": -21.357868194580078, "step": 1350 }, { "epoch": 0.46, "learning_rate": 4.699735616265894e-07, "logits/chosen": -2.5695838928222656, "logits/rejected": -2.477034091949463, "logps/chosen": -250.9779052734375, "logps/rejected": -523.9495239257812, "loss": 0.0765, "rewards/accuracies": 1.0, "rewards/chosen": 2.178835391998291, "rewards/margins": 19.011266708374023, "rewards/rejected": -16.83243179321289, "step": 1360 }, { "epoch": 0.47, "learning_rate": 4.693440765453859e-07, "logits/chosen": -2.400463581085205, "logits/rejected": -2.3830127716064453, "logps/chosen": -213.3478240966797, "logps/rejected": -754.5392456054688, "loss": 0.0596, "rewards/accuracies": 1.0, "rewards/chosen": 0.9365903735160828, "rewards/margins": 19.74733543395996, "rewards/rejected": -18.810747146606445, "step": 1370 }, { "epoch": 0.47, "learning_rate": 4.687145914641823e-07, "logits/chosen": -2.4580488204956055, "logits/rejected": -2.551628828048706, "logps/chosen": -217.7655029296875, "logps/rejected": -591.6602783203125, "loss": 0.3317, "rewards/accuracies": 1.0, "rewards/chosen": 1.477297067642212, "rewards/margins": 16.16473960876465, "rewards/rejected": -14.687443733215332, "step": 1380 }, { "epoch": 0.47, "learning_rate": 4.6808510638297873e-07, "logits/chosen": -2.4831013679504395, "logits/rejected": -2.4327542781829834, "logps/chosen": -222.42843627929688, "logps/rejected": -597.3783569335938, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 1.0689274072647095, "rewards/margins": 16.919414520263672, "rewards/rejected": -15.850488662719727, "step": 1390 }, { "epoch": 0.48, "learning_rate": 4.674556213017751e-07, "logits/chosen": -2.3969273567199707, "logits/rejected": -2.527820587158203, "logps/chosen": -189.3216552734375, "logps/rejected": -546.3126220703125, "loss": 0.0302, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.709881067276001, "rewards/margins": 19.18336296081543, "rewards/rejected": -18.47348403930664, "step": 1400 }, { "epoch": 0.48, "eval_logits/chosen": -2.579900026321411, "eval_logits/rejected": -2.504068613052368, "eval_logps/chosen": -259.4060974121094, "eval_logps/rejected": -605.8153076171875, "eval_loss": 0.01969875581562519, "eval_rewards/accuracies": 0.994107723236084, "eval_rewards/chosen": 0.3550424873828888, "eval_rewards/margins": 20.719039916992188, "eval_rewards/rejected": -20.363998413085938, "eval_runtime": 460.6931, "eval_samples_per_second": 20.621, "eval_steps_per_second": 0.645, "step": 1400 }, { "epoch": 0.48, "learning_rate": 4.668261362205715e-07, "logits/chosen": -2.42549204826355, "logits/rejected": -2.522770404815674, "logps/chosen": -280.95050048828125, "logps/rejected": -638.1356201171875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 0.6429892182350159, "rewards/margins": 22.743820190429688, "rewards/rejected": -22.100830078125, "step": 1410 }, { "epoch": 0.48, "learning_rate": 4.6619665113936795e-07, "logits/chosen": -2.3744378089904785, "logits/rejected": -2.5064785480499268, "logps/chosen": -265.2015380859375, "logps/rejected": -633.029296875, "loss": 0.0109, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4135085642337799, "rewards/margins": 19.658388137817383, "rewards/rejected": -19.244876861572266, "step": 1420 }, { "epoch": 0.49, "learning_rate": 4.6556716605816437e-07, "logits/chosen": -2.429365396499634, "logits/rejected": -2.40626859664917, "logps/chosen": -262.5450439453125, "logps/rejected": -581.1719360351562, "loss": 0.1005, "rewards/accuracies": 1.0, "rewards/chosen": 0.45702022314071655, "rewards/margins": 18.911334991455078, "rewards/rejected": -18.454315185546875, "step": 1430 }, { "epoch": 0.49, "learning_rate": 4.6493768097696085e-07, "logits/chosen": -2.3064112663269043, "logits/rejected": -2.3991479873657227, "logps/chosen": -342.5967712402344, "logps/rejected": -486.600341796875, "loss": 0.103, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.10810986906290054, "rewards/margins": 17.051197052001953, "rewards/rejected": -17.1593074798584, "step": 1440 }, { "epoch": 0.49, "learning_rate": 4.6430819589575727e-07, "logits/chosen": -2.442412853240967, "logits/rejected": -2.350637912750244, "logps/chosen": -210.9844207763672, "logps/rejected": -426.71441650390625, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 0.36219099164009094, "rewards/margins": 16.618221282958984, "rewards/rejected": -16.256031036376953, "step": 1450 }, { "epoch": 0.5, "learning_rate": 4.636787108145537e-07, "logits/chosen": -2.29394268989563, "logits/rejected": -2.3459458351135254, "logps/chosen": -318.5940856933594, "logps/rejected": -585.8348388671875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.6295406222343445, "rewards/margins": 15.367016792297363, "rewards/rejected": -14.73747730255127, "step": 1460 }, { "epoch": 0.5, "learning_rate": 4.630492257333501e-07, "logits/chosen": -2.3209285736083984, "logits/rejected": -2.3137335777282715, "logps/chosen": -193.85511779785156, "logps/rejected": -473.21600341796875, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 0.3525432348251343, "rewards/margins": 15.626386642456055, "rewards/rejected": -15.273844718933105, "step": 1470 }, { "epoch": 0.5, "learning_rate": 4.624197406521465e-07, "logits/chosen": -2.224771022796631, "logits/rejected": -2.3555684089660645, "logps/chosen": -194.3691864013672, "logps/rejected": -542.8577880859375, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 0.4460337162017822, "rewards/margins": 17.739830017089844, "rewards/rejected": -17.29379653930664, "step": 1480 }, { "epoch": 0.51, "learning_rate": 4.617902555709429e-07, "logits/chosen": -2.3657093048095703, "logits/rejected": -2.36057448387146, "logps/chosen": -245.39187622070312, "logps/rejected": -385.16619873046875, "loss": 0.1642, "rewards/accuracies": 1.0, "rewards/chosen": 0.5870780944824219, "rewards/margins": 18.157852172851562, "rewards/rejected": -17.57077407836914, "step": 1490 }, { "epoch": 0.51, "learning_rate": 4.611607704897394e-07, "logits/chosen": -2.3583731651306152, "logits/rejected": -2.344252109527588, "logps/chosen": -310.5423583984375, "logps/rejected": -569.6289672851562, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 1.553929328918457, "rewards/margins": 19.220272064208984, "rewards/rejected": -17.666343688964844, "step": 1500 }, { "epoch": 0.51, "eval_logits/chosen": -2.503629207611084, "eval_logits/rejected": -2.438016176223755, "eval_logps/chosen": -252.3780975341797, "eval_logps/rejected": -574.3317260742188, "eval_loss": 0.023061566054821014, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 1.0578418970108032, "eval_rewards/margins": 18.273483276367188, "eval_rewards/rejected": -17.215639114379883, "eval_runtime": 461.4804, "eval_samples_per_second": 20.586, "eval_steps_per_second": 0.644, "step": 1500 }, { "epoch": 0.51, "learning_rate": 4.605312854085358e-07, "logits/chosen": -2.4512362480163574, "logits/rejected": -2.373633861541748, "logps/chosen": -288.33392333984375, "logps/rejected": -488.0221252441406, "loss": 0.0235, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0387637615203857, "rewards/margins": 17.140483856201172, "rewards/rejected": -16.101722717285156, "step": 1510 }, { "epoch": 0.52, "learning_rate": 4.5990180032733223e-07, "logits/chosen": -2.291243076324463, "logits/rejected": -2.389097213745117, "logps/chosen": -261.04119873046875, "logps/rejected": -619.838623046875, "loss": 0.0583, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0017726421356201, "rewards/margins": 16.769121170043945, "rewards/rejected": -15.767349243164062, "step": 1520 }, { "epoch": 0.52, "learning_rate": 4.5927231524612865e-07, "logits/chosen": -2.271888017654419, "logits/rejected": -2.3553082942962646, "logps/chosen": -242.3424072265625, "logps/rejected": -456.53973388671875, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 0.37248215079307556, "rewards/margins": 18.42282485961914, "rewards/rejected": -18.050342559814453, "step": 1530 }, { "epoch": 0.52, "learning_rate": 4.586428301649251e-07, "logits/chosen": -2.427642345428467, "logits/rejected": -2.215817928314209, "logps/chosen": -208.618896484375, "logps/rejected": -667.5950927734375, "loss": 0.012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7300782203674316, "rewards/margins": 14.973955154418945, "rewards/rejected": -14.243875503540039, "step": 1540 }, { "epoch": 0.53, "learning_rate": 4.5801334508372145e-07, "logits/chosen": -2.3841958045959473, "logits/rejected": -2.4070792198181152, "logps/chosen": -277.1880187988281, "logps/rejected": -488.7342224121094, "loss": 0.0132, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.26346153020858765, "rewards/margins": 16.708303451538086, "rewards/rejected": -16.971763610839844, "step": 1550 }, { "epoch": 0.53, "learning_rate": 4.573838600025179e-07, "logits/chosen": -2.3082339763641357, "logits/rejected": -2.3923990726470947, "logps/chosen": -344.37493896484375, "logps/rejected": -655.8798828125, "loss": 0.0937, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17925870418548584, "rewards/margins": 17.674516677856445, "rewards/rejected": -17.495258331298828, "step": 1560 }, { "epoch": 0.53, "learning_rate": 4.5675437492131434e-07, "logits/chosen": -2.396357774734497, "logits/rejected": -2.4108948707580566, "logps/chosen": -244.89590454101562, "logps/rejected": -604.6080932617188, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 0.48009276390075684, "rewards/margins": 18.81039047241211, "rewards/rejected": -18.330299377441406, "step": 1570 }, { "epoch": 0.54, "learning_rate": 4.5612488984011077e-07, "logits/chosen": -2.4008219242095947, "logits/rejected": -2.481797933578491, "logps/chosen": -262.0800476074219, "logps/rejected": -519.4193115234375, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 0.2823297381401062, "rewards/margins": 17.034990310668945, "rewards/rejected": -16.75265884399414, "step": 1580 }, { "epoch": 0.54, "learning_rate": 4.554954047589072e-07, "logits/chosen": -2.5180606842041016, "logits/rejected": -2.490978717803955, "logps/chosen": -265.3757629394531, "logps/rejected": -521.0712890625, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 0.03387078642845154, "rewards/margins": 21.005691528320312, "rewards/rejected": -20.971820831298828, "step": 1590 }, { "epoch": 0.54, "learning_rate": 4.548659196777036e-07, "logits/chosen": -2.354222297668457, "logits/rejected": -2.5078320503234863, "logps/chosen": -375.18475341796875, "logps/rejected": -509.63934326171875, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 1.8320825099945068, "rewards/margins": 20.013713836669922, "rewards/rejected": -18.181631088256836, "step": 1600 }, { "epoch": 0.54, "eval_logits/chosen": -2.6017355918884277, "eval_logits/rejected": -2.504822254180908, "eval_logps/chosen": -255.21795654296875, "eval_logps/rejected": -598.3050537109375, "eval_loss": 0.026692749932408333, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.7738557457923889, "eval_rewards/margins": 20.386831283569336, "eval_rewards/rejected": -19.61297607421875, "eval_runtime": 461.3879, "eval_samples_per_second": 20.59, "eval_steps_per_second": 0.644, "step": 1600 }, { "epoch": 0.55, "learning_rate": 4.5423643459650003e-07, "logits/chosen": -2.51739501953125, "logits/rejected": -2.4319069385528564, "logps/chosen": -282.01947021484375, "logps/rejected": -745.6038818359375, "loss": 0.0655, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8781415820121765, "rewards/margins": 24.53594970703125, "rewards/rejected": -25.414094924926758, "step": 1610 }, { "epoch": 0.55, "learning_rate": 4.536069495152965e-07, "logits/chosen": -2.505794048309326, "logits/rejected": -2.485548734664917, "logps/chosen": -267.39630126953125, "logps/rejected": -709.6775512695312, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -0.9517226219177246, "rewards/margins": 24.81125259399414, "rewards/rejected": -25.76297378540039, "step": 1620 }, { "epoch": 0.55, "learning_rate": 4.529774644340929e-07, "logits/chosen": -2.509683847427368, "logits/rejected": -2.469111919403076, "logps/chosen": -282.9756774902344, "logps/rejected": -528.6598510742188, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": -0.47601503133773804, "rewards/margins": 19.310686111450195, "rewards/rejected": -19.786701202392578, "step": 1630 }, { "epoch": 0.56, "learning_rate": 4.523479793528893e-07, "logits/chosen": -2.2696995735168457, "logits/rejected": -2.41072940826416, "logps/chosen": -265.56414794921875, "logps/rejected": -591.43359375, "loss": 0.007, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6274104714393616, "rewards/margins": 23.395343780517578, "rewards/rejected": -24.02275276184082, "step": 1640 }, { "epoch": 0.56, "learning_rate": 4.517184942716857e-07, "logits/chosen": -2.422550678253174, "logits/rejected": -2.403376579284668, "logps/chosen": -327.978515625, "logps/rejected": -508.09796142578125, "loss": 0.0059, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4412936568260193, "rewards/margins": 21.33486557006836, "rewards/rejected": -21.776159286499023, "step": 1650 }, { "epoch": 0.56, "learning_rate": 4.5108900919048215e-07, "logits/chosen": -2.4987094402313232, "logits/rejected": -2.4563193321228027, "logps/chosen": -282.0686340332031, "logps/rejected": -588.2311401367188, "loss": 0.0396, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7210484147071838, "rewards/margins": 22.610971450805664, "rewards/rejected": -23.332019805908203, "step": 1660 }, { "epoch": 0.57, "learning_rate": 4.5045952410927857e-07, "logits/chosen": -2.5488975048065186, "logits/rejected": -2.4654548168182373, "logps/chosen": -253.63681030273438, "logps/rejected": -621.9171142578125, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": -0.5195436477661133, "rewards/margins": 23.864168167114258, "rewards/rejected": -24.383708953857422, "step": 1670 }, { "epoch": 0.57, "learning_rate": 4.4983003902807505e-07, "logits/chosen": -2.6175780296325684, "logits/rejected": -2.576857089996338, "logps/chosen": -357.30780029296875, "logps/rejected": -857.6213989257812, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": -0.8925703167915344, "rewards/margins": 29.405181884765625, "rewards/rejected": -30.29775047302246, "step": 1680 }, { "epoch": 0.57, "learning_rate": 4.4920055394687147e-07, "logits/chosen": -2.6261651515960693, "logits/rejected": -2.6439290046691895, "logps/chosen": -229.7930450439453, "logps/rejected": -555.5425415039062, "loss": 0.0056, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.382305383682251, "rewards/margins": 22.08050537109375, "rewards/rejected": -23.462812423706055, "step": 1690 }, { "epoch": 0.58, "learning_rate": 4.485710688656679e-07, "logits/chosen": -2.740164279937744, "logits/rejected": -2.4939517974853516, "logps/chosen": -243.57815551757812, "logps/rejected": -657.9866943359375, "loss": 0.0142, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2959176301956177, "rewards/margins": 28.68851089477539, "rewards/rejected": -29.984426498413086, "step": 1700 }, { "epoch": 0.58, "eval_logits/chosen": -2.7769687175750732, "eval_logits/rejected": -2.6206769943237305, "eval_logps/chosen": -270.02783203125, "eval_logps/rejected": -654.0657348632812, "eval_loss": 0.04305613413453102, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": -0.7071316242218018, "eval_rewards/margins": 24.48191261291504, "eval_rewards/rejected": -25.189043045043945, "eval_runtime": 461.4336, "eval_samples_per_second": 20.588, "eval_steps_per_second": 0.644, "step": 1700 }, { "epoch": 0.58, "learning_rate": 4.4794158378446426e-07, "logits/chosen": -2.628582000732422, "logits/rejected": -2.5942301750183105, "logps/chosen": -332.8672180175781, "logps/rejected": -772.806640625, "loss": 0.022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.593887448310852, "rewards/margins": 21.103981018066406, "rewards/rejected": -21.697866439819336, "step": 1710 }, { "epoch": 0.58, "learning_rate": 4.473120987032607e-07, "logits/chosen": -2.6535096168518066, "logits/rejected": -2.546172618865967, "logps/chosen": -259.38238525390625, "logps/rejected": -974.2223510742188, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -0.5611306428909302, "rewards/margins": 25.632862091064453, "rewards/rejected": -26.19399070739746, "step": 1720 }, { "epoch": 0.59, "learning_rate": 4.466826136220571e-07, "logits/chosen": -2.7260518074035645, "logits/rejected": -2.605917453765869, "logps/chosen": -221.68710327148438, "logps/rejected": -524.6183471679688, "loss": 0.0122, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.01746426895260811, "rewards/margins": 21.495370864868164, "rewards/rejected": -21.4779052734375, "step": 1730 }, { "epoch": 0.59, "learning_rate": 4.460531285408536e-07, "logits/chosen": -2.590210437774658, "logits/rejected": -2.6012563705444336, "logps/chosen": -379.8719787597656, "logps/rejected": -492.4993591308594, "loss": 0.0143, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.09490986168384552, "rewards/margins": 16.257863998413086, "rewards/rejected": -16.16295623779297, "step": 1740 }, { "epoch": 0.59, "learning_rate": 4.4542364345965e-07, "logits/chosen": -2.618507146835327, "logits/rejected": -2.568246364593506, "logps/chosen": -300.7378845214844, "logps/rejected": -530.2584838867188, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": 0.6757928729057312, "rewards/margins": 24.786624908447266, "rewards/rejected": -24.11083221435547, "step": 1750 }, { "epoch": 0.6, "learning_rate": 4.4479415837844643e-07, "logits/chosen": -2.5701630115509033, "logits/rejected": -2.6522536277770996, "logps/chosen": -313.9539489746094, "logps/rejected": -555.4135131835938, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -0.029824286699295044, "rewards/margins": 21.30846405029297, "rewards/rejected": -21.33829116821289, "step": 1760 }, { "epoch": 0.6, "learning_rate": 4.4416467329724285e-07, "logits/chosen": -2.6153361797332764, "logits/rejected": -2.5073320865631104, "logps/chosen": -210.7228546142578, "logps/rejected": -774.5325927734375, "loss": 0.0127, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3486413359642029, "rewards/margins": 25.55997085571289, "rewards/rejected": -25.908611297607422, "step": 1770 }, { "epoch": 0.61, "learning_rate": 4.435351882160392e-07, "logits/chosen": -2.6076109409332275, "logits/rejected": -2.596550703048706, "logps/chosen": -276.94622802734375, "logps/rejected": -519.9389038085938, "loss": 0.0286, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5446842908859253, "rewards/margins": 19.16988754272461, "rewards/rejected": -19.714574813842773, "step": 1780 }, { "epoch": 0.61, "learning_rate": 4.4290570313483564e-07, "logits/chosen": -2.6758131980895996, "logits/rejected": -2.5980515480041504, "logps/chosen": -379.3860778808594, "logps/rejected": -530.65771484375, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 0.1897500455379486, "rewards/margins": 20.282817840576172, "rewards/rejected": -20.093067169189453, "step": 1790 }, { "epoch": 0.61, "learning_rate": 4.422762180536321e-07, "logits/chosen": -2.82654070854187, "logits/rejected": -2.559769868850708, "logps/chosen": -226.9258270263672, "logps/rejected": -679.956298828125, "loss": 0.0367, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3998996913433075, "rewards/margins": 20.316606521606445, "rewards/rejected": -20.716506958007812, "step": 1800 }, { "epoch": 0.61, "eval_logits/chosen": -2.8159258365631104, "eval_logits/rejected": -2.6528401374816895, "eval_logps/chosen": -257.4970397949219, "eval_logps/rejected": -600.9735717773438, "eval_loss": 0.024157235398888588, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.5459464192390442, "eval_rewards/margins": 20.42576789855957, "eval_rewards/rejected": -19.879823684692383, "eval_runtime": 461.1072, "eval_samples_per_second": 20.603, "eval_steps_per_second": 0.644, "step": 1800 }, { "epoch": 0.62, "learning_rate": 4.4164673297242854e-07, "logits/chosen": -2.7546584606170654, "logits/rejected": -2.6239612102508545, "logps/chosen": -316.71173095703125, "logps/rejected": -646.9561767578125, "loss": 0.035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.12387055158615112, "rewards/margins": 20.18880271911621, "rewards/rejected": -20.064929962158203, "step": 1810 }, { "epoch": 0.62, "learning_rate": 4.4101724789122497e-07, "logits/chosen": -2.639857769012451, "logits/rejected": -2.5339207649230957, "logps/chosen": -266.84356689453125, "logps/rejected": -795.7584838867188, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.23976056277751923, "rewards/margins": 17.6815185546875, "rewards/rejected": -17.441757202148438, "step": 1820 }, { "epoch": 0.62, "learning_rate": 4.403877628100214e-07, "logits/chosen": -2.7504031658172607, "logits/rejected": -2.6458921432495117, "logps/chosen": -269.15667724609375, "logps/rejected": -557.2806396484375, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": 0.1446327418088913, "rewards/margins": 18.156879425048828, "rewards/rejected": -18.01224708557129, "step": 1830 }, { "epoch": 0.63, "learning_rate": 4.397582777288178e-07, "logits/chosen": -2.8531742095947266, "logits/rejected": -2.750310182571411, "logps/chosen": -330.1998291015625, "logps/rejected": -721.9674682617188, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 0.4936835765838623, "rewards/margins": 24.61349868774414, "rewards/rejected": -24.119813919067383, "step": 1840 }, { "epoch": 0.63, "learning_rate": 4.3912879264761423e-07, "logits/chosen": -2.8233895301818848, "logits/rejected": -2.692167282104492, "logps/chosen": -269.4231872558594, "logps/rejected": -599.0186157226562, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.3955417573451996, "rewards/margins": 20.58736801147461, "rewards/rejected": -20.98291015625, "step": 1850 }, { "epoch": 0.63, "learning_rate": 4.3849930756641066e-07, "logits/chosen": -2.824463129043579, "logits/rejected": -2.6709704399108887, "logps/chosen": -263.7760314941406, "logps/rejected": -700.943603515625, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.42096608877182007, "rewards/margins": 21.991708755493164, "rewards/rejected": -22.412675857543945, "step": 1860 }, { "epoch": 0.64, "learning_rate": 4.378698224852071e-07, "logits/chosen": -2.8075661659240723, "logits/rejected": -2.7800800800323486, "logps/chosen": -260.26885986328125, "logps/rejected": -573.9409790039062, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -0.05597181245684624, "rewards/margins": 24.10391616821289, "rewards/rejected": -24.159889221191406, "step": 1870 }, { "epoch": 0.64, "learning_rate": 4.372403374040035e-07, "logits/chosen": -2.943547010421753, "logits/rejected": -2.737217426300049, "logps/chosen": -210.87890625, "logps/rejected": -619.54541015625, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -0.23752979934215546, "rewards/margins": 23.68573760986328, "rewards/rejected": -23.923267364501953, "step": 1880 }, { "epoch": 0.64, "learning_rate": 4.366108523227999e-07, "logits/chosen": -2.8510565757751465, "logits/rejected": -2.7894399166107178, "logps/chosen": -284.8015441894531, "logps/rejected": -588.3734741210938, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 0.09627864509820938, "rewards/margins": 22.35280418395996, "rewards/rejected": -22.256526947021484, "step": 1890 }, { "epoch": 0.65, "learning_rate": 4.3598136724159635e-07, "logits/chosen": -2.849848985671997, "logits/rejected": -2.7310214042663574, "logps/chosen": -323.48492431640625, "logps/rejected": -628.1383056640625, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2238646745681763, "rewards/margins": 25.464157104492188, "rewards/rejected": -24.24029541015625, "step": 1900 }, { "epoch": 0.65, "eval_logits/chosen": -2.9450511932373047, "eval_logits/rejected": -2.7699177265167236, "eval_logps/chosen": -260.08355712890625, "eval_logps/rejected": -615.8120727539062, "eval_loss": 0.017028242349624634, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 0.28729960322380066, "eval_rewards/margins": 21.65097427368164, "eval_rewards/rejected": -21.363676071166992, "eval_runtime": 459.9306, "eval_samples_per_second": 20.655, "eval_steps_per_second": 0.646, "step": 1900 }, { "epoch": 0.65, "learning_rate": 4.3535188216039277e-07, "logits/chosen": -2.8217244148254395, "logits/rejected": -2.673391342163086, "logps/chosen": -242.6709442138672, "logps/rejected": -614.2350463867188, "loss": 0.0125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5617807507514954, "rewards/margins": 21.259593963623047, "rewards/rejected": -20.69780921936035, "step": 1910 }, { "epoch": 0.65, "learning_rate": 4.3472239707918925e-07, "logits/chosen": -3.033750295639038, "logits/rejected": -2.7772135734558105, "logps/chosen": -193.59796142578125, "logps/rejected": -453.7601013183594, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.6605241894721985, "rewards/margins": 20.11834144592285, "rewards/rejected": -19.45781898498535, "step": 1920 }, { "epoch": 0.66, "learning_rate": 4.3409291199798567e-07, "logits/chosen": -2.8472089767456055, "logits/rejected": -2.7734687328338623, "logps/chosen": -223.3201141357422, "logps/rejected": -697.2084350585938, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -0.17774823307991028, "rewards/margins": 18.392614364624023, "rewards/rejected": -18.570362091064453, "step": 1930 }, { "epoch": 0.66, "learning_rate": 4.3346342691678204e-07, "logits/chosen": -2.923046827316284, "logits/rejected": -2.7776436805725098, "logps/chosen": -223.40493774414062, "logps/rejected": -749.6063232421875, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -0.716773271560669, "rewards/margins": 19.020183563232422, "rewards/rejected": -19.736955642700195, "step": 1940 }, { "epoch": 0.66, "learning_rate": 4.3283394183557846e-07, "logits/chosen": -2.8243541717529297, "logits/rejected": -2.6579082012176514, "logps/chosen": -382.27899169921875, "logps/rejected": -513.5050048828125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 1.9082963466644287, "rewards/margins": 18.568363189697266, "rewards/rejected": -16.66006851196289, "step": 1950 }, { "epoch": 0.67, "learning_rate": 4.322044567543749e-07, "logits/chosen": -2.8641061782836914, "logits/rejected": -2.8086347579956055, "logps/chosen": -265.87811279296875, "logps/rejected": -520.3047485351562, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": 0.3638235032558441, "rewards/margins": 18.899547576904297, "rewards/rejected": -18.535724639892578, "step": 1960 }, { "epoch": 0.67, "learning_rate": 4.315749716731713e-07, "logits/chosen": -2.747565507888794, "logits/rejected": -2.6334927082061768, "logps/chosen": -275.7919006347656, "logps/rejected": -725.7633666992188, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": 0.595618724822998, "rewards/margins": 17.64072608947754, "rewards/rejected": -17.04510498046875, "step": 1970 }, { "epoch": 0.67, "learning_rate": 4.309454865919678e-07, "logits/chosen": -2.786933422088623, "logits/rejected": -2.669673442840576, "logps/chosen": -269.32427978515625, "logps/rejected": -494.7167053222656, "loss": 0.0526, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8304384350776672, "rewards/margins": 15.228666305541992, "rewards/rejected": -14.398228645324707, "step": 1980 }, { "epoch": 0.68, "learning_rate": 4.303160015107642e-07, "logits/chosen": -2.6149165630340576, "logits/rejected": -2.6713716983795166, "logps/chosen": -300.479248046875, "logps/rejected": -393.7492370605469, "loss": 0.0157, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.248667597770691, "rewards/margins": 15.361491203308105, "rewards/rejected": -14.112823486328125, "step": 1990 }, { "epoch": 0.68, "learning_rate": 4.2968651642956063e-07, "logits/chosen": -2.693213939666748, "logits/rejected": -2.607062816619873, "logps/chosen": -252.6227569580078, "logps/rejected": -493.85693359375, "loss": 0.0279, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.366830587387085, "rewards/margins": 18.97134780883789, "rewards/rejected": -17.60451889038086, "step": 2000 }, { "epoch": 0.68, "eval_logits/chosen": -2.823742151260376, "eval_logits/rejected": -2.6515955924987793, "eval_logps/chosen": -249.52691650390625, "eval_logps/rejected": -578.0550537109375, "eval_loss": 0.023769309744238853, "eval_rewards/accuracies": 0.994107723236084, "eval_rewards/chosen": 1.3429608345031738, "eval_rewards/margins": 18.93093490600586, "eval_rewards/rejected": -17.587974548339844, "eval_runtime": 459.9543, "eval_samples_per_second": 20.654, "eval_steps_per_second": 0.646, "step": 2000 }, { "epoch": 0.68, "learning_rate": 4.29057031348357e-07, "logits/chosen": -2.7388675212860107, "logits/rejected": -2.7445366382598877, "logps/chosen": -193.27902221679688, "logps/rejected": -570.7527465820312, "loss": 0.0059, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1801769733428955, "rewards/margins": 19.025001525878906, "rewards/rejected": -17.844825744628906, "step": 2010 }, { "epoch": 0.69, "learning_rate": 4.284275462671534e-07, "logits/chosen": -2.7448132038116455, "logits/rejected": -2.699333667755127, "logps/chosen": -201.09164428710938, "logps/rejected": -648.4170532226562, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": 1.3039531707763672, "rewards/margins": 20.788999557495117, "rewards/rejected": -19.48504638671875, "step": 2020 }, { "epoch": 0.69, "learning_rate": 4.2779806118594984e-07, "logits/chosen": -2.664590358734131, "logits/rejected": -2.6213698387145996, "logps/chosen": -301.23651123046875, "logps/rejected": -455.62725830078125, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 1.5179203748703003, "rewards/margins": 16.783260345458984, "rewards/rejected": -15.265339851379395, "step": 2030 }, { "epoch": 0.69, "learning_rate": 4.271685761047463e-07, "logits/chosen": -2.8566718101501465, "logits/rejected": -2.663689374923706, "logps/chosen": -200.2873077392578, "logps/rejected": -553.1908569335938, "loss": 0.0269, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8979865312576294, "rewards/margins": 21.14276695251465, "rewards/rejected": -20.244779586791992, "step": 2040 }, { "epoch": 0.7, "learning_rate": 4.2653909102354274e-07, "logits/chosen": -2.6827783584594727, "logits/rejected": -2.67077898979187, "logps/chosen": -268.46685791015625, "logps/rejected": -529.1029052734375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 0.4461685121059418, "rewards/margins": 19.907438278198242, "rewards/rejected": -19.461271286010742, "step": 2050 }, { "epoch": 0.7, "learning_rate": 4.2590960594233917e-07, "logits/chosen": -2.7148594856262207, "logits/rejected": -2.6811089515686035, "logps/chosen": -273.75152587890625, "logps/rejected": -584.2434692382812, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.6500552296638489, "rewards/margins": 25.41765594482422, "rewards/rejected": -24.767602920532227, "step": 2060 }, { "epoch": 0.7, "learning_rate": 4.252801208611356e-07, "logits/chosen": -2.803816556930542, "logits/rejected": -2.683807849884033, "logps/chosen": -251.1762237548828, "logps/rejected": -545.9790649414062, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 1.3244825601577759, "rewards/margins": 21.335323333740234, "rewards/rejected": -20.010841369628906, "step": 2070 }, { "epoch": 0.71, "learning_rate": 4.24650635779932e-07, "logits/chosen": -2.7602272033691406, "logits/rejected": -2.6705124378204346, "logps/chosen": -203.59274291992188, "logps/rejected": -630.4057006835938, "loss": 0.0917, "rewards/accuracies": 1.0, "rewards/chosen": -0.34889259934425354, "rewards/margins": 36.53969192504883, "rewards/rejected": -36.88858413696289, "step": 2080 }, { "epoch": 0.71, "learning_rate": 4.240211506987284e-07, "logits/chosen": -2.701641321182251, "logits/rejected": -2.5837619304656982, "logps/chosen": -292.7440185546875, "logps/rejected": -652.776611328125, "loss": 0.0384, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2623348832130432, "rewards/margins": 27.61543846130371, "rewards/rejected": -27.353103637695312, "step": 2090 }, { "epoch": 0.71, "learning_rate": 4.233916656175248e-07, "logits/chosen": -2.6581640243530273, "logits/rejected": -2.579444408416748, "logps/chosen": -318.468994140625, "logps/rejected": -852.88916015625, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.29787755012512207, "rewards/margins": 26.573780059814453, "rewards/rejected": -26.871658325195312, "step": 2100 }, { "epoch": 0.71, "eval_logits/chosen": -2.8199241161346436, "eval_logits/rejected": -2.6269030570983887, "eval_logps/chosen": -264.0626525878906, "eval_logps/rejected": -675.0391235351562, "eval_loss": 0.019873423501849174, "eval_rewards/accuracies": 0.994107723236084, "eval_rewards/chosen": -0.11061399430036545, "eval_rewards/margins": 27.175764083862305, "eval_rewards/rejected": -27.286378860473633, "eval_runtime": 461.6331, "eval_samples_per_second": 20.579, "eval_steps_per_second": 0.643, "step": 2100 }, { "epoch": 0.72, "learning_rate": 4.227621805363213e-07, "logits/chosen": -2.7205886840820312, "logits/rejected": -2.5663809776306152, "logps/chosen": -243.8598175048828, "logps/rejected": -587.3167724609375, "loss": 0.03, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5075892210006714, "rewards/margins": 25.749042510986328, "rewards/rejected": -26.256628036499023, "step": 2110 }, { "epoch": 0.72, "learning_rate": 4.221326954551177e-07, "logits/chosen": -2.7620463371276855, "logits/rejected": -2.5519957542419434, "logps/chosen": -222.3040771484375, "logps/rejected": -725.4957885742188, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 0.23271289467811584, "rewards/margins": 23.736108779907227, "rewards/rejected": -23.503395080566406, "step": 2120 }, { "epoch": 0.72, "learning_rate": 4.215032103739141e-07, "logits/chosen": -2.655271530151367, "logits/rejected": -2.5609023571014404, "logps/chosen": -273.19244384765625, "logps/rejected": -730.9161376953125, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -0.2364242523908615, "rewards/margins": 27.68893051147461, "rewards/rejected": -27.925357818603516, "step": 2130 }, { "epoch": 0.73, "learning_rate": 4.2087372529271055e-07, "logits/chosen": -2.6452105045318604, "logits/rejected": -2.5943095684051514, "logps/chosen": -238.17333984375, "logps/rejected": -541.7523193359375, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -0.20333370566368103, "rewards/margins": 24.510662078857422, "rewards/rejected": -24.7139949798584, "step": 2140 }, { "epoch": 0.73, "learning_rate": 4.2024424021150697e-07, "logits/chosen": -2.7366740703582764, "logits/rejected": -2.5461695194244385, "logps/chosen": -213.2388458251953, "logps/rejected": -668.3230590820312, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.5536194443702698, "rewards/margins": 24.080286026000977, "rewards/rejected": -23.526668548583984, "step": 2150 }, { "epoch": 0.73, "learning_rate": 4.1961475513030334e-07, "logits/chosen": -2.7042200565338135, "logits/rejected": -2.6419358253479004, "logps/chosen": -328.40887451171875, "logps/rejected": -461.9959411621094, "loss": 0.0371, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5283013582229614, "rewards/margins": 27.091136932373047, "rewards/rejected": -26.56283950805664, "step": 2160 }, { "epoch": 0.74, "learning_rate": 4.189852700490998e-07, "logits/chosen": -2.7990882396698, "logits/rejected": -2.6383023262023926, "logps/chosen": -263.3553771972656, "logps/rejected": -589.2244262695312, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": 0.41476207971572876, "rewards/margins": 28.930557250976562, "rewards/rejected": -28.515792846679688, "step": 2170 }, { "epoch": 0.74, "learning_rate": 4.1835578496789624e-07, "logits/chosen": -2.779778242111206, "logits/rejected": -2.604832172393799, "logps/chosen": -266.7163391113281, "logps/rejected": -637.8961181640625, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 1.1610097885131836, "rewards/margins": 26.60187339782715, "rewards/rejected": -25.44086265563965, "step": 2180 }, { "epoch": 0.74, "learning_rate": 4.1772629988669266e-07, "logits/chosen": -2.8000168800354004, "logits/rejected": -2.666163682937622, "logps/chosen": -269.74658203125, "logps/rejected": -673.542236328125, "loss": 0.0204, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5513342022895813, "rewards/margins": 30.5479679107666, "rewards/rejected": -29.996631622314453, "step": 2190 }, { "epoch": 0.75, "learning_rate": 4.170968148054891e-07, "logits/chosen": -2.645045042037964, "logits/rejected": -2.719619035720825, "logps/chosen": -491.03558349609375, "logps/rejected": -612.4910888671875, "loss": 0.0028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.03465430811047554, "rewards/margins": 26.028339385986328, "rewards/rejected": -26.062992095947266, "step": 2200 }, { "epoch": 0.75, "eval_logits/chosen": -2.9243931770324707, "eval_logits/rejected": -2.7200496196746826, "eval_logps/chosen": -261.1270446777344, "eval_logps/rejected": -665.4395751953125, "eval_loss": 0.01806098408997059, "eval_rewards/accuracies": 0.994107723236084, "eval_rewards/chosen": 0.18294867873191833, "eval_rewards/margins": 26.509366989135742, "eval_rewards/rejected": -26.326417922973633, "eval_runtime": 461.5491, "eval_samples_per_second": 20.583, "eval_steps_per_second": 0.643, "step": 2200 }, { "epoch": 0.75, "learning_rate": 4.164673297242855e-07, "logits/chosen": -2.6869940757751465, "logits/rejected": -2.6642332077026367, "logps/chosen": -419.41949462890625, "logps/rejected": -670.6546630859375, "loss": 0.0059, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.10923495143651962, "rewards/margins": 24.063011169433594, "rewards/rejected": -24.172243118286133, "step": 2210 }, { "epoch": 0.75, "learning_rate": 4.1583784464308193e-07, "logits/chosen": -2.892453193664551, "logits/rejected": -2.6266751289367676, "logps/chosen": -192.79818725585938, "logps/rejected": -605.1399536132812, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": 0.3278549313545227, "rewards/margins": 26.968551635742188, "rewards/rejected": -26.640695571899414, "step": 2220 }, { "epoch": 0.76, "learning_rate": 4.152083595618784e-07, "logits/chosen": -2.801997661590576, "logits/rejected": -2.738144636154175, "logps/chosen": -198.71047973632812, "logps/rejected": -540.5067138671875, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3819180428981781, "rewards/margins": 25.39408302307129, "rewards/rejected": -25.01216697692871, "step": 2230 }, { "epoch": 0.76, "learning_rate": 4.145788744806748e-07, "logits/chosen": -2.825984477996826, "logits/rejected": -2.6215903759002686, "logps/chosen": -206.67636108398438, "logps/rejected": -625.7019653320312, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8844151496887207, "rewards/margins": 25.18911361694336, "rewards/rejected": -24.304702758789062, "step": 2240 }, { "epoch": 0.76, "learning_rate": 4.139493893994712e-07, "logits/chosen": -2.7909998893737793, "logits/rejected": -2.7546143531799316, "logps/chosen": -221.7032012939453, "logps/rejected": -511.81787109375, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": 0.5463350415229797, "rewards/margins": 25.55159568786621, "rewards/rejected": -25.005260467529297, "step": 2250 }, { "epoch": 0.77, "learning_rate": 4.133199043182676e-07, "logits/chosen": -2.8236422538757324, "logits/rejected": -2.7228808403015137, "logps/chosen": -208.43069458007812, "logps/rejected": -643.4486083984375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.510372519493103, "rewards/margins": 25.80242347717285, "rewards/rejected": -25.29205322265625, "step": 2260 }, { "epoch": 0.77, "learning_rate": 4.1269041923706404e-07, "logits/chosen": -2.7835776805877686, "logits/rejected": -2.771327495574951, "logps/chosen": -305.79656982421875, "logps/rejected": -612.8425903320312, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 0.4499272406101227, "rewards/margins": 27.649242401123047, "rewards/rejected": -27.199316024780273, "step": 2270 }, { "epoch": 0.77, "learning_rate": 4.1206093415586047e-07, "logits/chosen": -2.7974300384521484, "logits/rejected": -2.669985294342041, "logps/chosen": -220.9478759765625, "logps/rejected": -574.649169921875, "loss": 0.0514, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8602330088615417, "rewards/margins": 23.116596221923828, "rewards/rejected": -22.256359100341797, "step": 2280 }, { "epoch": 0.78, "learning_rate": 4.1143144907465694e-07, "logits/chosen": -2.7419795989990234, "logits/rejected": -2.6166789531707764, "logps/chosen": -222.3448486328125, "logps/rejected": -666.5172729492188, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 0.5212323069572449, "rewards/margins": 25.8842716217041, "rewards/rejected": -25.363040924072266, "step": 2290 }, { "epoch": 0.78, "learning_rate": 4.1080196399345336e-07, "logits/chosen": -2.7161436080932617, "logits/rejected": -2.6169309616088867, "logps/chosen": -256.92095947265625, "logps/rejected": -655.6906127929688, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -0.15434584021568298, "rewards/margins": 23.398479461669922, "rewards/rejected": -23.55282211303711, "step": 2300 }, { "epoch": 0.78, "eval_logits/chosen": -2.782435417175293, "eval_logits/rejected": -2.6358001232147217, "eval_logps/chosen": -266.5664978027344, "eval_logps/rejected": -654.9700927734375, "eval_loss": 0.019356682896614075, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": -0.36099734902381897, "eval_rewards/margins": 24.91847801208496, "eval_rewards/rejected": -25.279476165771484, "eval_runtime": 461.021, "eval_samples_per_second": 20.606, "eval_steps_per_second": 0.644, "step": 2300 }, { "epoch": 0.79, "learning_rate": 4.101724789122498e-07, "logits/chosen": -2.654771327972412, "logits/rejected": -2.6775193214416504, "logps/chosen": -343.95166015625, "logps/rejected": -670.9977416992188, "loss": 0.0299, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1622137725353241, "rewards/margins": 26.724456787109375, "rewards/rejected": -26.562244415283203, "step": 2310 }, { "epoch": 0.79, "learning_rate": 4.0954299383104616e-07, "logits/chosen": -2.632749080657959, "logits/rejected": -2.664358377456665, "logps/chosen": -250.6299285888672, "logps/rejected": -554.7068481445312, "loss": 0.0254, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.05479208379983902, "rewards/margins": 21.266366958618164, "rewards/rejected": -21.211578369140625, "step": 2320 }, { "epoch": 0.79, "learning_rate": 4.089135087498426e-07, "logits/chosen": -2.714599132537842, "logits/rejected": -2.7502286434173584, "logps/chosen": -414.62738037109375, "logps/rejected": -572.8042602539062, "loss": 0.0239, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.49183282256126404, "rewards/margins": 23.304805755615234, "rewards/rejected": -22.812971115112305, "step": 2330 }, { "epoch": 0.8, "learning_rate": 4.08284023668639e-07, "logits/chosen": -2.7510106563568115, "logits/rejected": -2.6608102321624756, "logps/chosen": -254.67532348632812, "logps/rejected": -883.89453125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 0.7735916376113892, "rewards/margins": 23.575878143310547, "rewards/rejected": -22.802288055419922, "step": 2340 }, { "epoch": 0.8, "learning_rate": 4.076545385874355e-07, "logits/chosen": -2.824338436126709, "logits/rejected": -2.712700128555298, "logps/chosen": -244.8394775390625, "logps/rejected": -586.3604125976562, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -0.12318040430545807, "rewards/margins": 30.368709564208984, "rewards/rejected": -30.491891860961914, "step": 2350 }, { "epoch": 0.8, "learning_rate": 4.070250535062319e-07, "logits/chosen": -2.8288607597351074, "logits/rejected": -2.733421564102173, "logps/chosen": -202.61476135253906, "logps/rejected": -569.3336181640625, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 0.4053502082824707, "rewards/margins": 25.887060165405273, "rewards/rejected": -25.481712341308594, "step": 2360 }, { "epoch": 0.81, "learning_rate": 4.063955684250283e-07, "logits/chosen": -2.730517625808716, "logits/rejected": -2.6915996074676514, "logps/chosen": -245.07077026367188, "logps/rejected": -741.0025634765625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.4065241813659668, "rewards/margins": 25.972362518310547, "rewards/rejected": -25.565837860107422, "step": 2370 }, { "epoch": 0.81, "learning_rate": 4.0576608334382475e-07, "logits/chosen": -2.680234432220459, "logits/rejected": -2.6621408462524414, "logps/chosen": -280.0077819824219, "logps/rejected": -556.5657958984375, "loss": 0.064, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2725940942764282, "rewards/margins": 24.296829223632812, "rewards/rejected": -24.56942367553711, "step": 2380 }, { "epoch": 0.81, "learning_rate": 4.051365982626211e-07, "logits/chosen": -2.675715208053589, "logits/rejected": -2.5427017211914062, "logps/chosen": -258.26251220703125, "logps/rejected": -761.8800048828125, "loss": 0.1764, "rewards/accuracies": 1.0, "rewards/chosen": 1.379744529724121, "rewards/margins": 29.04900550842285, "rewards/rejected": -27.669261932373047, "step": 2390 }, { "epoch": 0.82, "learning_rate": 4.0450711318141754e-07, "logits/chosen": -2.725728988647461, "logits/rejected": -2.587568998336792, "logps/chosen": -220.0096893310547, "logps/rejected": -618.4484252929688, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": 1.2481582164764404, "rewards/margins": 22.892587661743164, "rewards/rejected": -21.644428253173828, "step": 2400 }, { "epoch": 0.82, "eval_logits/chosen": -2.758493661880493, "eval_logits/rejected": -2.6171348094940186, "eval_logps/chosen": -250.2310028076172, "eval_logps/rejected": -644.8375854492188, "eval_loss": 0.02273266389966011, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.272552490234375, "eval_rewards/margins": 25.538782119750977, "eval_rewards/rejected": -24.266225814819336, "eval_runtime": 461.1667, "eval_samples_per_second": 20.6, "eval_steps_per_second": 0.644, "step": 2400 }, { "epoch": 0.82, "learning_rate": 4.03877628100214e-07, "logits/chosen": -2.6778273582458496, "logits/rejected": -2.6126132011413574, "logps/chosen": -217.85531616210938, "logps/rejected": -574.7618408203125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 1.264923334121704, "rewards/margins": 27.4355525970459, "rewards/rejected": -26.17063331604004, "step": 2410 }, { "epoch": 0.82, "learning_rate": 4.0324814301901044e-07, "logits/chosen": -2.5505857467651367, "logits/rejected": -2.5584471225738525, "logps/chosen": -281.96429443359375, "logps/rejected": -534.6473999023438, "loss": 0.0061, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8898992538452148, "rewards/margins": 24.090633392333984, "rewards/rejected": -22.20073699951172, "step": 2420 }, { "epoch": 0.83, "learning_rate": 4.0261865793780686e-07, "logits/chosen": -2.655441999435425, "logits/rejected": -2.538745403289795, "logps/chosen": -253.83761596679688, "logps/rejected": -650.1085205078125, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 0.9663955569267273, "rewards/margins": 23.972461700439453, "rewards/rejected": -23.006065368652344, "step": 2430 }, { "epoch": 0.83, "learning_rate": 4.019891728566033e-07, "logits/chosen": -2.5701498985290527, "logits/rejected": -2.5804145336151123, "logps/chosen": -266.05389404296875, "logps/rejected": -754.8343505859375, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0600953102111816, "rewards/margins": 25.238994598388672, "rewards/rejected": -24.17889976501465, "step": 2440 }, { "epoch": 0.83, "learning_rate": 4.013596877753997e-07, "logits/chosen": -2.699598789215088, "logits/rejected": -2.4064505100250244, "logps/chosen": -214.0556640625, "logps/rejected": -720.0762939453125, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 0.5004309415817261, "rewards/margins": 26.905696868896484, "rewards/rejected": -26.4052677154541, "step": 2450 }, { "epoch": 0.84, "learning_rate": 4.0073020269419613e-07, "logits/chosen": -2.647881031036377, "logits/rejected": -2.543602466583252, "logps/chosen": -200.25436401367188, "logps/rejected": -657.3226318359375, "loss": 0.0586, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3979828357696533, "rewards/margins": 26.641937255859375, "rewards/rejected": -26.243953704833984, "step": 2460 }, { "epoch": 0.84, "learning_rate": 4.0010071761299255e-07, "logits/chosen": -2.6377675533294678, "logits/rejected": -2.6237003803253174, "logps/chosen": -268.60943603515625, "logps/rejected": -692.0289916992188, "loss": 0.111, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5339861512184143, "rewards/margins": 31.77768898010254, "rewards/rejected": -31.243698120117188, "step": 2470 }, { "epoch": 0.84, "learning_rate": 3.99471232531789e-07, "logits/chosen": -2.5716850757598877, "logits/rejected": -2.644987106323242, "logps/chosen": -282.94903564453125, "logps/rejected": -609.022216796875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 0.6198418140411377, "rewards/margins": 26.19207191467285, "rewards/rejected": -25.572227478027344, "step": 2480 }, { "epoch": 0.85, "learning_rate": 3.988417474505854e-07, "logits/chosen": -2.646796703338623, "logits/rejected": -2.650146961212158, "logps/chosen": -243.31124877929688, "logps/rejected": -506.1397399902344, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 0.20205366611480713, "rewards/margins": 23.656620025634766, "rewards/rejected": -23.454565048217773, "step": 2490 }, { "epoch": 0.85, "learning_rate": 3.982122623693818e-07, "logits/chosen": -2.5957813262939453, "logits/rejected": -2.710216522216797, "logps/chosen": -309.61895751953125, "logps/rejected": -568.7562866210938, "loss": 0.014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5930233597755432, "rewards/margins": 29.912899017333984, "rewards/rejected": -29.319875717163086, "step": 2500 }, { "epoch": 0.85, "eval_logits/chosen": -2.7765257358551025, "eval_logits/rejected": -2.6237926483154297, "eval_logps/chosen": -261.824462890625, "eval_logps/rejected": -697.99609375, "eval_loss": 0.01683010719716549, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.11320476233959198, "eval_rewards/margins": 29.695280075073242, "eval_rewards/rejected": -29.582077026367188, "eval_runtime": 460.7941, "eval_samples_per_second": 20.617, "eval_steps_per_second": 0.645, "step": 2500 }, { "epoch": 0.85, "learning_rate": 3.9758277728817824e-07, "logits/chosen": -2.5689761638641357, "logits/rejected": -2.5648765563964844, "logps/chosen": -277.6985168457031, "logps/rejected": -701.7349243164062, "loss": 0.1303, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20911407470703125, "rewards/margins": 31.960742950439453, "rewards/rejected": -31.751628875732422, "step": 2510 }, { "epoch": 0.86, "learning_rate": 3.9695329220697467e-07, "logits/chosen": -2.6812217235565186, "logits/rejected": -2.5508625507354736, "logps/chosen": -218.9812469482422, "logps/rejected": -908.4054565429688, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -0.8248122930526733, "rewards/margins": 32.989234924316406, "rewards/rejected": -33.81404495239258, "step": 2520 }, { "epoch": 0.86, "learning_rate": 3.9632380712577114e-07, "logits/chosen": -2.6739261150360107, "logits/rejected": -2.5640244483947754, "logps/chosen": -300.15020751953125, "logps/rejected": -665.2535400390625, "loss": 0.0062, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8815475702285767, "rewards/margins": 27.936695098876953, "rewards/rejected": -27.055145263671875, "step": 2530 }, { "epoch": 0.86, "learning_rate": 3.9569432204456756e-07, "logits/chosen": -2.60783052444458, "logits/rejected": -2.509875535964966, "logps/chosen": -206.7698211669922, "logps/rejected": -556.12841796875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.0304319858551025, "rewards/margins": 27.027912139892578, "rewards/rejected": -25.997478485107422, "step": 2540 }, { "epoch": 0.87, "learning_rate": 3.9506483696336393e-07, "logits/chosen": -2.5998034477233887, "logits/rejected": -2.6505286693573, "logps/chosen": -253.0867919921875, "logps/rejected": -692.447998046875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.8967704772949219, "rewards/margins": 31.6607608795166, "rewards/rejected": -30.763988494873047, "step": 2550 }, { "epoch": 0.87, "learning_rate": 3.9443535188216036e-07, "logits/chosen": -2.698728561401367, "logits/rejected": -2.561741352081299, "logps/chosen": -209.21304321289062, "logps/rejected": -737.6173095703125, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 0.7896028757095337, "rewards/margins": 28.9803524017334, "rewards/rejected": -28.190750122070312, "step": 2560 }, { "epoch": 0.87, "learning_rate": 3.938058668009568e-07, "logits/chosen": -2.5338001251220703, "logits/rejected": -2.5508155822753906, "logps/chosen": -273.3049011230469, "logps/rejected": -682.6311645507812, "loss": 0.0061, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.027163326740264893, "rewards/margins": 25.019729614257812, "rewards/rejected": -25.046894073486328, "step": 2570 }, { "epoch": 0.88, "learning_rate": 3.931763817197532e-07, "logits/chosen": -2.505735397338867, "logits/rejected": -2.4544994831085205, "logps/chosen": -259.4238586425781, "logps/rejected": -419.4248962402344, "loss": 0.158, "rewards/accuracies": 1.0, "rewards/chosen": 0.7169443368911743, "rewards/margins": 18.939842224121094, "rewards/rejected": -18.222896575927734, "step": 2580 }, { "epoch": 0.88, "learning_rate": 3.925468966385497e-07, "logits/chosen": -2.561739683151245, "logits/rejected": -2.473377227783203, "logps/chosen": -303.78424072265625, "logps/rejected": -448.43487548828125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.1835906505584717, "rewards/margins": 22.765567779541016, "rewards/rejected": -21.58197593688965, "step": 2590 }, { "epoch": 0.88, "learning_rate": 3.919174115573461e-07, "logits/chosen": -2.57975435256958, "logits/rejected": -2.5134735107421875, "logps/chosen": -205.55380249023438, "logps/rejected": -717.1515502929688, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.0949382781982422, "rewards/margins": 32.87470245361328, "rewards/rejected": -31.77976417541504, "step": 2600 }, { "epoch": 0.88, "eval_logits/chosen": -2.7396764755249023, "eval_logits/rejected": -2.5649099349975586, "eval_logps/chosen": -255.04025268554688, "eval_logps/rejected": -650.6893310546875, "eval_loss": 0.02667526714503765, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.7916285395622253, "eval_rewards/margins": 25.64302635192871, "eval_rewards/rejected": -24.851402282714844, "eval_runtime": 461.1594, "eval_samples_per_second": 20.6, "eval_steps_per_second": 0.644, "step": 2600 }, { "epoch": 0.89, "learning_rate": 3.912879264761425e-07, "logits/chosen": -2.6641430854797363, "logits/rejected": -2.5596208572387695, "logps/chosen": -266.46380615234375, "logps/rejected": -699.4759521484375, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 0.9972246289253235, "rewards/margins": 26.9931640625, "rewards/rejected": -25.99593734741211, "step": 2610 }, { "epoch": 0.89, "learning_rate": 3.906584413949389e-07, "logits/chosen": -2.601964235305786, "logits/rejected": -2.575681209564209, "logps/chosen": -330.3256530761719, "logps/rejected": -524.6996459960938, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.7288700342178345, "rewards/margins": 21.517120361328125, "rewards/rejected": -20.788249969482422, "step": 2620 }, { "epoch": 0.89, "learning_rate": 3.900289563137353e-07, "logits/chosen": -2.512920618057251, "logits/rejected": -2.502119779586792, "logps/chosen": -216.5636749267578, "logps/rejected": -687.009765625, "loss": 0.3472, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.162569284439087, "rewards/margins": 27.958364486694336, "rewards/rejected": -26.795795440673828, "step": 2630 }, { "epoch": 0.9, "learning_rate": 3.8939947123253174e-07, "logits/chosen": -2.379615306854248, "logits/rejected": -2.464650869369507, "logps/chosen": -243.8478546142578, "logps/rejected": -565.3379516601562, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.4634462893009186, "rewards/margins": 24.53717041015625, "rewards/rejected": -24.073726654052734, "step": 2640 }, { "epoch": 0.9, "learning_rate": 3.887699861513282e-07, "logits/chosen": -2.5038154125213623, "logits/rejected": -2.4020674228668213, "logps/chosen": -184.65213012695312, "logps/rejected": -577.9073486328125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.834394097328186, "rewards/margins": 29.402633666992188, "rewards/rejected": -28.568241119384766, "step": 2650 }, { "epoch": 0.9, "learning_rate": 3.8814050107012464e-07, "logits/chosen": -2.553673267364502, "logits/rejected": -2.487461566925049, "logps/chosen": -260.7657775878906, "logps/rejected": -520.7728881835938, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.2255420684814453, "rewards/margins": 28.38750648498535, "rewards/rejected": -27.161962509155273, "step": 2660 }, { "epoch": 0.91, "learning_rate": 3.8751101598892106e-07, "logits/chosen": -2.655632734298706, "logits/rejected": -2.388211965560913, "logps/chosen": -218.3198699951172, "logps/rejected": -788.038330078125, "loss": 0.0024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8498915433883667, "rewards/margins": 30.296627044677734, "rewards/rejected": -29.44673728942871, "step": 2670 }, { "epoch": 0.91, "learning_rate": 3.868815309077175e-07, "logits/chosen": -2.5421464443206787, "logits/rejected": -2.4109978675842285, "logps/chosen": -260.9626770019531, "logps/rejected": -830.2786254882812, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.79246985912323, "rewards/margins": 30.376445770263672, "rewards/rejected": -29.583974838256836, "step": 2680 }, { "epoch": 0.91, "learning_rate": 3.862520458265139e-07, "logits/chosen": -2.581557273864746, "logits/rejected": -2.436567783355713, "logps/chosen": -291.28021240234375, "logps/rejected": -864.6397705078125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.8273299932479858, "rewards/margins": 31.677173614501953, "rewards/rejected": -30.8498477935791, "step": 2690 }, { "epoch": 0.92, "learning_rate": 3.856225607453103e-07, "logits/chosen": -2.686209201812744, "logits/rejected": -2.4898226261138916, "logps/chosen": -176.74557495117188, "logps/rejected": -597.9735717773438, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 0.6193677186965942, "rewards/margins": 30.461755752563477, "rewards/rejected": -29.842388153076172, "step": 2700 }, { "epoch": 0.92, "eval_logits/chosen": -2.754260301589966, "eval_logits/rejected": -2.5661351680755615, "eval_logps/chosen": -257.86492919921875, "eval_logps/rejected": -687.63818359375, "eval_loss": 0.009040852077305317, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.5091585516929626, "eval_rewards/margins": 29.055450439453125, "eval_rewards/rejected": -28.546289443969727, "eval_runtime": 460.8222, "eval_samples_per_second": 20.615, "eval_steps_per_second": 0.645, "step": 2700 }, { "epoch": 0.92, "learning_rate": 3.8499307566410675e-07, "logits/chosen": -2.640594482421875, "logits/rejected": -2.546736717224121, "logps/chosen": -209.605224609375, "logps/rejected": -667.8870849609375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.048979610204696655, "rewards/margins": 30.92266845703125, "rewards/rejected": -30.87369155883789, "step": 2710 }, { "epoch": 0.92, "learning_rate": 3.843635905829032e-07, "logits/chosen": -2.561689853668213, "logits/rejected": -2.605921983718872, "logps/chosen": -345.9256286621094, "logps/rejected": -751.64404296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.5991737246513367, "rewards/margins": 29.389257431030273, "rewards/rejected": -28.790081024169922, "step": 2720 }, { "epoch": 0.93, "learning_rate": 3.837341055016996e-07, "logits/chosen": -2.6261181831359863, "logits/rejected": -2.591381311416626, "logps/chosen": -212.19711303710938, "logps/rejected": -755.148193359375, "loss": 0.0062, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.07108749449253082, "rewards/margins": 26.755884170532227, "rewards/rejected": -26.82697105407715, "step": 2730 }, { "epoch": 0.93, "learning_rate": 3.83104620420496e-07, "logits/chosen": -2.575974702835083, "logits/rejected": -2.5418152809143066, "logps/chosen": -278.9146423339844, "logps/rejected": -947.7532958984375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.4847508370876312, "rewards/margins": 29.674001693725586, "rewards/rejected": -30.158756256103516, "step": 2740 }, { "epoch": 0.93, "learning_rate": 3.8247513533929244e-07, "logits/chosen": -2.655622720718384, "logits/rejected": -2.5585243701934814, "logps/chosen": -219.5099334716797, "logps/rejected": -808.62109375, "loss": 0.1138, "rewards/accuracies": 1.0, "rewards/chosen": -0.14701418578624725, "rewards/margins": 29.206857681274414, "rewards/rejected": -29.353870391845703, "step": 2750 }, { "epoch": 0.94, "learning_rate": 3.8184565025808887e-07, "logits/chosen": -2.657684087753296, "logits/rejected": -2.6649317741394043, "logps/chosen": -284.1194763183594, "logps/rejected": -729.489501953125, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 0.9560995101928711, "rewards/margins": 29.412174224853516, "rewards/rejected": -28.456073760986328, "step": 2760 }, { "epoch": 0.94, "learning_rate": 3.8121616517688534e-07, "logits/chosen": -2.6876230239868164, "logits/rejected": -2.5116684436798096, "logps/chosen": -191.7171630859375, "logps/rejected": -676.7745971679688, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.977108359336853, "rewards/margins": 28.7204647064209, "rewards/rejected": -26.743362426757812, "step": 2770 }, { "epoch": 0.94, "learning_rate": 3.805866800956817e-07, "logits/chosen": -2.6264519691467285, "logits/rejected": -2.602463722229004, "logps/chosen": -195.95089721679688, "logps/rejected": -479.21368408203125, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 1.195467472076416, "rewards/margins": 28.899322509765625, "rewards/rejected": -27.703847885131836, "step": 2780 }, { "epoch": 0.95, "learning_rate": 3.7995719501447813e-07, "logits/chosen": -2.7318661212921143, "logits/rejected": -2.5079636573791504, "logps/chosen": -214.06362915039062, "logps/rejected": -627.875, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 1.7320741415023804, "rewards/margins": 30.3930606842041, "rewards/rejected": -28.66098403930664, "step": 2790 }, { "epoch": 0.95, "learning_rate": 3.7932770993327456e-07, "logits/chosen": -2.6885361671447754, "logits/rejected": -2.6349658966064453, "logps/chosen": -205.16162109375, "logps/rejected": -892.1314697265625, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": 1.3932931423187256, "rewards/margins": 31.193689346313477, "rewards/rejected": -29.800395965576172, "step": 2800 }, { "epoch": 0.95, "eval_logits/chosen": -2.8053812980651855, "eval_logits/rejected": -2.617931604385376, "eval_logps/chosen": -254.13375854492188, "eval_logps/rejected": -708.4395751953125, "eval_loss": 0.010327051393687725, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.8822786211967468, "eval_rewards/margins": 31.508703231811523, "eval_rewards/rejected": -30.626420974731445, "eval_runtime": 461.0007, "eval_samples_per_second": 20.607, "eval_steps_per_second": 0.644, "step": 2800 }, { "epoch": 0.96, "learning_rate": 3.78698224852071e-07, "logits/chosen": -2.7540574073791504, "logits/rejected": -2.6096949577331543, "logps/chosen": -199.91162109375, "logps/rejected": -562.3629760742188, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 0.686630129814148, "rewards/margins": 32.43683624267578, "rewards/rejected": -31.750207901000977, "step": 2810 }, { "epoch": 0.96, "learning_rate": 3.780687397708674e-07, "logits/chosen": -2.7039151191711426, "logits/rejected": -2.5283138751983643, "logps/chosen": -216.7244415283203, "logps/rejected": -502.4744567871094, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 0.9116774797439575, "rewards/margins": 25.491653442382812, "rewards/rejected": -24.57997703552246, "step": 2820 }, { "epoch": 0.96, "learning_rate": 3.774392546896638e-07, "logits/chosen": -2.6259799003601074, "logits/rejected": -2.4684836864471436, "logps/chosen": -272.5995178222656, "logps/rejected": -772.5123901367188, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.1303339004516602, "rewards/margins": 26.932714462280273, "rewards/rejected": -25.802379608154297, "step": 2830 }, { "epoch": 0.97, "learning_rate": 3.768097696084603e-07, "logits/chosen": -2.7302961349487305, "logits/rejected": -2.584216833114624, "logps/chosen": -238.9605712890625, "logps/rejected": -717.0538940429688, "loss": 0.0176, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7044904232025146, "rewards/margins": 32.45585250854492, "rewards/rejected": -30.751361846923828, "step": 2840 }, { "epoch": 0.97, "learning_rate": 3.761802845272567e-07, "logits/chosen": -2.7493700981140137, "logits/rejected": -2.537161350250244, "logps/chosen": -247.82351684570312, "logps/rejected": -541.2569580078125, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 2.111823320388794, "rewards/margins": 23.868099212646484, "rewards/rejected": -21.756275177001953, "step": 2850 }, { "epoch": 0.97, "learning_rate": 3.755507994460531e-07, "logits/chosen": -2.8366217613220215, "logits/rejected": -2.597653865814209, "logps/chosen": -213.44400024414062, "logps/rejected": -528.08447265625, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 1.3815059661865234, "rewards/margins": 23.707685470581055, "rewards/rejected": -22.326181411743164, "step": 2860 }, { "epoch": 0.98, "learning_rate": 3.749213143648495e-07, "logits/chosen": -2.6966845989227295, "logits/rejected": -2.6002211570739746, "logps/chosen": -312.1767883300781, "logps/rejected": -678.533203125, "loss": 0.0165, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3659807443618774, "rewards/margins": 23.515565872192383, "rewards/rejected": -22.149585723876953, "step": 2870 }, { "epoch": 0.98, "learning_rate": 3.7429182928364594e-07, "logits/chosen": -2.830348253250122, "logits/rejected": -2.6034045219421387, "logps/chosen": -258.77166748046875, "logps/rejected": -865.7574462890625, "loss": 0.0038, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.134671926498413, "rewards/margins": 26.58443832397461, "rewards/rejected": -25.449769973754883, "step": 2880 }, { "epoch": 0.98, "learning_rate": 3.7366234420244236e-07, "logits/chosen": -2.6024880409240723, "logits/rejected": -2.6797871589660645, "logps/chosen": -374.3378601074219, "logps/rejected": -756.2149658203125, "loss": 0.0102, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.639643907546997, "rewards/margins": 30.2697811126709, "rewards/rejected": -28.630136489868164, "step": 2890 }, { "epoch": 0.99, "learning_rate": 3.7303285912123884e-07, "logits/chosen": -2.8375117778778076, "logits/rejected": -2.6386044025421143, "logps/chosen": -214.44735717773438, "logps/rejected": -613.2222290039062, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.6264175176620483, "rewards/margins": 26.213571548461914, "rewards/rejected": -25.587154388427734, "step": 2900 }, { "epoch": 0.99, "eval_logits/chosen": -2.9013984203338623, "eval_logits/rejected": -2.693875312805176, "eval_logps/chosen": -250.0462646484375, "eval_logps/rejected": -655.1255493164062, "eval_loss": 0.011161337606608868, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.2910281419754028, "eval_rewards/margins": 26.58604621887207, "eval_rewards/rejected": -25.29501724243164, "eval_runtime": 460.2833, "eval_samples_per_second": 20.639, "eval_steps_per_second": 0.645, "step": 2900 }, { "epoch": 0.99, "learning_rate": 3.7240337404003526e-07, "logits/chosen": -2.6777472496032715, "logits/rejected": -2.6005537509918213, "logps/chosen": -205.68325805664062, "logps/rejected": -815.2028198242188, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.8425712585449219, "rewards/margins": 32.69735336303711, "rewards/rejected": -31.854782104492188, "step": 2910 }, { "epoch": 0.99, "learning_rate": 3.717738889588317e-07, "logits/chosen": -2.666714906692505, "logits/rejected": -2.6401782035827637, "logps/chosen": -216.15469360351562, "logps/rejected": -785.9351806640625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.9678772687911987, "rewards/margins": 27.594335556030273, "rewards/rejected": -26.6264591217041, "step": 2920 }, { "epoch": 1.0, "learning_rate": 3.7114440387762805e-07, "logits/chosen": -2.737814426422119, "logits/rejected": -2.637795925140381, "logps/chosen": -201.18386840820312, "logps/rejected": -487.59869384765625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.8285795450210571, "rewards/margins": 24.61110496520996, "rewards/rejected": -23.782527923583984, "step": 2930 }, { "epoch": 1.0, "learning_rate": 3.705149187964245e-07, "logits/chosen": -2.69553542137146, "logits/rejected": -2.6212849617004395, "logps/chosen": -273.14080810546875, "logps/rejected": -635.7806396484375, "loss": 0.0806, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.58188396692276, "rewards/margins": 27.441326141357422, "rewards/rejected": -26.85944175720215, "step": 2940 }, { "epoch": 1.0, "learning_rate": 3.698854337152209e-07, "logits/chosen": -2.7722973823547363, "logits/rejected": -2.6918299198150635, "logps/chosen": -203.45046997070312, "logps/rejected": -642.851806640625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.842759907245636, "rewards/margins": 33.32647705078125, "rewards/rejected": -32.48371505737305, "step": 2950 }, { "epoch": 1.01, "learning_rate": 3.692559486340174e-07, "logits/chosen": -2.8049843311309814, "logits/rejected": -2.7082982063293457, "logps/chosen": -259.6463317871094, "logps/rejected": -1049.972900390625, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -0.09745602309703827, "rewards/margins": 36.42062759399414, "rewards/rejected": -36.51808547973633, "step": 2960 }, { "epoch": 1.01, "learning_rate": 3.686264635528138e-07, "logits/chosen": -2.963876485824585, "logits/rejected": -2.5451674461364746, "logps/chosen": -201.79275512695312, "logps/rejected": -442.172607421875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.38444527983665466, "rewards/margins": 27.966815948486328, "rewards/rejected": -27.58237075805664, "step": 2970 }, { "epoch": 1.01, "learning_rate": 3.679969784716102e-07, "logits/chosen": -2.889106035232544, "logits/rejected": -2.763339042663574, "logps/chosen": -201.71542358398438, "logps/rejected": -610.3404541015625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.37668752670288086, "rewards/margins": 30.214839935302734, "rewards/rejected": -30.591527938842773, "step": 2980 }, { "epoch": 1.02, "learning_rate": 3.6736749339040664e-07, "logits/chosen": -2.7220089435577393, "logits/rejected": -2.5631253719329834, "logps/chosen": -282.6343994140625, "logps/rejected": -739.330078125, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": -0.3209044635295868, "rewards/margins": 28.472904205322266, "rewards/rejected": -28.793807983398438, "step": 2990 }, { "epoch": 1.02, "learning_rate": 3.6673800830920307e-07, "logits/chosen": -2.663116693496704, "logits/rejected": -2.514334201812744, "logps/chosen": -249.870361328125, "logps/rejected": -604.9058837890625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.4294418394565582, "rewards/margins": 23.02956771850586, "rewards/rejected": -22.60012435913086, "step": 3000 }, { "epoch": 1.02, "eval_logits/chosen": -2.8588478565216064, "eval_logits/rejected": -2.602616310119629, "eval_logps/chosen": -268.8800048828125, "eval_logps/rejected": -671.525390625, "eval_loss": 0.014914510771632195, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.5923497080802917, "eval_rewards/margins": 26.34265899658203, "eval_rewards/rejected": -26.93501091003418, "eval_runtime": 460.8966, "eval_samples_per_second": 20.612, "eval_steps_per_second": 0.644, "step": 3000 }, { "epoch": 1.02, "learning_rate": 3.6610852322799943e-07, "logits/chosen": -2.7751994132995605, "logits/rejected": -2.606536388397217, "logps/chosen": -322.23114013671875, "logps/rejected": -673.2225341796875, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -0.5700228810310364, "rewards/margins": 25.522668838500977, "rewards/rejected": -26.092687606811523, "step": 3010 }, { "epoch": 1.03, "learning_rate": 3.654790381467959e-07, "logits/chosen": -2.80159330368042, "logits/rejected": -2.6721062660217285, "logps/chosen": -224.3547821044922, "logps/rejected": -500.1036071777344, "loss": 0.0351, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.008521604351699352, "rewards/margins": 27.1297664642334, "rewards/rejected": -27.12124252319336, "step": 3020 }, { "epoch": 1.03, "learning_rate": 3.6484955306559233e-07, "logits/chosen": -2.708311080932617, "logits/rejected": -2.5160257816314697, "logps/chosen": -333.6979675292969, "logps/rejected": -535.8858642578125, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": 0.7293016910552979, "rewards/margins": 24.816349029541016, "rewards/rejected": -24.087045669555664, "step": 3030 }, { "epoch": 1.03, "learning_rate": 3.6422006798438876e-07, "logits/chosen": -2.890998125076294, "logits/rejected": -2.6891956329345703, "logps/chosen": -249.77108764648438, "logps/rejected": -660.7885131835938, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.7652451395988464, "rewards/margins": 28.470529556274414, "rewards/rejected": -29.23577880859375, "step": 3040 }, { "epoch": 1.04, "learning_rate": 3.635905829031852e-07, "logits/chosen": -2.899890184402466, "logits/rejected": -2.7299091815948486, "logps/chosen": -326.86322021484375, "logps/rejected": -560.8493041992188, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.28824949264526367, "rewards/margins": 26.929113388061523, "rewards/rejected": -27.217361450195312, "step": 3050 }, { "epoch": 1.04, "learning_rate": 3.629610978219816e-07, "logits/chosen": -2.838972568511963, "logits/rejected": -2.7252838611602783, "logps/chosen": -237.58352661132812, "logps/rejected": -815.12890625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.42706379294395447, "rewards/margins": 26.174713134765625, "rewards/rejected": -25.74764633178711, "step": 3060 }, { "epoch": 1.04, "learning_rate": 3.62331612740778e-07, "logits/chosen": -2.8063912391662598, "logits/rejected": -2.7485084533691406, "logps/chosen": -315.24658203125, "logps/rejected": -661.150390625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.506965160369873, "rewards/margins": 27.849956512451172, "rewards/rejected": -27.342992782592773, "step": 3070 }, { "epoch": 1.05, "learning_rate": 3.617021276595745e-07, "logits/chosen": -2.768949270248413, "logits/rejected": -2.6433167457580566, "logps/chosen": -422.2813415527344, "logps/rejected": -555.9739379882812, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.30259379744529724, "rewards/margins": 25.829975128173828, "rewards/rejected": -25.52738380432129, "step": 3080 }, { "epoch": 1.05, "learning_rate": 3.6107264257837087e-07, "logits/chosen": -2.9657464027404785, "logits/rejected": -2.6640779972076416, "logps/chosen": -263.73944091796875, "logps/rejected": -747.2318115234375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.09739839285612106, "rewards/margins": 27.31549072265625, "rewards/rejected": -27.218097686767578, "step": 3090 }, { "epoch": 1.05, "learning_rate": 3.604431574971673e-07, "logits/chosen": -2.772359609603882, "logits/rejected": -2.6546008586883545, "logps/chosen": -252.8335418701172, "logps/rejected": -632.466796875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.3233366310596466, "rewards/margins": 25.172077178955078, "rewards/rejected": -24.848739624023438, "step": 3100 }, { "epoch": 1.05, "eval_logits/chosen": -2.9729793071746826, "eval_logits/rejected": -2.711977243423462, "eval_logps/chosen": -260.5484313964844, "eval_logps/rejected": -680.8969116210938, "eval_loss": 0.01198588963598013, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.2408105581998825, "eval_rewards/margins": 28.11296272277832, "eval_rewards/rejected": -27.87215232849121, "eval_runtime": 461.9728, "eval_samples_per_second": 20.564, "eval_steps_per_second": 0.643, "step": 3100 }, { "epoch": 1.06, "learning_rate": 3.598136724159637e-07, "logits/chosen": -2.8393616676330566, "logits/rejected": -2.6849372386932373, "logps/chosen": -308.0016174316406, "logps/rejected": -544.7275390625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.24959810078144073, "rewards/margins": 29.6921443939209, "rewards/rejected": -29.442546844482422, "step": 3110 }, { "epoch": 1.06, "learning_rate": 3.5918418733476014e-07, "logits/chosen": -2.9004597663879395, "logits/rejected": -2.6759543418884277, "logps/chosen": -323.9352111816406, "logps/rejected": -655.8312377929688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.6416661143302917, "rewards/margins": 30.336132049560547, "rewards/rejected": -29.694469451904297, "step": 3120 }, { "epoch": 1.06, "learning_rate": 3.5855470225355656e-07, "logits/chosen": -2.9727416038513184, "logits/rejected": -2.7100167274475098, "logps/chosen": -261.725341796875, "logps/rejected": -669.6080322265625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.7132970690727234, "rewards/margins": 26.03215980529785, "rewards/rejected": -25.318862915039062, "step": 3130 }, { "epoch": 1.07, "learning_rate": 3.5792521717235304e-07, "logits/chosen": -2.8985419273376465, "logits/rejected": -2.638688087463379, "logps/chosen": -262.9071960449219, "logps/rejected": -548.6202392578125, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 0.8214443325996399, "rewards/margins": 25.121463775634766, "rewards/rejected": -24.30002212524414, "step": 3140 }, { "epoch": 1.07, "learning_rate": 3.5729573209114946e-07, "logits/chosen": -2.860199213027954, "logits/rejected": -2.709285259246826, "logps/chosen": -196.29931640625, "logps/rejected": -604.6511840820312, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 1.8532356023788452, "rewards/margins": 23.939332962036133, "rewards/rejected": -22.086095809936523, "step": 3150 }, { "epoch": 1.07, "learning_rate": 3.5666624700994583e-07, "logits/chosen": -2.921654462814331, "logits/rejected": -2.6363933086395264, "logps/chosen": -201.8719482421875, "logps/rejected": -952.8933715820312, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.97003972530365, "rewards/margins": 29.07989501953125, "rewards/rejected": -27.1098575592041, "step": 3160 }, { "epoch": 1.08, "learning_rate": 3.5603676192874225e-07, "logits/chosen": -3.0016918182373047, "logits/rejected": -2.6835455894470215, "logps/chosen": -186.29013061523438, "logps/rejected": -495.48541259765625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 2.1252477169036865, "rewards/margins": 23.352680206298828, "rewards/rejected": -21.227432250976562, "step": 3170 }, { "epoch": 1.08, "learning_rate": 3.554072768475387e-07, "logits/chosen": -2.845195770263672, "logits/rejected": -2.7401249408721924, "logps/chosen": -302.9378356933594, "logps/rejected": -593.8045654296875, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 3.2608158588409424, "rewards/margins": 25.455018997192383, "rewards/rejected": -22.194202423095703, "step": 3180 }, { "epoch": 1.08, "learning_rate": 3.547777917663351e-07, "logits/chosen": -2.8650283813476562, "logits/rejected": -2.610814332962036, "logps/chosen": -237.9384765625, "logps/rejected": -743.420654296875, "loss": 0.0018, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5605041980743408, "rewards/margins": 27.747814178466797, "rewards/rejected": -26.187313079833984, "step": 3190 }, { "epoch": 1.09, "learning_rate": 3.5414830668513157e-07, "logits/chosen": -2.926809072494507, "logits/rejected": -2.7191388607025146, "logps/chosen": -180.11441040039062, "logps/rejected": -530.0303955078125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.9733213186264038, "rewards/margins": 26.15768051147461, "rewards/rejected": -24.18436050415039, "step": 3200 }, { "epoch": 1.09, "eval_logits/chosen": -2.9655685424804688, "eval_logits/rejected": -2.7375237941741943, "eval_logps/chosen": -245.53443908691406, "eval_logps/rejected": -651.8002319335938, "eval_loss": 0.009792977944016457, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.742207646369934, "eval_rewards/margins": 26.704700469970703, "eval_rewards/rejected": -24.962491989135742, "eval_runtime": 461.3287, "eval_samples_per_second": 20.593, "eval_steps_per_second": 0.644, "step": 3200 }, { "epoch": 1.09, "learning_rate": 3.53518821603928e-07, "logits/chosen": -2.8433148860931396, "logits/rejected": -2.7689921855926514, "logps/chosen": -190.5950469970703, "logps/rejected": -572.350341796875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.338184118270874, "rewards/margins": 28.97393226623535, "rewards/rejected": -27.6357479095459, "step": 3210 }, { "epoch": 1.09, "learning_rate": 3.528893365227244e-07, "logits/chosen": -2.7966361045837402, "logits/rejected": -2.698848247528076, "logps/chosen": -318.8022155761719, "logps/rejected": -605.4102783203125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 2.3153076171875, "rewards/margins": 28.706073760986328, "rewards/rejected": -26.39076805114746, "step": 3220 }, { "epoch": 1.1, "learning_rate": 3.5225985144152084e-07, "logits/chosen": -2.9813590049743652, "logits/rejected": -2.7447524070739746, "logps/chosen": -201.7269287109375, "logps/rejected": -637.5533447265625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.9241443872451782, "rewards/margins": 35.30531692504883, "rewards/rejected": -33.38117218017578, "step": 3230 }, { "epoch": 1.1, "learning_rate": 3.516303663603172e-07, "logits/chosen": -2.725893497467041, "logits/rejected": -2.6985909938812256, "logps/chosen": -254.16213989257812, "logps/rejected": -614.5772705078125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.2774721384048462, "rewards/margins": 27.315011978149414, "rewards/rejected": -26.037540435791016, "step": 3240 }, { "epoch": 1.1, "learning_rate": 3.5100088127911363e-07, "logits/chosen": -2.8185977935791016, "logits/rejected": -2.786975145339966, "logps/chosen": -291.4767761230469, "logps/rejected": -562.5369262695312, "loss": 0.004, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.700746774673462, "rewards/margins": 30.13591957092285, "rewards/rejected": -28.4351749420166, "step": 3250 }, { "epoch": 1.11, "learning_rate": 3.503713961979101e-07, "logits/chosen": -2.631239891052246, "logits/rejected": -2.7092223167419434, "logps/chosen": -233.8480682373047, "logps/rejected": -694.7057495117188, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 2.6297669410705566, "rewards/margins": 33.944358825683594, "rewards/rejected": -31.314590454101562, "step": 3260 }, { "epoch": 1.11, "learning_rate": 3.4974191111670653e-07, "logits/chosen": -2.850060224533081, "logits/rejected": -2.691786766052246, "logps/chosen": -261.58648681640625, "logps/rejected": -780.8245849609375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.7691078186035156, "rewards/margins": 25.794509887695312, "rewards/rejected": -24.025402069091797, "step": 3270 }, { "epoch": 1.11, "learning_rate": 3.4911242603550296e-07, "logits/chosen": -2.6630806922912598, "logits/rejected": -2.667738437652588, "logps/chosen": -223.91690063476562, "logps/rejected": -523.408447265625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 1.224603295326233, "rewards/margins": 21.877857208251953, "rewards/rejected": -20.65325355529785, "step": 3280 }, { "epoch": 1.12, "learning_rate": 3.484829409542994e-07, "logits/chosen": -2.866847515106201, "logits/rejected": -2.6986701488494873, "logps/chosen": -286.4620361328125, "logps/rejected": -617.9022216796875, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 2.1173300743103027, "rewards/margins": 27.009252548217773, "rewards/rejected": -24.89192008972168, "step": 3290 }, { "epoch": 1.12, "learning_rate": 3.478534558730958e-07, "logits/chosen": -2.8313660621643066, "logits/rejected": -2.785753011703491, "logps/chosen": -248.7388153076172, "logps/rejected": -565.0833740234375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.4821687936782837, "rewards/margins": 25.61962890625, "rewards/rejected": -24.1374568939209, "step": 3300 }, { "epoch": 1.12, "eval_logits/chosen": -2.995020627975464, "eval_logits/rejected": -2.762420654296875, "eval_logps/chosen": -249.07855224609375, "eval_logps/rejected": -649.5485229492188, "eval_loss": 0.015761887654662132, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.3877959251403809, "eval_rewards/margins": 26.125104904174805, "eval_rewards/rejected": -24.737306594848633, "eval_runtime": 461.5202, "eval_samples_per_second": 20.584, "eval_steps_per_second": 0.644, "step": 3300 }, { "epoch": 1.13, "learning_rate": 3.4722397079189217e-07, "logits/chosen": -2.979170083999634, "logits/rejected": -2.763411045074463, "logps/chosen": -216.3646697998047, "logps/rejected": -679.7388916015625, "loss": 0.0905, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.049783121794462204, "rewards/margins": 25.403564453125, "rewards/rejected": -25.453350067138672, "step": 3310 }, { "epoch": 1.13, "learning_rate": 3.4659448571068865e-07, "logits/chosen": -2.8861911296844482, "logits/rejected": -2.7602972984313965, "logps/chosen": -349.4170837402344, "logps/rejected": -700.7196044921875, "loss": 0.0171, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3524277210235596, "rewards/margins": 31.769351959228516, "rewards/rejected": -30.416919708251953, "step": 3320 }, { "epoch": 1.13, "learning_rate": 3.4596500062948507e-07, "logits/chosen": -2.915405511856079, "logits/rejected": -2.712636947631836, "logps/chosen": -282.6596984863281, "logps/rejected": -663.5471801757812, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.512639045715332, "rewards/margins": 27.87890625, "rewards/rejected": -26.36626625061035, "step": 3330 }, { "epoch": 1.14, "learning_rate": 3.453355155482815e-07, "logits/chosen": -2.8998916149139404, "logits/rejected": -2.7367100715637207, "logps/chosen": -251.6038818359375, "logps/rejected": -874.5947265625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.1314613819122314, "rewards/margins": 27.009368896484375, "rewards/rejected": -25.877910614013672, "step": 3340 }, { "epoch": 1.14, "learning_rate": 3.447060304670779e-07, "logits/chosen": -3.054274082183838, "logits/rejected": -2.720668315887451, "logps/chosen": -209.67465209960938, "logps/rejected": -698.5900268554688, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.2332206964492798, "rewards/margins": 33.24088668823242, "rewards/rejected": -32.007667541503906, "step": 3350 }, { "epoch": 1.14, "learning_rate": 3.4407654538587434e-07, "logits/chosen": -2.867328643798828, "logits/rejected": -2.7749171257019043, "logps/chosen": -277.6558837890625, "logps/rejected": -707.1676025390625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.44479140639305115, "rewards/margins": 25.03409767150879, "rewards/rejected": -24.589305877685547, "step": 3360 }, { "epoch": 1.15, "learning_rate": 3.4344706030467076e-07, "logits/chosen": -2.903066635131836, "logits/rejected": -2.7633419036865234, "logps/chosen": -206.77316284179688, "logps/rejected": -823.3929443359375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.4973962903022766, "rewards/margins": 28.876251220703125, "rewards/rejected": -28.378854751586914, "step": 3370 }, { "epoch": 1.15, "learning_rate": 3.4281757522346724e-07, "logits/chosen": -2.8428375720977783, "logits/rejected": -2.771228313446045, "logps/chosen": -316.61041259765625, "logps/rejected": -573.78271484375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.6287176012992859, "rewards/margins": 25.960758209228516, "rewards/rejected": -25.332040786743164, "step": 3380 }, { "epoch": 1.15, "learning_rate": 3.421880901422636e-07, "logits/chosen": -2.9360239505767822, "logits/rejected": -2.754889726638794, "logps/chosen": -261.99365234375, "logps/rejected": -786.5055541992188, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.06978748738765717, "rewards/margins": 28.102584838867188, "rewards/rejected": -28.172372817993164, "step": 3390 }, { "epoch": 1.16, "learning_rate": 3.4155860506106003e-07, "logits/chosen": -2.9817283153533936, "logits/rejected": -2.678464651107788, "logps/chosen": -274.711181640625, "logps/rejected": -714.3954467773438, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.4287026524543762, "rewards/margins": 29.36665916442871, "rewards/rejected": -28.937957763671875, "step": 3400 }, { "epoch": 1.16, "eval_logits/chosen": -3.0069329738616943, "eval_logits/rejected": -2.7869741916656494, "eval_logps/chosen": -253.5991668701172, "eval_logps/rejected": -676.2398071289062, "eval_loss": 0.013029967434704304, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.9357333779335022, "eval_rewards/margins": 28.342182159423828, "eval_rewards/rejected": -27.406450271606445, "eval_runtime": 461.4543, "eval_samples_per_second": 20.587, "eval_steps_per_second": 0.644, "step": 3400 }, { "epoch": 1.16, "learning_rate": 3.4092911997985645e-07, "logits/chosen": -2.877763032913208, "logits/rejected": -2.7161500453948975, "logps/chosen": -249.86453247070312, "logps/rejected": -686.5499877929688, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.40290361642837524, "rewards/margins": 27.63692283630371, "rewards/rejected": -27.234020233154297, "step": 3410 }, { "epoch": 1.16, "learning_rate": 3.402996348986529e-07, "logits/chosen": -2.8711249828338623, "logits/rejected": -2.735163688659668, "logps/chosen": -323.6860656738281, "logps/rejected": -726.78271484375, "loss": 0.0048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3661189675331116, "rewards/margins": 24.992210388183594, "rewards/rejected": -24.6260929107666, "step": 3420 }, { "epoch": 1.17, "learning_rate": 3.396701498174493e-07, "logits/chosen": -2.87437105178833, "logits/rejected": -2.7314188480377197, "logps/chosen": -276.09942626953125, "logps/rejected": -634.0580444335938, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.8610731959342957, "rewards/margins": 27.336559295654297, "rewards/rejected": -26.47548484802246, "step": 3430 }, { "epoch": 1.17, "learning_rate": 3.3904066473624577e-07, "logits/chosen": -2.9558238983154297, "logits/rejected": -2.7920570373535156, "logps/chosen": -194.19419860839844, "logps/rejected": -714.0545043945312, "loss": 0.0048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.30834153294563293, "rewards/margins": 27.63800621032715, "rewards/rejected": -27.329666137695312, "step": 3440 }, { "epoch": 1.17, "learning_rate": 3.384111796550422e-07, "logits/chosen": -2.9548964500427246, "logits/rejected": -2.7379510402679443, "logps/chosen": -210.5632781982422, "logps/rejected": -753.6837158203125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.2780233919620514, "rewards/margins": 28.258432388305664, "rewards/rejected": -27.98040771484375, "step": 3450 }, { "epoch": 1.18, "learning_rate": 3.377816945738386e-07, "logits/chosen": -2.9926254749298096, "logits/rejected": -2.74137544631958, "logps/chosen": -185.19064331054688, "logps/rejected": -593.9371337890625, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.12484810501337051, "rewards/margins": 26.58856201171875, "rewards/rejected": -26.463714599609375, "step": 3460 }, { "epoch": 1.18, "learning_rate": 3.37152209492635e-07, "logits/chosen": -2.8882956504821777, "logits/rejected": -2.824545383453369, "logps/chosen": -251.1592254638672, "logps/rejected": -724.17529296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.26794368028640747, "rewards/margins": 28.08188247680664, "rewards/rejected": -27.81393814086914, "step": 3470 }, { "epoch": 1.18, "learning_rate": 3.365227244114314e-07, "logits/chosen": -2.9974746704101562, "logits/rejected": -2.7161548137664795, "logps/chosen": -197.24435424804688, "logps/rejected": -602.18603515625, "loss": 0.0015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7581983804702759, "rewards/margins": 27.583328247070312, "rewards/rejected": -26.82512855529785, "step": 3480 }, { "epoch": 1.19, "learning_rate": 3.3589323933022783e-07, "logits/chosen": -2.836381435394287, "logits/rejected": -2.8309133052825928, "logps/chosen": -286.3896484375, "logps/rejected": -715.1646728515625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.015443995594978333, "rewards/margins": 31.670909881591797, "rewards/rejected": -31.655466079711914, "step": 3490 }, { "epoch": 1.19, "learning_rate": 3.3526375424902426e-07, "logits/chosen": -2.853415012359619, "logits/rejected": -2.865809440612793, "logps/chosen": -260.1250305175781, "logps/rejected": -630.5604858398438, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.5119583606719971, "rewards/margins": 27.370594024658203, "rewards/rejected": -26.8586368560791, "step": 3500 }, { "epoch": 1.19, "eval_logits/chosen": -3.0339839458465576, "eval_logits/rejected": -2.798781394958496, "eval_logps/chosen": -255.46060180664062, "eval_logps/rejected": -689.833251953125, "eval_loss": 0.010707746259868145, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.7495967149734497, "eval_rewards/margins": 29.515392303466797, "eval_rewards/rejected": -28.765796661376953, "eval_runtime": 460.9032, "eval_samples_per_second": 20.612, "eval_steps_per_second": 0.644, "step": 3500 }, { "epoch": 1.19, "learning_rate": 3.3463426916782073e-07, "logits/chosen": -2.9213147163391113, "logits/rejected": -2.746760845184326, "logps/chosen": -247.7586669921875, "logps/rejected": -581.4436645507812, "loss": 0.003, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.13430052995681763, "rewards/margins": 28.546361923217773, "rewards/rejected": -28.412063598632812, "step": 3510 }, { "epoch": 1.2, "learning_rate": 3.3400478408661716e-07, "logits/chosen": -2.8717570304870605, "logits/rejected": -2.797159194946289, "logps/chosen": -222.08053588867188, "logps/rejected": -516.3294067382812, "loss": 0.0018, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8055248260498047, "rewards/margins": 29.2651424407959, "rewards/rejected": -28.459619522094727, "step": 3520 }, { "epoch": 1.2, "learning_rate": 3.333752990054136e-07, "logits/chosen": -2.899606943130493, "logits/rejected": -2.684436798095703, "logps/chosen": -252.02645874023438, "logps/rejected": -801.7807006835938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.13138747215271, "rewards/margins": 32.30779266357422, "rewards/rejected": -31.176406860351562, "step": 3530 }, { "epoch": 1.2, "learning_rate": 3.3274581392420995e-07, "logits/chosen": -2.951033592224121, "logits/rejected": -2.764392614364624, "logps/chosen": -196.06312561035156, "logps/rejected": -570.6377563476562, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.6776518225669861, "rewards/margins": 30.66329574584961, "rewards/rejected": -29.985645294189453, "step": 3540 }, { "epoch": 1.21, "learning_rate": 3.3211632884300637e-07, "logits/chosen": -2.866495132446289, "logits/rejected": -2.8054893016815186, "logps/chosen": -297.3688659667969, "logps/rejected": -700.8709106445312, "loss": 0.0013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.883233904838562, "rewards/margins": 30.7996883392334, "rewards/rejected": -29.916454315185547, "step": 3550 }, { "epoch": 1.21, "learning_rate": 3.314868437618028e-07, "logits/chosen": -2.8782236576080322, "logits/rejected": -2.7927258014678955, "logps/chosen": -316.05499267578125, "logps/rejected": -658.6573486328125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.3507769107818604, "rewards/margins": 29.34478759765625, "rewards/rejected": -27.994009017944336, "step": 3560 }, { "epoch": 1.21, "learning_rate": 3.3085735868059927e-07, "logits/chosen": -2.871127128601074, "logits/rejected": -2.8401038646698, "logps/chosen": -367.17230224609375, "logps/rejected": -567.1422119140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.9685766696929932, "rewards/margins": 29.93875503540039, "rewards/rejected": -27.970184326171875, "step": 3570 }, { "epoch": 1.22, "learning_rate": 3.302278735993957e-07, "logits/chosen": -2.8781118392944336, "logits/rejected": -2.738436698913574, "logps/chosen": -332.22967529296875, "logps/rejected": -899.6937255859375, "loss": 0.0507, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.28324759006500244, "rewards/margins": 31.325862884521484, "rewards/rejected": -31.609111785888672, "step": 3580 }, { "epoch": 1.22, "learning_rate": 3.295983885181921e-07, "logits/chosen": -2.882572650909424, "logits/rejected": -2.6347718238830566, "logps/chosen": -390.6462707519531, "logps/rejected": -810.5206298828125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.6097986698150635, "rewards/margins": 29.81662940979004, "rewards/rejected": -28.206832885742188, "step": 3590 }, { "epoch": 1.22, "learning_rate": 3.2896890343698854e-07, "logits/chosen": -2.917947292327881, "logits/rejected": -2.708514928817749, "logps/chosen": -194.08358764648438, "logps/rejected": -658.7647705078125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.69496089220047, "rewards/margins": 29.213394165039062, "rewards/rejected": -28.518436431884766, "step": 3600 }, { "epoch": 1.22, "eval_logits/chosen": -2.973989725112915, "eval_logits/rejected": -2.7427051067352295, "eval_logps/chosen": -256.25372314453125, "eval_logps/rejected": -704.10400390625, "eval_loss": 0.00908406637609005, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.6702799797058105, "eval_rewards/margins": 30.863149642944336, "eval_rewards/rejected": -30.192869186401367, "eval_runtime": 459.9633, "eval_samples_per_second": 20.654, "eval_steps_per_second": 0.646, "step": 3600 }, { "epoch": 1.23, "learning_rate": 3.2833941835578496e-07, "logits/chosen": -2.83280348777771, "logits/rejected": -2.648158073425293, "logps/chosen": -223.338134765625, "logps/rejected": -795.1192626953125, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 0.5657335519790649, "rewards/margins": 34.37791442871094, "rewards/rejected": -33.81218719482422, "step": 3610 }, { "epoch": 1.23, "learning_rate": 3.2770993327458133e-07, "logits/chosen": -2.895251512527466, "logits/rejected": -2.7829792499542236, "logps/chosen": -264.54791259765625, "logps/rejected": -614.9221801757812, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.504339337348938, "rewards/margins": 34.65849685668945, "rewards/rejected": -34.154151916503906, "step": 3620 }, { "epoch": 1.23, "learning_rate": 3.270804481933778e-07, "logits/chosen": -2.9059786796569824, "logits/rejected": -2.7112174034118652, "logps/chosen": -197.21957397460938, "logps/rejected": -698.3447265625, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 1.4053034782409668, "rewards/margins": 28.148921966552734, "rewards/rejected": -26.74361801147461, "step": 3630 }, { "epoch": 1.24, "learning_rate": 3.2645096311217423e-07, "logits/chosen": -2.858887195587158, "logits/rejected": -2.7045111656188965, "logps/chosen": -260.25531005859375, "logps/rejected": -502.76983642578125, "loss": 0.0016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3793197870254517, "rewards/margins": 27.177597045898438, "rewards/rejected": -25.79827880859375, "step": 3640 }, { "epoch": 1.24, "learning_rate": 3.2582147803097065e-07, "logits/chosen": -2.9572010040283203, "logits/rejected": -2.6675450801849365, "logps/chosen": -228.85543823242188, "logps/rejected": -699.2418212890625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.4300611019134521, "rewards/margins": 30.530532836914062, "rewards/rejected": -29.100473403930664, "step": 3650 }, { "epoch": 1.24, "learning_rate": 3.251919929497671e-07, "logits/chosen": -2.8738491535186768, "logits/rejected": -2.7163283824920654, "logps/chosen": -283.8681640625, "logps/rejected": -587.699462890625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.9474754333496094, "rewards/margins": 28.724834442138672, "rewards/rejected": -27.777355194091797, "step": 3660 }, { "epoch": 1.25, "learning_rate": 3.245625078685635e-07, "logits/chosen": -2.8308136463165283, "logits/rejected": -2.7664363384246826, "logps/chosen": -345.8439025878906, "logps/rejected": -832.5578002929688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.7836852073669434, "rewards/margins": 30.036762237548828, "rewards/rejected": -29.25307846069336, "step": 3670 }, { "epoch": 1.25, "learning_rate": 3.239330227873599e-07, "logits/chosen": -2.877434730529785, "logits/rejected": -2.7226452827453613, "logps/chosen": -238.4366912841797, "logps/rejected": -618.0101928710938, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.7774938941001892, "rewards/margins": 30.218769073486328, "rewards/rejected": -29.441274642944336, "step": 3680 }, { "epoch": 1.25, "learning_rate": 3.233035377061564e-07, "logits/chosen": -2.8385889530181885, "logits/rejected": -2.6710593700408936, "logps/chosen": -259.98834228515625, "logps/rejected": -642.3009033203125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.1802050769329071, "rewards/margins": 28.62386703491211, "rewards/rejected": -28.8040714263916, "step": 3690 }, { "epoch": 1.26, "learning_rate": 3.2267405262495277e-07, "logits/chosen": -2.8176867961883545, "logits/rejected": -2.7674574851989746, "logps/chosen": -253.277587890625, "logps/rejected": -633.1143798828125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -0.27235105633735657, "rewards/margins": 29.82845687866211, "rewards/rejected": -30.10080909729004, "step": 3700 }, { "epoch": 1.26, "eval_logits/chosen": -2.964661121368408, "eval_logits/rejected": -2.734287977218628, "eval_logps/chosen": -257.2007751464844, "eval_logps/rejected": -684.675048828125, "eval_loss": 0.009789006784558296, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.5755741000175476, "eval_rewards/margins": 28.825544357299805, "eval_rewards/rejected": -28.249967575073242, "eval_runtime": 460.971, "eval_samples_per_second": 20.609, "eval_steps_per_second": 0.644, "step": 3700 }, { "epoch": 1.26, "learning_rate": 3.220445675437492e-07, "logits/chosen": -2.901756763458252, "logits/rejected": -2.6470272541046143, "logps/chosen": -219.7659454345703, "logps/rejected": -787.4397583007812, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 0.3163870573043823, "rewards/margins": 28.80877113342285, "rewards/rejected": -28.492382049560547, "step": 3710 }, { "epoch": 1.26, "learning_rate": 3.214150824625456e-07, "logits/chosen": -2.8629584312438965, "logits/rejected": -2.663905143737793, "logps/chosen": -208.0748748779297, "logps/rejected": -710.3724365234375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.20447878539562225, "rewards/margins": 27.714303970336914, "rewards/rejected": -27.50982666015625, "step": 3720 }, { "epoch": 1.27, "learning_rate": 3.2078559738134203e-07, "logits/chosen": -2.7836689949035645, "logits/rejected": -2.643033504486084, "logps/chosen": -312.95782470703125, "logps/rejected": -765.175537109375, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -0.7836293578147888, "rewards/margins": 30.157394409179688, "rewards/rejected": -30.941024780273438, "step": 3730 }, { "epoch": 1.27, "learning_rate": 3.2015611230013846e-07, "logits/chosen": -2.923621892929077, "logits/rejected": -2.727410078048706, "logps/chosen": -219.7535400390625, "logps/rejected": -836.8802490234375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.4070291519165039, "rewards/margins": 30.299518585205078, "rewards/rejected": -29.89249038696289, "step": 3740 }, { "epoch": 1.27, "learning_rate": 3.1952662721893493e-07, "logits/chosen": -2.913867473602295, "logits/rejected": -2.706606388092041, "logps/chosen": -193.8428192138672, "logps/rejected": -699.8363647460938, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.2069007158279419, "rewards/margins": 27.583209991455078, "rewards/rejected": -27.37630844116211, "step": 3750 }, { "epoch": 1.28, "learning_rate": 3.1889714213773135e-07, "logits/chosen": -2.8810362815856934, "logits/rejected": -2.720161199569702, "logps/chosen": -260.941162109375, "logps/rejected": -691.4710693359375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.3216225802898407, "rewards/margins": 28.467037200927734, "rewards/rejected": -28.14541244506836, "step": 3760 }, { "epoch": 1.28, "learning_rate": 3.182676570565277e-07, "logits/chosen": -2.8992319107055664, "logits/rejected": -2.7172107696533203, "logps/chosen": -206.8603515625, "logps/rejected": -761.8200073242188, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 0.6002095937728882, "rewards/margins": 38.535377502441406, "rewards/rejected": -37.93517303466797, "step": 3770 }, { "epoch": 1.28, "learning_rate": 3.1763817197532415e-07, "logits/chosen": -2.7893216609954834, "logits/rejected": -2.6856822967529297, "logps/chosen": -260.65985107421875, "logps/rejected": -786.3257446289062, "loss": 0.0084, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.004128456115722656, "rewards/margins": 44.904945373535156, "rewards/rejected": -44.900821685791016, "step": 3780 }, { "epoch": 1.29, "learning_rate": 3.1700868689412057e-07, "logits/chosen": -2.8233675956726074, "logits/rejected": -2.69585919380188, "logps/chosen": -336.022216796875, "logps/rejected": -889.0552978515625, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8989012837409973, "rewards/margins": 43.08487319946289, "rewards/rejected": -42.185970306396484, "step": 3790 }, { "epoch": 1.29, "learning_rate": 3.16379201812917e-07, "logits/chosen": -2.8796184062957764, "logits/rejected": -2.724398136138916, "logps/chosen": -192.54225158691406, "logps/rejected": -812.7857666015625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.49641457200050354, "rewards/margins": 32.80237579345703, "rewards/rejected": -32.30595779418945, "step": 3800 }, { "epoch": 1.29, "eval_logits/chosen": -2.947807788848877, "eval_logits/rejected": -2.718738317489624, "eval_logps/chosen": -263.48541259765625, "eval_logps/rejected": -732.2225952148438, "eval_loss": 0.006836127024143934, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.052887748926877975, "eval_rewards/margins": 32.951839447021484, "eval_rewards/rejected": -33.00472640991211, "eval_runtime": 460.4257, "eval_samples_per_second": 20.633, "eval_steps_per_second": 0.645, "step": 3800 }, { "epoch": 1.3, "learning_rate": 3.1574971673171347e-07, "logits/chosen": -2.781703472137451, "logits/rejected": -2.7286877632141113, "logps/chosen": -324.5968322753906, "logps/rejected": -634.8831787109375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.7346505522727966, "rewards/margins": 34.57854461669922, "rewards/rejected": -35.31319808959961, "step": 3810 }, { "epoch": 1.3, "learning_rate": 3.151202316505099e-07, "logits/chosen": -2.9526524543762207, "logits/rejected": -2.704850673675537, "logps/chosen": -228.02053833007812, "logps/rejected": -845.83056640625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.07456684857606888, "rewards/margins": 37.140380859375, "rewards/rejected": -37.21494674682617, "step": 3820 }, { "epoch": 1.3, "learning_rate": 3.144907465693063e-07, "logits/chosen": -2.773261547088623, "logits/rejected": -2.5027103424072266, "logps/chosen": -259.6689453125, "logps/rejected": -702.3695678710938, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.7093507051467896, "rewards/margins": 29.48362159729004, "rewards/rejected": -30.192974090576172, "step": 3830 }, { "epoch": 1.31, "learning_rate": 3.1386126148810274e-07, "logits/chosen": -2.883657217025757, "logits/rejected": -2.753113269805908, "logps/chosen": -351.4837341308594, "logps/rejected": -730.5638427734375, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.2683006227016449, "rewards/margins": 38.08830261230469, "rewards/rejected": -38.356605529785156, "step": 3840 }, { "epoch": 1.31, "learning_rate": 3.132317764068991e-07, "logits/chosen": -2.8615670204162598, "logits/rejected": -2.6625609397888184, "logps/chosen": -221.5238037109375, "logps/rejected": -828.9417114257812, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -0.026356136426329613, "rewards/margins": 31.565547943115234, "rewards/rejected": -31.591903686523438, "step": 3850 }, { "epoch": 1.31, "learning_rate": 3.1260229132569553e-07, "logits/chosen": -2.731123447418213, "logits/rejected": -2.716470718383789, "logps/chosen": -314.4375915527344, "logps/rejected": -617.2342529296875, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 0.17142614722251892, "rewards/margins": 34.80061721801758, "rewards/rejected": -34.62918472290039, "step": 3860 }, { "epoch": 1.32, "learning_rate": 3.11972806244492e-07, "logits/chosen": -2.736818313598633, "logits/rejected": -2.755732774734497, "logps/chosen": -262.03546142578125, "logps/rejected": -706.3619384765625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.19321072101593018, "rewards/margins": 30.112655639648438, "rewards/rejected": -29.919443130493164, "step": 3870 }, { "epoch": 1.32, "learning_rate": 3.1134332116328843e-07, "logits/chosen": -2.8789989948272705, "logits/rejected": -2.7425944805145264, "logps/chosen": -253.03945922851562, "logps/rejected": -820.9622802734375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.2953759729862213, "rewards/margins": 34.52014923095703, "rewards/rejected": -34.224769592285156, "step": 3880 }, { "epoch": 1.32, "learning_rate": 3.1071383608208485e-07, "logits/chosen": -2.7741522789001465, "logits/rejected": -2.7170252799987793, "logps/chosen": -299.11627197265625, "logps/rejected": -580.8271484375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.41178783774375916, "rewards/margins": 34.19072723388672, "rewards/rejected": -34.602516174316406, "step": 3890 }, { "epoch": 1.33, "learning_rate": 3.1008435100088127e-07, "logits/chosen": -2.877669334411621, "logits/rejected": -2.6832282543182373, "logps/chosen": -221.47933959960938, "logps/rejected": -961.53857421875, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.34339937567710876, "rewards/margins": 38.49998092651367, "rewards/rejected": -38.156578063964844, "step": 3900 }, { "epoch": 1.33, "eval_logits/chosen": -2.95457124710083, "eval_logits/rejected": -2.730085611343384, "eval_logps/chosen": -260.6388854980469, "eval_logps/rejected": -717.798095703125, "eval_loss": 0.0061119659803807735, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.2317633330821991, "eval_rewards/margins": 31.79404067993164, "eval_rewards/rejected": -31.56227684020996, "eval_runtime": 461.0946, "eval_samples_per_second": 20.603, "eval_steps_per_second": 0.644, "step": 3900 }, { "epoch": 1.33, "learning_rate": 3.094548659196777e-07, "logits/chosen": -2.7719740867614746, "logits/rejected": -2.7072925567626953, "logps/chosen": -388.2748718261719, "logps/rejected": -848.0862426757812, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.5868638157844543, "rewards/margins": 30.737018585205078, "rewards/rejected": -30.1501522064209, "step": 3910 }, { "epoch": 1.33, "learning_rate": 3.0882538083847407e-07, "logits/chosen": -2.838963508605957, "logits/rejected": -2.6601414680480957, "logps/chosen": -279.97576904296875, "logps/rejected": -615.7132568359375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.5268959403038025, "rewards/margins": 30.111352920532227, "rewards/rejected": -29.584453582763672, "step": 3920 }, { "epoch": 1.34, "learning_rate": 3.0819589575727054e-07, "logits/chosen": -2.6373291015625, "logits/rejected": -2.611196279525757, "logps/chosen": -415.51446533203125, "logps/rejected": -560.3062744140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.7828716039657593, "rewards/margins": 31.1794376373291, "rewards/rejected": -30.396564483642578, "step": 3930 }, { "epoch": 1.34, "learning_rate": 3.0756641067606696e-07, "logits/chosen": -2.844838857650757, "logits/rejected": -2.65082049369812, "logps/chosen": -223.60653686523438, "logps/rejected": -638.375244140625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.19149819016456604, "rewards/margins": 28.128108978271484, "rewards/rejected": -28.31960678100586, "step": 3940 }, { "epoch": 1.34, "learning_rate": 3.069369255948634e-07, "logits/chosen": -2.847378969192505, "logits/rejected": -2.7069625854492188, "logps/chosen": -218.39431762695312, "logps/rejected": -833.7306518554688, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.127278670668602, "rewards/margins": 39.469993591308594, "rewards/rejected": -39.342716217041016, "step": 3950 }, { "epoch": 1.35, "learning_rate": 3.063074405136598e-07, "logits/chosen": -2.8059515953063965, "logits/rejected": -2.696211814880371, "logps/chosen": -227.1031951904297, "logps/rejected": -714.4603271484375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.3242960572242737, "rewards/margins": 35.73683547973633, "rewards/rejected": -35.41254425048828, "step": 3960 }, { "epoch": 1.35, "learning_rate": 3.0567795543245623e-07, "logits/chosen": -2.894101619720459, "logits/rejected": -2.685908794403076, "logps/chosen": -268.7279357910156, "logps/rejected": -784.7110595703125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.25763624906539917, "rewards/margins": 31.990589141845703, "rewards/rejected": -31.732952117919922, "step": 3970 }, { "epoch": 1.35, "learning_rate": 3.0504847035125266e-07, "logits/chosen": -2.7788329124450684, "logits/rejected": -2.677741527557373, "logps/chosen": -287.3855895996094, "logps/rejected": -698.7899169921875, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 0.9362618327140808, "rewards/margins": 31.670995712280273, "rewards/rejected": -30.7347354888916, "step": 3980 }, { "epoch": 1.36, "learning_rate": 3.0441898527004913e-07, "logits/chosen": -2.846498966217041, "logits/rejected": -2.708000659942627, "logps/chosen": -208.5373992919922, "logps/rejected": -688.662841796875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.3066147565841675, "rewards/margins": 32.57853698730469, "rewards/rejected": -32.88515090942383, "step": 3990 }, { "epoch": 1.36, "learning_rate": 3.037895001888455e-07, "logits/chosen": -2.9328017234802246, "logits/rejected": -2.6987147331237793, "logps/chosen": -272.616455078125, "logps/rejected": -691.2911987304688, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.07908304035663605, "rewards/margins": 31.47201156616211, "rewards/rejected": -31.551097869873047, "step": 4000 }, { "epoch": 1.36, "eval_logits/chosen": -2.924492120742798, "eval_logits/rejected": -2.7023532390594482, "eval_logps/chosen": -267.7919921875, "eval_logps/rejected": -745.9522094726562, "eval_loss": 0.006148109212517738, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.4835454821586609, "eval_rewards/margins": 33.894142150878906, "eval_rewards/rejected": -34.377685546875, "eval_runtime": 461.3311, "eval_samples_per_second": 20.593, "eval_steps_per_second": 0.644, "step": 4000 }, { "epoch": 1.36, "learning_rate": 3.031600151076419e-07, "logits/chosen": -2.846287727355957, "logits/rejected": -2.6879289150238037, "logps/chosen": -222.98556518554688, "logps/rejected": -779.0908203125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.7848774790763855, "rewards/margins": 35.604637145996094, "rewards/rejected": -36.38951873779297, "step": 4010 }, { "epoch": 1.37, "learning_rate": 3.0253053002643835e-07, "logits/chosen": -2.8661882877349854, "logits/rejected": -2.7454569339752197, "logps/chosen": -226.4896240234375, "logps/rejected": -829.1182861328125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.7889223694801331, "rewards/margins": 41.409645080566406, "rewards/rejected": -42.198570251464844, "step": 4020 }, { "epoch": 1.37, "learning_rate": 3.0190104494523477e-07, "logits/chosen": -2.7407419681549072, "logits/rejected": -2.6453170776367188, "logps/chosen": -240.0795440673828, "logps/rejected": -997.6741333007812, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.7889522314071655, "rewards/margins": 43.315879821777344, "rewards/rejected": -45.104835510253906, "step": 4030 }, { "epoch": 1.37, "learning_rate": 3.012715598640312e-07, "logits/chosen": -2.64636492729187, "logits/rejected": -2.5987484455108643, "logps/chosen": -335.8018798828125, "logps/rejected": -793.2855834960938, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 0.9875871539115906, "rewards/margins": 40.53483581542969, "rewards/rejected": -39.5472526550293, "step": 4040 }, { "epoch": 1.38, "learning_rate": 3.0064207478282767e-07, "logits/chosen": -2.6760735511779785, "logits/rejected": -2.690502166748047, "logps/chosen": -319.78411865234375, "logps/rejected": -735.3406982421875, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.45544975996017456, "rewards/margins": 33.41617202758789, "rewards/rejected": -32.960723876953125, "step": 4050 }, { "epoch": 1.38, "learning_rate": 3.000125897016241e-07, "logits/chosen": -2.7757976055145264, "logits/rejected": -2.578579902648926, "logps/chosen": -237.58590698242188, "logps/rejected": -697.9693603515625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.6293140649795532, "rewards/margins": 29.30428695678711, "rewards/rejected": -29.9335994720459, "step": 4060 }, { "epoch": 1.38, "learning_rate": 2.993831046204205e-07, "logits/chosen": -2.695049285888672, "logits/rejected": -2.5840914249420166, "logps/chosen": -350.73974609375, "logps/rejected": -617.5267944335938, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.41110366582870483, "rewards/margins": 30.19058609008789, "rewards/rejected": -29.77947998046875, "step": 4070 }, { "epoch": 1.39, "learning_rate": 2.987536195392169e-07, "logits/chosen": -2.704336643218994, "logits/rejected": -2.6175036430358887, "logps/chosen": -281.12835693359375, "logps/rejected": -718.6896362304688, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.36629363894462585, "rewards/margins": 39.50093460083008, "rewards/rejected": -39.134639739990234, "step": 4080 }, { "epoch": 1.39, "learning_rate": 2.981241344580133e-07, "logits/chosen": -2.689499616622925, "logits/rejected": -2.6604161262512207, "logps/chosen": -273.2850646972656, "logps/rejected": -663.3641967773438, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 0.5533127188682556, "rewards/margins": 31.76581382751465, "rewards/rejected": -31.21250343322754, "step": 4090 }, { "epoch": 1.39, "learning_rate": 2.9749464937680973e-07, "logits/chosen": -2.7489993572235107, "logits/rejected": -2.508171558380127, "logps/chosen": -222.8814239501953, "logps/rejected": -502.1905212402344, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.9515475034713745, "rewards/margins": 31.5323543548584, "rewards/rejected": -30.580806732177734, "step": 4100 }, { "epoch": 1.39, "eval_logits/chosen": -2.8669464588165283, "eval_logits/rejected": -2.6585676670074463, "eval_logps/chosen": -258.8861999511719, "eval_logps/rejected": -712.25439453125, "eval_loss": 0.004780417308211327, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.40703296661376953, "eval_rewards/margins": 31.41494369506836, "eval_rewards/rejected": -31.00790786743164, "eval_runtime": 462.2629, "eval_samples_per_second": 20.551, "eval_steps_per_second": 0.642, "step": 4100 }, { "epoch": 1.4, "learning_rate": 2.968651642956062e-07, "logits/chosen": -2.7797963619232178, "logits/rejected": -2.42704439163208, "logps/chosen": -208.4052734375, "logps/rejected": -680.1314697265625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.27941128611564636, "rewards/margins": 29.355850219726562, "rewards/rejected": -29.635263442993164, "step": 4110 }, { "epoch": 1.4, "learning_rate": 2.9623567921440263e-07, "logits/chosen": -2.7294259071350098, "logits/rejected": -2.4310061931610107, "logps/chosen": -218.0826416015625, "logps/rejected": -571.7494506835938, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.9731583595275879, "rewards/margins": 26.150989532470703, "rewards/rejected": -27.124149322509766, "step": 4120 }, { "epoch": 1.4, "learning_rate": 2.9560619413319905e-07, "logits/chosen": -2.5674891471862793, "logits/rejected": -2.6014277935028076, "logps/chosen": -427.23455810546875, "logps/rejected": -846.3826904296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.49421629309654236, "rewards/margins": 32.15135192871094, "rewards/rejected": -32.64556884765625, "step": 4130 }, { "epoch": 1.41, "learning_rate": 2.9497670905199547e-07, "logits/chosen": -2.569112777709961, "logits/rejected": -2.5538196563720703, "logps/chosen": -399.3161926269531, "logps/rejected": -723.154296875, "loss": 0.0146, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2191426753997803, "rewards/margins": 31.2829532623291, "rewards/rejected": -30.063812255859375, "step": 4140 }, { "epoch": 1.41, "learning_rate": 2.9434722397079184e-07, "logits/chosen": -2.6196889877319336, "logits/rejected": -2.6018166542053223, "logps/chosen": -302.16473388671875, "logps/rejected": -834.4959106445312, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -0.08485205471515656, "rewards/margins": 26.548824310302734, "rewards/rejected": -26.633676528930664, "step": 4150 }, { "epoch": 1.41, "learning_rate": 2.9371773888958827e-07, "logits/chosen": -2.693727970123291, "logits/rejected": -2.5961389541625977, "logps/chosen": -223.1240997314453, "logps/rejected": -636.4374389648438, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.399338036775589, "rewards/margins": 26.784439086914062, "rewards/rejected": -26.38510513305664, "step": 4160 }, { "epoch": 1.42, "learning_rate": 2.9308825380838474e-07, "logits/chosen": -2.655277967453003, "logits/rejected": -2.5512490272521973, "logps/chosen": -277.2023010253906, "logps/rejected": -599.8840942382812, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.5765337944030762, "rewards/margins": 28.96539306640625, "rewards/rejected": -28.388858795166016, "step": 4170 }, { "epoch": 1.42, "learning_rate": 2.9245876872718116e-07, "logits/chosen": -2.5889811515808105, "logits/rejected": -2.558422565460205, "logps/chosen": -263.55316162109375, "logps/rejected": -647.8112182617188, "loss": 0.0547, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.06452493369579315, "rewards/margins": 26.051403045654297, "rewards/rejected": -25.986881256103516, "step": 4180 }, { "epoch": 1.42, "learning_rate": 2.918292836459776e-07, "logits/chosen": -2.6662046909332275, "logits/rejected": -2.5493884086608887, "logps/chosen": -231.3740997314453, "logps/rejected": -864.99658203125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.2850487232208252, "rewards/margins": 27.493453979492188, "rewards/rejected": -26.208404541015625, "step": 4190 }, { "epoch": 1.43, "learning_rate": 2.91199798564774e-07, "logits/chosen": -2.783965826034546, "logits/rejected": -2.617244243621826, "logps/chosen": -197.84547424316406, "logps/rejected": -642.6910400390625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.13341796398162842, "rewards/margins": 27.039730072021484, "rewards/rejected": -26.906314849853516, "step": 4200 }, { "epoch": 1.43, "eval_logits/chosen": -2.8776144981384277, "eval_logits/rejected": -2.657221555709839, "eval_logps/chosen": -260.6947021484375, "eval_logps/rejected": -678.6433715820312, "eval_loss": 0.006725949700921774, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.22618162631988525, "eval_rewards/margins": 27.872982025146484, "eval_rewards/rejected": -27.64679718017578, "eval_runtime": 462.1485, "eval_samples_per_second": 20.556, "eval_steps_per_second": 0.643, "step": 4200 }, { "epoch": 1.43, "learning_rate": 2.9057031348357043e-07, "logits/chosen": -2.8531885147094727, "logits/rejected": -2.646796464920044, "logps/chosen": -216.73110961914062, "logps/rejected": -809.6451416015625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.2372235804796219, "rewards/margins": 28.96242332458496, "rewards/rejected": -28.725200653076172, "step": 4210 }, { "epoch": 1.43, "learning_rate": 2.8994082840236686e-07, "logits/chosen": -2.6983377933502197, "logits/rejected": -2.6060147285461426, "logps/chosen": -320.4348449707031, "logps/rejected": -748.2528076171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.3141578435897827, "rewards/margins": 29.0186710357666, "rewards/rejected": -28.704509735107422, "step": 4220 }, { "epoch": 1.44, "learning_rate": 2.893113433211632e-07, "logits/chosen": -2.766878128051758, "logits/rejected": -2.6351218223571777, "logps/chosen": -225.78402709960938, "logps/rejected": -552.7686767578125, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 0.6258989572525024, "rewards/margins": 24.461589813232422, "rewards/rejected": -23.835691452026367, "step": 4230 }, { "epoch": 1.44, "learning_rate": 2.886818582399597e-07, "logits/chosen": -2.5963807106018066, "logits/rejected": -2.598022222518921, "logps/chosen": -261.10614013671875, "logps/rejected": -529.913330078125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.30877813696861267, "rewards/margins": 24.679567337036133, "rewards/rejected": -24.370792388916016, "step": 4240 }, { "epoch": 1.44, "learning_rate": 2.880523731587561e-07, "logits/chosen": -2.6120171546936035, "logits/rejected": -2.552940607070923, "logps/chosen": -358.513427734375, "logps/rejected": -976.51513671875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.3600661754608154, "rewards/margins": 26.413166046142578, "rewards/rejected": -25.0531005859375, "step": 4250 }, { "epoch": 1.45, "learning_rate": 2.8742288807755255e-07, "logits/chosen": -2.7398133277893066, "logits/rejected": -2.5171186923980713, "logps/chosen": -258.6328125, "logps/rejected": -803.0509643554688, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 0.8711425065994263, "rewards/margins": 27.679407119750977, "rewards/rejected": -26.808263778686523, "step": 4260 }, { "epoch": 1.45, "learning_rate": 2.8679340299634897e-07, "logits/chosen": -2.8204236030578613, "logits/rejected": -2.6427371501922607, "logps/chosen": -197.3133544921875, "logps/rejected": -988.5374145507812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.0457288026809692, "rewards/margins": 30.457317352294922, "rewards/rejected": -29.411590576171875, "step": 4270 }, { "epoch": 1.45, "learning_rate": 2.861639179151454e-07, "logits/chosen": -2.763899087905884, "logits/rejected": -2.61624813079834, "logps/chosen": -230.610107421875, "logps/rejected": -777.4322509765625, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -0.031659964472055435, "rewards/margins": 33.47124481201172, "rewards/rejected": -33.50291061401367, "step": 4280 }, { "epoch": 1.46, "learning_rate": 2.855344328339418e-07, "logits/chosen": -2.7443761825561523, "logits/rejected": -2.6463332176208496, "logps/chosen": -205.0055694580078, "logps/rejected": -781.7361450195312, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.7390271425247192, "rewards/margins": 36.282955169677734, "rewards/rejected": -35.54393005371094, "step": 4290 }, { "epoch": 1.46, "learning_rate": 2.849049477527383e-07, "logits/chosen": -2.7578072547912598, "logits/rejected": -2.603610038757324, "logps/chosen": -215.01846313476562, "logps/rejected": -579.824462890625, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 0.686456561088562, "rewards/margins": 33.57085418701172, "rewards/rejected": -32.884395599365234, "step": 4300 }, { "epoch": 1.46, "eval_logits/chosen": -2.8567276000976562, "eval_logits/rejected": -2.66465425491333, "eval_logps/chosen": -261.52557373046875, "eval_logps/rejected": -785.3499755859375, "eval_loss": 0.008891169913113117, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.1430947631597519, "eval_rewards/margins": 38.46055221557617, "eval_rewards/rejected": -38.31745910644531, "eval_runtime": 461.4055, "eval_samples_per_second": 20.589, "eval_steps_per_second": 0.644, "step": 4300 }, { "epoch": 1.46, "learning_rate": 2.8427546267153466e-07, "logits/chosen": -2.766993522644043, "logits/rejected": -2.662322759628296, "logps/chosen": -228.8236541748047, "logps/rejected": -874.7486572265625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.37523913383483887, "rewards/margins": 38.80022430419922, "rewards/rejected": -39.17546463012695, "step": 4310 }, { "epoch": 1.47, "learning_rate": 2.836459775903311e-07, "logits/chosen": -2.7486112117767334, "logits/rejected": -2.6629908084869385, "logps/chosen": -309.25933837890625, "logps/rejected": -663.23828125, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 0.5993863344192505, "rewards/margins": 35.58545684814453, "rewards/rejected": -34.98606872558594, "step": 4320 }, { "epoch": 1.47, "learning_rate": 2.830164925091275e-07, "logits/chosen": -2.7881641387939453, "logits/rejected": -2.664046049118042, "logps/chosen": -212.25112915039062, "logps/rejected": -694.9156494140625, "loss": 0.0402, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19427692890167236, "rewards/margins": 34.012939453125, "rewards/rejected": -33.818660736083984, "step": 4330 }, { "epoch": 1.48, "learning_rate": 2.8238700742792393e-07, "logits/chosen": -2.7717843055725098, "logits/rejected": -2.689058780670166, "logps/chosen": -250.7357940673828, "logps/rejected": -527.68603515625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.293671727180481, "rewards/margins": 29.186044692993164, "rewards/rejected": -27.892370223999023, "step": 4340 }, { "epoch": 1.48, "learning_rate": 2.8175752234672035e-07, "logits/chosen": -2.640716075897217, "logits/rejected": -2.602928638458252, "logps/chosen": -302.7138671875, "logps/rejected": -623.222900390625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.7052724361419678, "rewards/margins": 28.394643783569336, "rewards/rejected": -26.68937110900879, "step": 4350 }, { "epoch": 1.48, "learning_rate": 2.8112803726551683e-07, "logits/chosen": -2.6802711486816406, "logits/rejected": -2.6362807750701904, "logps/chosen": -327.9851379394531, "logps/rejected": -448.47637939453125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.210737943649292, "rewards/margins": 23.303722381591797, "rewards/rejected": -22.092985153198242, "step": 4360 }, { "epoch": 1.49, "learning_rate": 2.8049855218431325e-07, "logits/chosen": -2.7490339279174805, "logits/rejected": -2.5496137142181396, "logps/chosen": -213.89688110351562, "logps/rejected": -563.30126953125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.0842018127441406, "rewards/margins": 25.106163024902344, "rewards/rejected": -24.021961212158203, "step": 4370 }, { "epoch": 1.49, "learning_rate": 2.7986906710310967e-07, "logits/chosen": -2.6895484924316406, "logits/rejected": -2.5920329093933105, "logps/chosen": -263.38836669921875, "logps/rejected": -952.5206909179688, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.3598905801773071, "rewards/margins": 32.234352111816406, "rewards/rejected": -30.874460220336914, "step": 4380 }, { "epoch": 1.49, "learning_rate": 2.7923958202190604e-07, "logits/chosen": -2.7882306575775146, "logits/rejected": -2.5809566974639893, "logps/chosen": -190.88221740722656, "logps/rejected": -931.6907348632812, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.6649356484413147, "rewards/margins": 30.795312881469727, "rewards/rejected": -30.130374908447266, "step": 4390 }, { "epoch": 1.5, "learning_rate": 2.7861009694070247e-07, "logits/chosen": -2.708773612976074, "logits/rejected": -2.6548779010772705, "logps/chosen": -284.2604675292969, "logps/rejected": -558.4825439453125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.9561972618103027, "rewards/margins": 30.171533584594727, "rewards/rejected": -29.215335845947266, "step": 4400 }, { "epoch": 1.5, "eval_logits/chosen": -2.875624895095825, "eval_logits/rejected": -2.674558401107788, "eval_logps/chosen": -251.38954162597656, "eval_logps/rejected": -684.098876953125, "eval_loss": 0.0052030328661203384, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.156697392463684, "eval_rewards/margins": 29.34904670715332, "eval_rewards/rejected": -28.192350387573242, "eval_runtime": 461.1744, "eval_samples_per_second": 20.6, "eval_steps_per_second": 0.644, "step": 4400 }, { "epoch": 1.5, "learning_rate": 2.779806118594989e-07, "logits/chosen": -2.7441837787628174, "logits/rejected": -2.5583243370056152, "logps/chosen": -266.9261474609375, "logps/rejected": -775.1690673828125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.7644990682601929, "rewards/margins": 30.6723575592041, "rewards/rejected": -28.907861709594727, "step": 4410 }, { "epoch": 1.5, "learning_rate": 2.7735112677829536e-07, "logits/chosen": -2.872344970703125, "logits/rejected": -2.5706145763397217, "logps/chosen": -196.55636596679688, "logps/rejected": -646.5297241210938, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": 1.0086524486541748, "rewards/margins": 27.684524536132812, "rewards/rejected": -26.675872802734375, "step": 4420 }, { "epoch": 1.51, "learning_rate": 2.767216416970918e-07, "logits/chosen": -2.755558729171753, "logits/rejected": -2.6231961250305176, "logps/chosen": -191.6370849609375, "logps/rejected": -539.1219482421875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.5513501167297363, "rewards/margins": 28.45609474182129, "rewards/rejected": -26.904743194580078, "step": 4430 }, { "epoch": 1.51, "learning_rate": 2.760921566158882e-07, "logits/chosen": -2.8150930404663086, "logits/rejected": -2.580435276031494, "logps/chosen": -242.98745727539062, "logps/rejected": -689.3055419921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.9172603487968445, "rewards/margins": 29.14654541015625, "rewards/rejected": -28.22928237915039, "step": 4440 }, { "epoch": 1.51, "learning_rate": 2.7546267153468463e-07, "logits/chosen": -2.581252336502075, "logits/rejected": -2.544241428375244, "logps/chosen": -402.9229736328125, "logps/rejected": -623.0001831054688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.5738767385482788, "rewards/margins": 27.09048843383789, "rewards/rejected": -25.51660919189453, "step": 4450 }, { "epoch": 1.52, "learning_rate": 2.74833186453481e-07, "logits/chosen": -2.643822431564331, "logits/rejected": -2.6076011657714844, "logps/chosen": -317.9627990722656, "logps/rejected": -690.5311889648438, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.6298885345458984, "rewards/margins": 29.454818725585938, "rewards/rejected": -27.82493019104004, "step": 4460 }, { "epoch": 1.52, "learning_rate": 2.742037013722774e-07, "logits/chosen": -2.7147934436798096, "logits/rejected": -2.5830719470977783, "logps/chosen": -264.97760009765625, "logps/rejected": -709.0381469726562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.555571973323822, "rewards/margins": 30.121829986572266, "rewards/rejected": -29.56625747680664, "step": 4470 }, { "epoch": 1.52, "learning_rate": 2.735742162910739e-07, "logits/chosen": -2.697695255279541, "logits/rejected": -2.637030839920044, "logps/chosen": -293.4368896484375, "logps/rejected": -520.5118408203125, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.654098391532898, "rewards/margins": 26.076345443725586, "rewards/rejected": -25.422245025634766, "step": 4480 }, { "epoch": 1.53, "learning_rate": 2.729447312098703e-07, "logits/chosen": -2.7270455360412598, "logits/rejected": -2.6716148853302, "logps/chosen": -358.12432861328125, "logps/rejected": -755.7617797851562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.3034100532531738, "rewards/margins": 29.965600967407227, "rewards/rejected": -28.662189483642578, "step": 4490 }, { "epoch": 1.53, "learning_rate": 2.7231524612866675e-07, "logits/chosen": -2.7699177265167236, "logits/rejected": -2.6394355297088623, "logps/chosen": -204.4804229736328, "logps/rejected": -739.7496337890625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.07344651222229, "rewards/margins": 29.472265243530273, "rewards/rejected": -28.398818969726562, "step": 4500 }, { "epoch": 1.53, "eval_logits/chosen": -2.839737892150879, "eval_logits/rejected": -2.6321487426757812, "eval_logps/chosen": -253.41726684570312, "eval_logps/rejected": -686.9837646484375, "eval_loss": 0.004785547498613596, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.9539247155189514, "eval_rewards/margins": 29.434757232666016, "eval_rewards/rejected": -28.480833053588867, "eval_runtime": 462.239, "eval_samples_per_second": 20.552, "eval_steps_per_second": 0.643, "step": 4500 }, { "epoch": 1.53, "learning_rate": 2.7168576104746317e-07, "logits/chosen": -2.7701313495635986, "logits/rejected": -2.6958632469177246, "logps/chosen": -220.55020141601562, "logps/rejected": -748.0827026367188, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.4515727758407593, "rewards/margins": 31.12331771850586, "rewards/rejected": -29.6717472076416, "step": 4510 }, { "epoch": 1.54, "learning_rate": 2.710562759662596e-07, "logits/chosen": -2.7820420265197754, "logits/rejected": -2.5065484046936035, "logps/chosen": -259.4388122558594, "logps/rejected": -679.7838134765625, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.926753282546997, "rewards/margins": 26.65243911743164, "rewards/rejected": -24.725683212280273, "step": 4520 }, { "epoch": 1.54, "learning_rate": 2.70426790885056e-07, "logits/chosen": -2.694397449493408, "logits/rejected": -2.5708096027374268, "logps/chosen": -240.5037384033203, "logps/rejected": -844.8701171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.654218077659607, "rewards/margins": 31.156539916992188, "rewards/rejected": -29.5023193359375, "step": 4530 }, { "epoch": 1.54, "learning_rate": 2.6979730580385244e-07, "logits/chosen": -2.677518367767334, "logits/rejected": -2.6636338233947754, "logps/chosen": -316.16497802734375, "logps/rejected": -684.26220703125, "loss": 0.0049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3141165971755981, "rewards/margins": 25.035091400146484, "rewards/rejected": -23.720975875854492, "step": 4540 }, { "epoch": 1.55, "learning_rate": 2.6916782072264886e-07, "logits/chosen": -2.8328967094421387, "logits/rejected": -2.6569197177886963, "logps/chosen": -250.9690704345703, "logps/rejected": -543.0106811523438, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 1.2351460456848145, "rewards/margins": 24.298084259033203, "rewards/rejected": -23.06293487548828, "step": 4550 }, { "epoch": 1.55, "learning_rate": 2.685383356414453e-07, "logits/chosen": -2.745004177093506, "logits/rejected": -2.6724696159362793, "logps/chosen": -260.5653076171875, "logps/rejected": -590.9301147460938, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.4756407737731934, "rewards/margins": 28.6966609954834, "rewards/rejected": -27.221023559570312, "step": 4560 }, { "epoch": 1.55, "learning_rate": 2.679088505602417e-07, "logits/chosen": -2.7574963569641113, "logits/rejected": -2.7231857776641846, "logps/chosen": -289.28485107421875, "logps/rejected": -765.3423461914062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.1829581260681152, "rewards/margins": 31.030338287353516, "rewards/rejected": -28.84737777709961, "step": 4570 }, { "epoch": 1.56, "learning_rate": 2.6727936547903813e-07, "logits/chosen": -2.7917351722717285, "logits/rejected": -2.517425537109375, "logps/chosen": -199.94094848632812, "logps/rejected": -644.8704223632812, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.4789985418319702, "rewards/margins": 22.439237594604492, "rewards/rejected": -20.96023941040039, "step": 4580 }, { "epoch": 1.56, "learning_rate": 2.6664988039783455e-07, "logits/chosen": -2.7801995277404785, "logits/rejected": -2.6880738735198975, "logps/chosen": -193.4755401611328, "logps/rejected": -503.1551818847656, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.1128803491592407, "rewards/margins": 24.420345306396484, "rewards/rejected": -23.30746078491211, "step": 4590 }, { "epoch": 1.56, "learning_rate": 2.66020395316631e-07, "logits/chosen": -2.787672519683838, "logits/rejected": -2.678805112838745, "logps/chosen": -196.39443969726562, "logps/rejected": -529.9307861328125, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 1.0455574989318848, "rewards/margins": 24.29790687561035, "rewards/rejected": -23.252347946166992, "step": 4600 }, { "epoch": 1.56, "eval_logits/chosen": -2.8767926692962646, "eval_logits/rejected": -2.6732332706451416, "eval_logps/chosen": -250.75633239746094, "eval_logps/rejected": -660.46435546875, "eval_loss": 0.005265547428280115, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.2200191020965576, "eval_rewards/margins": 27.04892349243164, "eval_rewards/rejected": -25.82890510559082, "eval_runtime": 461.1937, "eval_samples_per_second": 20.599, "eval_steps_per_second": 0.644, "step": 4600 }, { "epoch": 1.57, "learning_rate": 2.6539091023542745e-07, "logits/chosen": -2.7719967365264893, "logits/rejected": -2.6079840660095215, "logps/chosen": -197.73980712890625, "logps/rejected": -775.3809814453125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.7519117593765259, "rewards/margins": 28.819604873657227, "rewards/rejected": -27.06769371032715, "step": 4610 }, { "epoch": 1.57, "learning_rate": 2.647614251542238e-07, "logits/chosen": -2.719355821609497, "logits/rejected": -2.610138416290283, "logps/chosen": -223.3323211669922, "logps/rejected": -761.9006958007812, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.7778436541557312, "rewards/margins": 26.297012329101562, "rewards/rejected": -25.519168853759766, "step": 4620 }, { "epoch": 1.57, "learning_rate": 2.6413194007302024e-07, "logits/chosen": -2.6796488761901855, "logits/rejected": -2.5908825397491455, "logps/chosen": -217.9093780517578, "logps/rejected": -657.6917114257812, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.08230184018611908, "rewards/margins": 23.844013214111328, "rewards/rejected": -23.76171112060547, "step": 4630 }, { "epoch": 1.58, "learning_rate": 2.6350245499181666e-07, "logits/chosen": -2.7216479778289795, "logits/rejected": -2.622500419616699, "logps/chosen": -275.26873779296875, "logps/rejected": -666.6151123046875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.8442145586013794, "rewards/margins": 26.581493377685547, "rewards/rejected": -25.737279891967773, "step": 4640 }, { "epoch": 1.58, "learning_rate": 2.628729699106131e-07, "logits/chosen": -2.842674493789673, "logits/rejected": -2.695049524307251, "logps/chosen": -208.21963500976562, "logps/rejected": -679.7228393554688, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.731654167175293, "rewards/margins": 30.78403091430664, "rewards/rejected": -30.052377700805664, "step": 4650 }, { "epoch": 1.58, "learning_rate": 2.6224348482940956e-07, "logits/chosen": -2.68668532371521, "logits/rejected": -2.6042068004608154, "logps/chosen": -345.448974609375, "logps/rejected": -822.5787353515625, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.2789802551269531, "rewards/margins": 31.517492294311523, "rewards/rejected": -30.238513946533203, "step": 4660 }, { "epoch": 1.59, "learning_rate": 2.61613999748206e-07, "logits/chosen": -2.671724796295166, "logits/rejected": -2.669206380844116, "logps/chosen": -278.0516052246094, "logps/rejected": -553.95556640625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.4551740884780884, "rewards/margins": 31.40007972717285, "rewards/rejected": -29.944904327392578, "step": 4670 }, { "epoch": 1.59, "learning_rate": 2.609845146670024e-07, "logits/chosen": -2.822788953781128, "logits/rejected": -2.6263537406921387, "logps/chosen": -188.5457305908203, "logps/rejected": -552.0889892578125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.6563887596130371, "rewards/margins": 29.025609970092773, "rewards/rejected": -28.36922264099121, "step": 4680 }, { "epoch": 1.59, "learning_rate": 2.603550295857988e-07, "logits/chosen": -2.790506601333618, "logits/rejected": -2.5776543617248535, "logps/chosen": -226.98757934570312, "logps/rejected": -703.2592163085938, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.16731971502304077, "rewards/margins": 26.862178802490234, "rewards/rejected": -27.02950096130371, "step": 4690 }, { "epoch": 1.6, "learning_rate": 2.597255445045952e-07, "logits/chosen": -2.62505841255188, "logits/rejected": -2.6986446380615234, "logps/chosen": -314.7447814941406, "logps/rejected": -575.5631103515625, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -0.10584727674722672, "rewards/margins": 28.999563217163086, "rewards/rejected": -29.10541343688965, "step": 4700 }, { "epoch": 1.6, "eval_logits/chosen": -2.863471031188965, "eval_logits/rejected": -2.668707847595215, "eval_logps/chosen": -257.53448486328125, "eval_logps/rejected": -701.1362915039062, "eval_loss": 0.0037278791423887014, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.5422021150588989, "eval_rewards/margins": 30.438302993774414, "eval_rewards/rejected": -29.89609718322754, "eval_runtime": 461.6127, "eval_samples_per_second": 20.58, "eval_steps_per_second": 0.643, "step": 4700 }, { "epoch": 1.6, "learning_rate": 2.590960594233916e-07, "logits/chosen": -2.757835626602173, "logits/rejected": -2.595532178878784, "logps/chosen": -234.9602508544922, "logps/rejected": -711.530029296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.37777942419052124, "rewards/margins": 28.227977752685547, "rewards/rejected": -27.850204467773438, "step": 4710 }, { "epoch": 1.6, "learning_rate": 2.584665743421881e-07, "logits/chosen": -2.7668519020080566, "logits/rejected": -2.6427574157714844, "logps/chosen": -217.3827667236328, "logps/rejected": -562.5655517578125, "loss": 0.0538, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6540214419364929, "rewards/margins": 29.097362518310547, "rewards/rejected": -29.751384735107422, "step": 4720 }, { "epoch": 1.61, "learning_rate": 2.578370892609845e-07, "logits/chosen": -2.784456253051758, "logits/rejected": -2.7531657218933105, "logps/chosen": -251.67568969726562, "logps/rejected": -629.7329711914062, "loss": 0.0015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5052122473716736, "rewards/margins": 34.061126708984375, "rewards/rejected": -33.555912017822266, "step": 4730 }, { "epoch": 1.61, "learning_rate": 2.5720760417978095e-07, "logits/chosen": -2.8087658882141113, "logits/rejected": -2.7776081562042236, "logps/chosen": -320.38641357421875, "logps/rejected": -668.893310546875, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 0.016849270090460777, "rewards/margins": 30.43368911743164, "rewards/rejected": -30.41684341430664, "step": 4740 }, { "epoch": 1.61, "learning_rate": 2.5657811909857737e-07, "logits/chosen": -2.808671712875366, "logits/rejected": -2.7438178062438965, "logps/chosen": -280.91632080078125, "logps/rejected": -631.5396728515625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.7036969065666199, "rewards/margins": 29.705297470092773, "rewards/rejected": -29.001596450805664, "step": 4750 }, { "epoch": 1.62, "learning_rate": 2.559486340173738e-07, "logits/chosen": -2.957623243331909, "logits/rejected": -2.826572895050049, "logps/chosen": -219.1203155517578, "logps/rejected": -635.8316650390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.17810681462287903, "rewards/margins": 33.19062423706055, "rewards/rejected": -33.01251983642578, "step": 4760 }, { "epoch": 1.62, "learning_rate": 2.5531914893617016e-07, "logits/chosen": -2.8885440826416016, "logits/rejected": -2.7619853019714355, "logps/chosen": -218.9480743408203, "logps/rejected": -932.0040283203125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.2859153151512146, "rewards/margins": 34.177154541015625, "rewards/rejected": -34.46306610107422, "step": 4770 }, { "epoch": 1.62, "learning_rate": 2.5468966385496664e-07, "logits/chosen": -2.9239144325256348, "logits/rejected": -2.698477268218994, "logps/chosen": -213.2068328857422, "logps/rejected": -552.6846923828125, "loss": 0.0024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.35699471831321716, "rewards/margins": 31.123382568359375, "rewards/rejected": -30.76638412475586, "step": 4780 }, { "epoch": 1.63, "learning_rate": 2.5406017877376306e-07, "logits/chosen": -2.9060378074645996, "logits/rejected": -2.7666268348693848, "logps/chosen": -250.65640258789062, "logps/rejected": -683.8248291015625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.2791534960269928, "rewards/margins": 34.078834533691406, "rewards/rejected": -33.799678802490234, "step": 4790 }, { "epoch": 1.63, "learning_rate": 2.534306936925595e-07, "logits/chosen": -2.8886232376098633, "logits/rejected": -2.8052754402160645, "logps/chosen": -266.97259521484375, "logps/rejected": -721.6685180664062, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 0.9588086009025574, "rewards/margins": 36.294097900390625, "rewards/rejected": -35.33529281616211, "step": 4800 }, { "epoch": 1.63, "eval_logits/chosen": -3.0418217182159424, "eval_logits/rejected": -2.819225788116455, "eval_logps/chosen": -261.05938720703125, "eval_logps/rejected": -753.3645629882812, "eval_loss": 0.005790026858448982, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.18971292674541473, "eval_rewards/margins": 35.30863571166992, "eval_rewards/rejected": -35.118919372558594, "eval_runtime": 461.6874, "eval_samples_per_second": 20.577, "eval_steps_per_second": 0.643, "step": 4800 }, { "epoch": 1.63, "learning_rate": 2.528012086113559e-07, "logits/chosen": -2.8857369422912598, "logits/rejected": -2.7257068157196045, "logps/chosen": -217.79537963867188, "logps/rejected": -921.7642822265625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.6817263960838318, "rewards/margins": 39.19708251953125, "rewards/rejected": -38.515357971191406, "step": 4810 }, { "epoch": 1.64, "learning_rate": 2.5217172353015233e-07, "logits/chosen": -2.97232723236084, "logits/rejected": -2.734052896499634, "logps/chosen": -258.02606201171875, "logps/rejected": -788.9205322265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.949163794517517, "rewards/margins": 35.10908889770508, "rewards/rejected": -33.15993118286133, "step": 4820 }, { "epoch": 1.64, "learning_rate": 2.5154223844894875e-07, "logits/chosen": -2.964486837387085, "logits/rejected": -2.8825886249542236, "logps/chosen": -286.3688659667969, "logps/rejected": -552.3123168945312, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.22870314121246338, "rewards/margins": 28.493450164794922, "rewards/rejected": -28.264751434326172, "step": 4830 }, { "epoch": 1.65, "learning_rate": 2.509127533677452e-07, "logits/chosen": -2.992199659347534, "logits/rejected": -2.7925055027008057, "logps/chosen": -269.0264587402344, "logps/rejected": -490.70086669921875, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": 1.2721827030181885, "rewards/margins": 28.247661590576172, "rewards/rejected": -26.975482940673828, "step": 4840 }, { "epoch": 1.65, "learning_rate": 2.502832682865416e-07, "logits/chosen": -3.009685754776001, "logits/rejected": -2.8138554096221924, "logps/chosen": -269.70294189453125, "logps/rejected": -619.4361572265625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.755861520767212, "rewards/margins": 25.429370880126953, "rewards/rejected": -23.67350959777832, "step": 4850 }, { "epoch": 1.65, "learning_rate": 2.49653783205338e-07, "logits/chosen": -2.9673562049865723, "logits/rejected": -2.806311845779419, "logps/chosen": -205.8111114501953, "logps/rejected": -637.414794921875, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 0.9344761967658997, "rewards/margins": 28.880802154541016, "rewards/rejected": -27.946325302124023, "step": 4860 }, { "epoch": 1.66, "learning_rate": 2.4902429812413444e-07, "logits/chosen": -2.868058681488037, "logits/rejected": -2.7854363918304443, "logps/chosen": -209.317138671875, "logps/rejected": -663.5977783203125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.899246871471405, "rewards/margins": 28.764175415039062, "rewards/rejected": -27.86492919921875, "step": 4870 }, { "epoch": 1.66, "learning_rate": 2.4839481304293086e-07, "logits/chosen": -2.8362536430358887, "logits/rejected": -2.7397358417510986, "logps/chosen": -254.9237823486328, "logps/rejected": -732.1149291992188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.6498401761054993, "rewards/margins": 28.90823745727539, "rewards/rejected": -28.258398056030273, "step": 4880 }, { "epoch": 1.66, "learning_rate": 2.477653279617273e-07, "logits/chosen": -2.891831636428833, "logits/rejected": -2.762570858001709, "logps/chosen": -288.5021057128906, "logps/rejected": -937.44775390625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.139258623123169, "rewards/margins": 32.65577697753906, "rewards/rejected": -31.516515731811523, "step": 4890 }, { "epoch": 1.67, "learning_rate": 2.471358428805237e-07, "logits/chosen": -2.8550195693969727, "logits/rejected": -2.7539262771606445, "logps/chosen": -266.2559509277344, "logps/rejected": -722.3253173828125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.6254793405532837, "rewards/margins": 30.363643646240234, "rewards/rejected": -28.7381649017334, "step": 4900 }, { "epoch": 1.67, "eval_logits/chosen": -2.9981296062469482, "eval_logits/rejected": -2.773132801055908, "eval_logps/chosen": -252.8162384033203, "eval_logps/rejected": -697.8760375976562, "eval_loss": 0.0034651614259928465, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.0140293836593628, "eval_rewards/margins": 30.58409881591797, "eval_rewards/rejected": -29.570072174072266, "eval_runtime": 461.535, "eval_samples_per_second": 20.583, "eval_steps_per_second": 0.644, "step": 4900 }, { "epoch": 1.67, "learning_rate": 2.4650635779932013e-07, "logits/chosen": -2.8980488777160645, "logits/rejected": -2.8199546337127686, "logps/chosen": -211.7263946533203, "logps/rejected": -598.0134887695312, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.45471420884132385, "rewards/margins": 31.205820083618164, "rewards/rejected": -30.7511043548584, "step": 4910 }, { "epoch": 1.67, "learning_rate": 2.4587687271811656e-07, "logits/chosen": -2.8987154960632324, "logits/rejected": -2.6653835773468018, "logps/chosen": -221.4749755859375, "logps/rejected": -799.7275390625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.2500927448272705, "rewards/margins": 29.367584228515625, "rewards/rejected": -28.117488861083984, "step": 4920 }, { "epoch": 1.68, "learning_rate": 2.45247387636913e-07, "logits/chosen": -2.8618173599243164, "logits/rejected": -2.713264226913452, "logps/chosen": -235.89468383789062, "logps/rejected": -618.2537231445312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.000112533569336, "rewards/margins": 26.836069107055664, "rewards/rejected": -24.835956573486328, "step": 4930 }, { "epoch": 1.68, "learning_rate": 2.446179025557094e-07, "logits/chosen": -2.752509355545044, "logits/rejected": -2.641965866088867, "logps/chosen": -369.4002990722656, "logps/rejected": -775.9088134765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.1115810871124268, "rewards/margins": 28.26593017578125, "rewards/rejected": -27.15435218811035, "step": 4940 }, { "epoch": 1.68, "learning_rate": 2.439884174745059e-07, "logits/chosen": -2.878221273422241, "logits/rejected": -2.7562150955200195, "logps/chosen": -261.14508056640625, "logps/rejected": -636.1710815429688, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 0.7023094892501831, "rewards/margins": 31.490222930908203, "rewards/rejected": -30.787914276123047, "step": 4950 }, { "epoch": 1.69, "learning_rate": 2.4335893239330225e-07, "logits/chosen": -2.776170015335083, "logits/rejected": -2.7344746589660645, "logps/chosen": -302.3923645019531, "logps/rejected": -637.8427734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.2469639778137207, "rewards/margins": 26.760326385498047, "rewards/rejected": -25.51336097717285, "step": 4960 }, { "epoch": 1.69, "learning_rate": 2.4272944731209867e-07, "logits/chosen": -2.9131665229797363, "logits/rejected": -2.694739818572998, "logps/chosen": -265.6712646484375, "logps/rejected": -641.267822265625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 1.4986623525619507, "rewards/margins": 30.596887588500977, "rewards/rejected": -29.098224639892578, "step": 4970 }, { "epoch": 1.69, "learning_rate": 2.4209996223089514e-07, "logits/chosen": -2.8293423652648926, "logits/rejected": -2.6903605461120605, "logps/chosen": -254.6172332763672, "logps/rejected": -614.4579467773438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.920792818069458, "rewards/margins": 25.221065521240234, "rewards/rejected": -23.300273895263672, "step": 4980 }, { "epoch": 1.7, "learning_rate": 2.4147047714969157e-07, "logits/chosen": -2.9621009826660156, "logits/rejected": -2.746459484100342, "logps/chosen": -207.6763458251953, "logps/rejected": -652.8716430664062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.6471809148788452, "rewards/margins": 22.882892608642578, "rewards/rejected": -21.2357120513916, "step": 4990 }, { "epoch": 1.7, "learning_rate": 2.4084099206848794e-07, "logits/chosen": -2.853076457977295, "logits/rejected": -2.685256242752075, "logps/chosen": -200.51876831054688, "logps/rejected": -542.948974609375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.0516247749328613, "rewards/margins": 23.682172775268555, "rewards/rejected": -21.630550384521484, "step": 5000 }, { "epoch": 1.7, "eval_logits/chosen": -2.9752540588378906, "eval_logits/rejected": -2.7513301372528076, "eval_logps/chosen": -247.38255310058594, "eval_logps/rejected": -652.6386108398438, "eval_loss": 0.002509304555132985, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.5573989152908325, "eval_rewards/margins": 26.60372543334961, "eval_rewards/rejected": -25.04632568359375, "eval_runtime": 460.7298, "eval_samples_per_second": 20.619, "eval_steps_per_second": 0.645, "step": 5000 }, { "epoch": 1.7, "learning_rate": 2.402115069872844e-07, "logits/chosen": -2.9073281288146973, "logits/rejected": -2.7530109882354736, "logps/chosen": -217.35888671875, "logps/rejected": -555.173095703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.3652015924453735, "rewards/margins": 29.04729652404785, "rewards/rejected": -27.68209457397461, "step": 5010 }, { "epoch": 1.71, "learning_rate": 2.3958202190608084e-07, "logits/chosen": -2.982266902923584, "logits/rejected": -2.750180959701538, "logps/chosen": -268.06396484375, "logps/rejected": -558.2090454101562, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.5124289989471436, "rewards/margins": 25.514629364013672, "rewards/rejected": -24.002201080322266, "step": 5020 }, { "epoch": 1.71, "learning_rate": 2.3895253682487726e-07, "logits/chosen": -2.8144264221191406, "logits/rejected": -2.733407497406006, "logps/chosen": -210.7088165283203, "logps/rejected": -641.718017578125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.8368736505508423, "rewards/margins": 25.1309814453125, "rewards/rejected": -23.29410743713379, "step": 5030 }, { "epoch": 1.71, "learning_rate": 2.3832305174367368e-07, "logits/chosen": -2.8141582012176514, "logits/rejected": -2.7753374576568604, "logps/chosen": -255.0494842529297, "logps/rejected": -658.6012573242188, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 1.2364368438720703, "rewards/margins": 31.72926902770996, "rewards/rejected": -30.492834091186523, "step": 5040 }, { "epoch": 1.72, "learning_rate": 2.3769356666247008e-07, "logits/chosen": -2.9148569107055664, "logits/rejected": -2.7456679344177246, "logps/chosen": -252.4602508544922, "logps/rejected": -676.4150390625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.7373751401901245, "rewards/margins": 29.974075317382812, "rewards/rejected": -29.2367000579834, "step": 5050 }, { "epoch": 1.72, "learning_rate": 2.370640815812665e-07, "logits/chosen": -2.940296173095703, "logits/rejected": -2.7597320079803467, "logps/chosen": -269.8207092285156, "logps/rejected": -787.490966796875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.9210324287414551, "rewards/margins": 38.961124420166016, "rewards/rejected": -38.04009246826172, "step": 5060 }, { "epoch": 1.72, "learning_rate": 2.3643459650006295e-07, "logits/chosen": -2.9466586112976074, "logits/rejected": -2.8953967094421387, "logps/chosen": -269.36834716796875, "logps/rejected": -536.0892333984375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.0494163036346436, "rewards/margins": 31.193185806274414, "rewards/rejected": -30.14377212524414, "step": 5070 }, { "epoch": 1.73, "learning_rate": 2.3580511141885937e-07, "logits/chosen": -2.935952663421631, "logits/rejected": -2.8308651447296143, "logps/chosen": -258.36962890625, "logps/rejected": -568.4273681640625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.6959034204483032, "rewards/margins": 29.860828399658203, "rewards/rejected": -29.1649227142334, "step": 5080 }, { "epoch": 1.73, "learning_rate": 2.3517562633765577e-07, "logits/chosen": -2.8458030223846436, "logits/rejected": -2.8045437335968018, "logps/chosen": -190.5179901123047, "logps/rejected": -560.0125732421875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.1409196853637695, "rewards/margins": 30.81087303161621, "rewards/rejected": -29.66995620727539, "step": 5090 }, { "epoch": 1.73, "learning_rate": 2.3454614125645222e-07, "logits/chosen": -2.86490797996521, "logits/rejected": -2.7126636505126953, "logps/chosen": -236.3907470703125, "logps/rejected": -756.9246826171875, "loss": 0.008, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4101478159427643, "rewards/margins": 31.074304580688477, "rewards/rejected": -30.66415786743164, "step": 5100 }, { "epoch": 1.73, "eval_logits/chosen": -3.0091922283172607, "eval_logits/rejected": -2.7995107173919678, "eval_logps/chosen": -251.6932830810547, "eval_logps/rejected": -705.36328125, "eval_loss": 0.002607343252748251, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.1263256072998047, "eval_rewards/margins": 31.445117950439453, "eval_rewards/rejected": -30.318790435791016, "eval_runtime": 461.2333, "eval_samples_per_second": 20.597, "eval_steps_per_second": 0.644, "step": 5100 }, { "epoch": 1.74, "learning_rate": 2.3391665617524864e-07, "logits/chosen": -2.946150064468384, "logits/rejected": -2.7362828254699707, "logps/chosen": -312.8003234863281, "logps/rejected": -905.1656494140625, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": 1.5917870998382568, "rewards/margins": 31.510730743408203, "rewards/rejected": -29.918941497802734, "step": 5110 }, { "epoch": 1.74, "learning_rate": 2.3328717109404506e-07, "logits/chosen": -2.9066858291625977, "logits/rejected": -2.8574788570404053, "logps/chosen": -210.5686798095703, "logps/rejected": -624.9244384765625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.3280174732208252, "rewards/margins": 33.82992172241211, "rewards/rejected": -32.50190734863281, "step": 5120 }, { "epoch": 1.74, "learning_rate": 2.3265768601284149e-07, "logits/chosen": -2.8956592082977295, "logits/rejected": -2.7955193519592285, "logps/chosen": -340.9004211425781, "logps/rejected": -744.781005859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.0603870153427124, "rewards/margins": 32.37177276611328, "rewards/rejected": -31.311386108398438, "step": 5130 }, { "epoch": 1.75, "learning_rate": 2.320282009316379e-07, "logits/chosen": -2.937199354171753, "logits/rejected": -2.7700271606445312, "logps/chosen": -320.4074401855469, "logps/rejected": -885.9381103515625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 2.326355218887329, "rewards/margins": 35.57560348510742, "rewards/rejected": -33.24924850463867, "step": 5140 }, { "epoch": 1.75, "learning_rate": 2.3139871585043433e-07, "logits/chosen": -2.8773674964904785, "logits/rejected": -2.822056531906128, "logps/chosen": -258.0752258300781, "logps/rejected": -597.1814575195312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.997346043586731, "rewards/margins": 29.815160751342773, "rewards/rejected": -28.81781578063965, "step": 5150 }, { "epoch": 1.75, "learning_rate": 2.3076923076923078e-07, "logits/chosen": -2.8165054321289062, "logits/rejected": -2.7633092403411865, "logps/chosen": -345.38677978515625, "logps/rejected": -639.490966796875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 2.318730354309082, "rewards/margins": 30.846614837646484, "rewards/rejected": -28.527883529663086, "step": 5160 }, { "epoch": 1.76, "learning_rate": 2.3013974568802718e-07, "logits/chosen": -2.918994665145874, "logits/rejected": -2.754453420639038, "logps/chosen": -256.8601989746094, "logps/rejected": -590.8077392578125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.7915420532226562, "rewards/margins": 29.140727996826172, "rewards/rejected": -27.34918785095215, "step": 5170 }, { "epoch": 1.76, "learning_rate": 2.295102606068236e-07, "logits/chosen": -2.9767158031463623, "logits/rejected": -2.719226121902466, "logps/chosen": -210.2613983154297, "logps/rejected": -893.4655151367188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.149674654006958, "rewards/margins": 31.4141788482666, "rewards/rejected": -30.264501571655273, "step": 5180 }, { "epoch": 1.76, "learning_rate": 2.2888077552562005e-07, "logits/chosen": -3.028263807296753, "logits/rejected": -2.7711679935455322, "logps/chosen": -208.24093627929688, "logps/rejected": -706.638671875, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 1.8289821147918701, "rewards/margins": 29.511911392211914, "rewards/rejected": -27.68292808532715, "step": 5190 }, { "epoch": 1.77, "learning_rate": 2.2825129044441647e-07, "logits/chosen": -2.9488751888275146, "logits/rejected": -2.8495614528656006, "logps/chosen": -190.95599365234375, "logps/rejected": -631.0022583007812, "loss": 0.0015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4716931581497192, "rewards/margins": 30.549968719482422, "rewards/rejected": -29.07827377319336, "step": 5200 }, { "epoch": 1.77, "eval_logits/chosen": -3.0185530185699463, "eval_logits/rejected": -2.7971041202545166, "eval_logps/chosen": -249.83712768554688, "eval_logps/rejected": -692.8795166015625, "eval_loss": 0.0020495066419243813, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.3119395971298218, "eval_rewards/margins": 30.38235855102539, "eval_rewards/rejected": -29.070417404174805, "eval_runtime": 461.9243, "eval_samples_per_second": 20.566, "eval_steps_per_second": 0.643, "step": 5200 }, { "epoch": 1.77, "learning_rate": 2.2762180536321287e-07, "logits/chosen": -2.987854480743408, "logits/rejected": -2.774258613586426, "logps/chosen": -202.2731475830078, "logps/rejected": -718.7804565429688, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.3275153636932373, "rewards/margins": 28.509197235107422, "rewards/rejected": -27.181682586669922, "step": 5210 }, { "epoch": 1.77, "learning_rate": 2.2699232028200932e-07, "logits/chosen": -2.938005208969116, "logits/rejected": -2.7565159797668457, "logps/chosen": -258.75335693359375, "logps/rejected": -780.9220581054688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.762995719909668, "rewards/margins": 28.634166717529297, "rewards/rejected": -26.871173858642578, "step": 5220 }, { "epoch": 1.78, "learning_rate": 2.2636283520080574e-07, "logits/chosen": -2.77738094329834, "logits/rejected": -2.8602359294891357, "logps/chosen": -301.9053955078125, "logps/rejected": -625.86083984375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.9801918268203735, "rewards/margins": 29.302776336669922, "rewards/rejected": -28.322586059570312, "step": 5230 }, { "epoch": 1.78, "learning_rate": 2.2573335011960216e-07, "logits/chosen": -2.9115452766418457, "logits/rejected": -2.828941583633423, "logps/chosen": -193.50820922851562, "logps/rejected": -847.7408447265625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.0358009338378906, "rewards/margins": 35.03406524658203, "rewards/rejected": -32.99826431274414, "step": 5240 }, { "epoch": 1.78, "learning_rate": 2.2510386503839856e-07, "logits/chosen": -2.916912794113159, "logits/rejected": -2.811922550201416, "logps/chosen": -209.5175323486328, "logps/rejected": -730.2145385742188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.4705846309661865, "rewards/margins": 31.413890838623047, "rewards/rejected": -29.94330406188965, "step": 5250 }, { "epoch": 1.79, "learning_rate": 2.24474379957195e-07, "logits/chosen": -2.826608180999756, "logits/rejected": -2.793463945388794, "logps/chosen": -335.00885009765625, "logps/rejected": -847.3977661132812, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 2.5097720623016357, "rewards/margins": 35.4361457824707, "rewards/rejected": -32.92637252807617, "step": 5260 }, { "epoch": 1.79, "learning_rate": 2.2384489487599143e-07, "logits/chosen": -2.846627712249756, "logits/rejected": -2.74066162109375, "logps/chosen": -331.5596008300781, "logps/rejected": -870.1644287109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.4074209928512573, "rewards/margins": 33.799339294433594, "rewards/rejected": -32.39191818237305, "step": 5270 }, { "epoch": 1.79, "learning_rate": 2.2321540979478783e-07, "logits/chosen": -2.926321268081665, "logits/rejected": -2.8468410968780518, "logps/chosen": -246.4873046875, "logps/rejected": -781.46240234375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.5376794338226318, "rewards/margins": 35.39277267456055, "rewards/rejected": -33.8550910949707, "step": 5280 }, { "epoch": 1.8, "learning_rate": 2.2258592471358428e-07, "logits/chosen": -2.8553290367126465, "logits/rejected": -2.7143630981445312, "logps/chosen": -198.0902557373047, "logps/rejected": -751.7550048828125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.2365524768829346, "rewards/margins": 29.749826431274414, "rewards/rejected": -28.513275146484375, "step": 5290 }, { "epoch": 1.8, "learning_rate": 2.219564396323807e-07, "logits/chosen": -2.876582384109497, "logits/rejected": -2.7414793968200684, "logps/chosen": -200.0975799560547, "logps/rejected": -848.3489379882812, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.9524555206298828, "rewards/margins": 32.65062713623047, "rewards/rejected": -31.698171615600586, "step": 5300 }, { "epoch": 1.8, "eval_logits/chosen": -2.9832546710968018, "eval_logits/rejected": -2.7967560291290283, "eval_logps/chosen": -255.6256103515625, "eval_logps/rejected": -715.2409057617188, "eval_loss": 0.002391957910731435, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.7330923080444336, "eval_rewards/margins": 32.039649963378906, "eval_rewards/rejected": -31.30655860900879, "eval_runtime": 461.8952, "eval_samples_per_second": 20.567, "eval_steps_per_second": 0.643, "step": 5300 }, { "epoch": 1.8, "learning_rate": 2.2132695455117712e-07, "logits/chosen": -2.87308931350708, "logits/rejected": -2.7999184131622314, "logps/chosen": -216.3625030517578, "logps/rejected": -580.763671875, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8720202445983887, "rewards/margins": 32.0785026550293, "rewards/rejected": -31.20648193359375, "step": 5310 }, { "epoch": 1.81, "learning_rate": 2.2069746946997355e-07, "logits/chosen": -2.973492383956909, "logits/rejected": -2.7731668949127197, "logps/chosen": -181.79434204101562, "logps/rejected": -511.72216796875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.773087739944458, "rewards/margins": 30.23404312133789, "rewards/rejected": -29.460952758789062, "step": 5320 }, { "epoch": 1.81, "learning_rate": 2.2006798438876997e-07, "logits/chosen": -2.7984681129455566, "logits/rejected": -2.7301318645477295, "logps/chosen": -336.24591064453125, "logps/rejected": -789.4699096679688, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 0.2903138995170593, "rewards/margins": 28.739704132080078, "rewards/rejected": -28.44939613342285, "step": 5330 }, { "epoch": 1.82, "learning_rate": 2.194384993075664e-07, "logits/chosen": -2.8977208137512207, "logits/rejected": -2.7249040603637695, "logps/chosen": -191.90060424804688, "logps/rejected": -745.9398193359375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.0484778881072998, "rewards/margins": 33.75321960449219, "rewards/rejected": -32.70474624633789, "step": 5340 }, { "epoch": 1.82, "learning_rate": 2.1880901422636284e-07, "logits/chosen": -2.9166712760925293, "logits/rejected": -2.829765796661377, "logps/chosen": -206.7417449951172, "logps/rejected": -665.331298828125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.2946635484695435, "rewards/margins": 33.93096160888672, "rewards/rejected": -32.63629913330078, "step": 5350 }, { "epoch": 1.82, "learning_rate": 2.1817952914515924e-07, "logits/chosen": -2.932237386703491, "logits/rejected": -2.715658187866211, "logps/chosen": -204.3180694580078, "logps/rejected": -667.2489013671875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.9717049598693848, "rewards/margins": 29.627283096313477, "rewards/rejected": -28.655574798583984, "step": 5360 }, { "epoch": 1.83, "learning_rate": 2.1755004406395566e-07, "logits/chosen": -2.8765196800231934, "logits/rejected": -2.8220582008361816, "logps/chosen": -284.3928527832031, "logps/rejected": -652.8698120117188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.4888198375701904, "rewards/margins": 38.991214752197266, "rewards/rejected": -37.50238800048828, "step": 5370 }, { "epoch": 1.83, "learning_rate": 2.169205589827521e-07, "logits/chosen": -2.989978313446045, "logits/rejected": -2.8169093132019043, "logps/chosen": -206.48483276367188, "logps/rejected": -688.3533935546875, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 1.1359379291534424, "rewards/margins": 36.876564025878906, "rewards/rejected": -35.74062728881836, "step": 5380 }, { "epoch": 1.83, "learning_rate": 2.1629107390154853e-07, "logits/chosen": -2.9017786979675293, "logits/rejected": -2.7739078998565674, "logps/chosen": -207.5626983642578, "logps/rejected": -790.0360717773438, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.2445843666791916, "rewards/margins": 33.22171401977539, "rewards/rejected": -32.97713088989258, "step": 5390 }, { "epoch": 1.84, "learning_rate": 2.1566158882034493e-07, "logits/chosen": -2.933865785598755, "logits/rejected": -2.7881312370300293, "logps/chosen": -202.25643920898438, "logps/rejected": -554.2001953125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.5922783613204956, "rewards/margins": 30.090112686157227, "rewards/rejected": -28.49783706665039, "step": 5400 }, { "epoch": 1.84, "eval_logits/chosen": -2.999650001525879, "eval_logits/rejected": -2.8127315044403076, "eval_logps/chosen": -248.81423950195312, "eval_logps/rejected": -689.407470703125, "eval_loss": 0.001833468209952116, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 1.414229154586792, "eval_rewards/margins": 30.137441635131836, "eval_rewards/rejected": -28.72321128845215, "eval_runtime": 462.0468, "eval_samples_per_second": 20.561, "eval_steps_per_second": 0.643, "step": 5400 }, { "epoch": 1.84, "learning_rate": 2.1503210373914138e-07, "logits/chosen": -2.830824136734009, "logits/rejected": -2.7003893852233887, "logps/chosen": -255.3296356201172, "logps/rejected": -772.9188232421875, "loss": 0.0027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7909753322601318, "rewards/margins": 26.443273544311523, "rewards/rejected": -24.65229606628418, "step": 5410 }, { "epoch": 1.84, "learning_rate": 2.144026186579378e-07, "logits/chosen": -2.8549439907073975, "logits/rejected": -2.8045029640197754, "logps/chosen": -188.82579040527344, "logps/rejected": -822.0360107421875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.9658099412918091, "rewards/margins": 33.598609924316406, "rewards/rejected": -32.63280487060547, "step": 5420 }, { "epoch": 1.85, "learning_rate": 2.1377313357673422e-07, "logits/chosen": -2.8298494815826416, "logits/rejected": -2.7501726150512695, "logps/chosen": -251.5898895263672, "logps/rejected": -703.871337890625, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.9755623936653137, "rewards/margins": 27.590396881103516, "rewards/rejected": -26.614837646484375, "step": 5430 }, { "epoch": 1.85, "learning_rate": 2.1314364849553065e-07, "logits/chosen": -2.8292980194091797, "logits/rejected": -2.799264669418335, "logps/chosen": -241.1392822265625, "logps/rejected": -608.9600830078125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.8993104100227356, "rewards/margins": 29.017807006835938, "rewards/rejected": -28.11849594116211, "step": 5440 }, { "epoch": 1.85, "learning_rate": 2.1251416341432707e-07, "logits/chosen": -2.88850998878479, "logits/rejected": -2.831869602203369, "logps/chosen": -271.41082763671875, "logps/rejected": -789.72021484375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.4852228164672852, "rewards/margins": 32.394691467285156, "rewards/rejected": -30.909465789794922, "step": 5450 }, { "epoch": 1.86, "learning_rate": 2.118846783331235e-07, "logits/chosen": -2.848776340484619, "logits/rejected": -2.82658052444458, "logps/chosen": -297.9201965332031, "logps/rejected": -751.9637451171875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.3182508945465088, "rewards/margins": 34.9101676940918, "rewards/rejected": -34.591915130615234, "step": 5460 }, { "epoch": 1.86, "learning_rate": 2.1125519325191994e-07, "logits/chosen": -2.902890682220459, "logits/rejected": -2.778369188308716, "logps/chosen": -224.80355834960938, "logps/rejected": -875.2190551757812, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.4171640872955322, "rewards/margins": 39.292503356933594, "rewards/rejected": -38.875343322753906, "step": 5470 }, { "epoch": 1.86, "learning_rate": 2.1062570817071634e-07, "logits/chosen": -2.8473634719848633, "logits/rejected": -2.7071101665496826, "logps/chosen": -296.2424011230469, "logps/rejected": -694.191650390625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 2.048738479614258, "rewards/margins": 28.451343536376953, "rewards/rejected": -26.402603149414062, "step": 5480 }, { "epoch": 1.87, "learning_rate": 2.0999622308951276e-07, "logits/chosen": -2.8509011268615723, "logits/rejected": -2.8320412635803223, "logps/chosen": -268.5683288574219, "logps/rejected": -778.0692138671875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.040027596056461334, "rewards/margins": 33.10874557495117, "rewards/rejected": -33.14876937866211, "step": 5490 }, { "epoch": 1.87, "learning_rate": 2.093667380083092e-07, "logits/chosen": -2.945328712463379, "logits/rejected": -2.747690200805664, "logps/chosen": -221.6448211669922, "logps/rejected": -680.5921020507812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.43152323365211487, "rewards/margins": 32.66862106323242, "rewards/rejected": -33.10014724731445, "step": 5500 }, { "epoch": 1.87, "eval_logits/chosen": -2.989307165145874, "eval_logits/rejected": -2.7900378704071045, "eval_logps/chosen": -260.2943115234375, "eval_logps/rejected": -732.9266357421875, "eval_loss": 0.003613789565861225, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.26622042059898376, "eval_rewards/margins": 33.34135055541992, "eval_rewards/rejected": -33.075130462646484, "eval_runtime": 462.2155, "eval_samples_per_second": 20.553, "eval_steps_per_second": 0.643, "step": 5500 }, { "epoch": 1.87, "learning_rate": 2.087372529271056e-07, "logits/chosen": -2.799511432647705, "logits/rejected": -2.6928353309631348, "logps/chosen": -340.76385498046875, "logps/rejected": -839.37451171875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.3308259844779968, "rewards/margins": 36.554649353027344, "rewards/rejected": -36.223819732666016, "step": 5510 }, { "epoch": 1.88, "learning_rate": 2.0810776784590203e-07, "logits/chosen": -2.839022636413574, "logits/rejected": -2.8389415740966797, "logps/chosen": -221.9869842529297, "logps/rejected": -741.388427734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.1838090568780899, "rewards/margins": 35.688724517822266, "rewards/rejected": -35.872535705566406, "step": 5520 }, { "epoch": 1.88, "learning_rate": 2.0747828276469848e-07, "logits/chosen": -2.832371234893799, "logits/rejected": -2.6842000484466553, "logps/chosen": -314.35528564453125, "logps/rejected": -840.4636840820312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.778884768486023, "rewards/margins": 32.16352081298828, "rewards/rejected": -31.384634017944336, "step": 5530 }, { "epoch": 1.88, "learning_rate": 2.068487976834949e-07, "logits/chosen": -2.8037264347076416, "logits/rejected": -2.7548813819885254, "logps/chosen": -237.09487915039062, "logps/rejected": -716.5177001953125, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 0.8414624333381653, "rewards/margins": 33.195655822753906, "rewards/rejected": -32.35419845581055, "step": 5540 }, { "epoch": 1.89, "learning_rate": 2.062193126022913e-07, "logits/chosen": -2.8027706146240234, "logits/rejected": -2.765307903289795, "logps/chosen": -198.70651245117188, "logps/rejected": -818.5101318359375, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 0.5487383604049683, "rewards/margins": 33.399993896484375, "rewards/rejected": -32.851253509521484, "step": 5550 }, { "epoch": 1.89, "learning_rate": 2.0558982752108775e-07, "logits/chosen": -2.8399293422698975, "logits/rejected": -2.7316842079162598, "logps/chosen": -221.551025390625, "logps/rejected": -813.0087890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.7806974649429321, "rewards/margins": 33.87023162841797, "rewards/rejected": -33.089542388916016, "step": 5560 }, { "epoch": 1.89, "learning_rate": 2.0496034243988417e-07, "logits/chosen": -2.8953604698181152, "logits/rejected": -2.687709093093872, "logps/chosen": -263.1261901855469, "logps/rejected": -758.8233642578125, "loss": 0.0028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.002683603670448065, "rewards/margins": 27.648239135742188, "rewards/rejected": -27.645557403564453, "step": 5570 }, { "epoch": 1.9, "learning_rate": 2.043308573586806e-07, "logits/chosen": -2.772735118865967, "logits/rejected": -2.703713893890381, "logps/chosen": -321.61688232421875, "logps/rejected": -845.48681640625, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 0.8782274127006531, "rewards/margins": 32.16605758666992, "rewards/rejected": -31.287830352783203, "step": 5580 }, { "epoch": 1.9, "learning_rate": 2.0370137227747701e-07, "logits/chosen": -2.891231060028076, "logits/rejected": -2.7171096801757812, "logps/chosen": -272.9656677246094, "logps/rejected": -781.4466552734375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.8648177981376648, "rewards/margins": 31.664356231689453, "rewards/rejected": -30.79953384399414, "step": 5590 }, { "epoch": 1.9, "learning_rate": 2.0307188719627344e-07, "logits/chosen": -2.819847583770752, "logits/rejected": -2.809103488922119, "logps/chosen": -325.50146484375, "logps/rejected": -591.2935791015625, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 0.4807140827178955, "rewards/margins": 27.13551902770996, "rewards/rejected": -26.654804229736328, "step": 5600 }, { "epoch": 1.9, "eval_logits/chosen": -2.987917900085449, "eval_logits/rejected": -2.7901220321655273, "eval_logps/chosen": -254.3345947265625, "eval_logps/rejected": -692.9180297851562, "eval_loss": 0.003393348306417465, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.8621917366981506, "eval_rewards/margins": 29.93646812438965, "eval_rewards/rejected": -29.07427406311035, "eval_runtime": 462.1863, "eval_samples_per_second": 20.554, "eval_steps_per_second": 0.643, "step": 5600 }, { "epoch": 1.91, "learning_rate": 2.0244240211506986e-07, "logits/chosen": -2.8956751823425293, "logits/rejected": -2.7481658458709717, "logps/chosen": -232.8221893310547, "logps/rejected": -550.9234619140625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.7878303527832031, "rewards/margins": 30.331119537353516, "rewards/rejected": -29.543289184570312, "step": 5610 }, { "epoch": 1.91, "learning_rate": 2.018129170338663e-07, "logits/chosen": -2.7262301445007324, "logits/rejected": -2.8159515857696533, "logps/chosen": -405.1571350097656, "logps/rejected": -641.8922119140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.0144734382629395, "rewards/margins": 32.32817840576172, "rewards/rejected": -30.313705444335938, "step": 5620 }, { "epoch": 1.91, "learning_rate": 2.011834319526627e-07, "logits/chosen": -2.9084417819976807, "logits/rejected": -2.681584358215332, "logps/chosen": -253.14797973632812, "logps/rejected": -607.4147338867188, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 2.0435192584991455, "rewards/margins": 27.067535400390625, "rewards/rejected": -25.024017333984375, "step": 5630 }, { "epoch": 1.92, "learning_rate": 2.0055394687145913e-07, "logits/chosen": -2.8633666038513184, "logits/rejected": -2.665072202682495, "logps/chosen": -262.385986328125, "logps/rejected": -853.0087890625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.1509922742843628, "rewards/margins": 24.31646156311035, "rewards/rejected": -23.16546630859375, "step": 5640 }, { "epoch": 1.92, "learning_rate": 1.9992446179025558e-07, "logits/chosen": -2.9054250717163086, "logits/rejected": -2.6543641090393066, "logps/chosen": -195.81463623046875, "logps/rejected": -692.4088134765625, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 1.6165673732757568, "rewards/margins": 24.630199432373047, "rewards/rejected": -23.01363182067871, "step": 5650 }, { "epoch": 1.92, "learning_rate": 1.99294976709052e-07, "logits/chosen": -2.848848342895508, "logits/rejected": -2.7470502853393555, "logps/chosen": -212.4910888671875, "logps/rejected": -619.08349609375, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 1.1662139892578125, "rewards/margins": 23.944896697998047, "rewards/rejected": -22.7786808013916, "step": 5660 }, { "epoch": 1.93, "learning_rate": 1.986654916278484e-07, "logits/chosen": -2.840167760848999, "logits/rejected": -2.737488269805908, "logps/chosen": -197.65139770507812, "logps/rejected": -624.5264892578125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.5624594688415527, "rewards/margins": 23.27709197998047, "rewards/rejected": -21.714632034301758, "step": 5670 }, { "epoch": 1.93, "learning_rate": 1.9803600654664484e-07, "logits/chosen": -2.872981548309326, "logits/rejected": -2.6655077934265137, "logps/chosen": -202.87167358398438, "logps/rejected": -855.5267333984375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.581272840499878, "rewards/margins": 25.27524757385254, "rewards/rejected": -23.6939754486084, "step": 5680 }, { "epoch": 1.93, "learning_rate": 1.9740652146544127e-07, "logits/chosen": -2.8762524127960205, "logits/rejected": -2.6750481128692627, "logps/chosen": -275.6251525878906, "logps/rejected": -661.1127319335938, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 1.5074294805526733, "rewards/margins": 27.372119903564453, "rewards/rejected": -25.86469078063965, "step": 5690 }, { "epoch": 1.94, "learning_rate": 1.9677703638423766e-07, "logits/chosen": -2.873110294342041, "logits/rejected": -2.7290358543395996, "logps/chosen": -215.15914916992188, "logps/rejected": -610.2805786132812, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.6576387286186218, "rewards/margins": 30.1495361328125, "rewards/rejected": -29.491901397705078, "step": 5700 }, { "epoch": 1.94, "eval_logits/chosen": -2.9436776638031006, "eval_logits/rejected": -2.754331588745117, "eval_logps/chosen": -248.3372802734375, "eval_logps/rejected": -663.8614501953125, "eval_loss": 0.002738919574767351, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.461927056312561, "eval_rewards/margins": 27.63053321838379, "eval_rewards/rejected": -26.168603897094727, "eval_runtime": 463.3738, "eval_samples_per_second": 20.502, "eval_steps_per_second": 0.641, "step": 5700 }, { "epoch": 1.94, "learning_rate": 1.961475513030341e-07, "logits/chosen": -2.8087079524993896, "logits/rejected": -2.6897964477539062, "logps/chosen": -310.9383850097656, "logps/rejected": -719.6809692382812, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 1.464524507522583, "rewards/margins": 29.667705535888672, "rewards/rejected": -28.20318031311035, "step": 5710 }, { "epoch": 1.94, "learning_rate": 1.9551806622183054e-07, "logits/chosen": -2.899355411529541, "logits/rejected": -2.694115400314331, "logps/chosen": -198.91067504882812, "logps/rejected": -862.0003051757812, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.0132769346237183, "rewards/margins": 40.72917938232422, "rewards/rejected": -39.71590042114258, "step": 5720 }, { "epoch": 1.95, "learning_rate": 1.9488858114062696e-07, "logits/chosen": -2.8267452716827393, "logits/rejected": -2.791876792907715, "logps/chosen": -212.8760986328125, "logps/rejected": -613.7713012695312, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.2772358655929565, "rewards/margins": 33.93440246582031, "rewards/rejected": -32.657161712646484, "step": 5730 }, { "epoch": 1.95, "learning_rate": 1.9425909605942338e-07, "logits/chosen": -2.827893018722534, "logits/rejected": -2.811506986618042, "logps/chosen": -383.85443115234375, "logps/rejected": -681.3345947265625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.2061269283294678, "rewards/margins": 27.3925838470459, "rewards/rejected": -26.186458587646484, "step": 5740 }, { "epoch": 1.95, "learning_rate": 1.936296109782198e-07, "logits/chosen": -2.7803616523742676, "logits/rejected": -2.697388172149658, "logps/chosen": -250.2433624267578, "logps/rejected": -951.0523681640625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.7728372812271118, "rewards/margins": 35.46739959716797, "rewards/rejected": -34.69456481933594, "step": 5750 }, { "epoch": 1.96, "learning_rate": 1.9300012589701623e-07, "logits/chosen": -2.882336139678955, "logits/rejected": -2.6744227409362793, "logps/chosen": -275.05816650390625, "logps/rejected": -802.2880859375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.5482699871063232, "rewards/margins": 28.587512969970703, "rewards/rejected": -27.039241790771484, "step": 5760 }, { "epoch": 1.96, "learning_rate": 1.9237064081581268e-07, "logits/chosen": -2.8887083530426025, "logits/rejected": -2.843846321105957, "logps/chosen": -210.67282104492188, "logps/rejected": -766.5552978515625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.9817678928375244, "rewards/margins": 31.716556549072266, "rewards/rejected": -29.734790802001953, "step": 5770 }, { "epoch": 1.96, "learning_rate": 1.9174115573460907e-07, "logits/chosen": -2.9513111114501953, "logits/rejected": -2.7530534267425537, "logps/chosen": -271.60711669921875, "logps/rejected": -739.3729248046875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.474717378616333, "rewards/margins": 29.984683990478516, "rewards/rejected": -29.509963989257812, "step": 5780 }, { "epoch": 1.97, "learning_rate": 1.911116706534055e-07, "logits/chosen": -2.774172306060791, "logits/rejected": -2.7209994792938232, "logps/chosen": -335.79107666015625, "logps/rejected": -667.1422729492188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.0124547481536865, "rewards/margins": 27.706676483154297, "rewards/rejected": -26.6942195892334, "step": 5790 }, { "epoch": 1.97, "learning_rate": 1.9048218557220194e-07, "logits/chosen": -2.8875389099121094, "logits/rejected": -2.8062191009521484, "logps/chosen": -261.13665771484375, "logps/rejected": -600.215087890625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.119125485420227, "rewards/margins": 34.34370422363281, "rewards/rejected": -33.224578857421875, "step": 5800 }, { "epoch": 1.97, "eval_logits/chosen": -2.964097499847412, "eval_logits/rejected": -2.77774977684021, "eval_logps/chosen": -249.89120483398438, "eval_logps/rejected": -692.492919921875, "eval_loss": 0.0026250199880450964, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.3065277338027954, "eval_rewards/margins": 30.33829116821289, "eval_rewards/rejected": -29.031763076782227, "eval_runtime": 462.813, "eval_samples_per_second": 20.527, "eval_steps_per_second": 0.642, "step": 5800 }, { "epoch": 1.97, "learning_rate": 1.8985270049099837e-07, "logits/chosen": -2.8191709518432617, "logits/rejected": -2.726722002029419, "logps/chosen": -215.62728881835938, "logps/rejected": -790.0726318359375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.8161112070083618, "rewards/margins": 32.523216247558594, "rewards/rejected": -31.707111358642578, "step": 5810 }, { "epoch": 1.98, "learning_rate": 1.8922321540979476e-07, "logits/chosen": -2.8709492683410645, "logits/rejected": -2.7779974937438965, "logps/chosen": -328.52178955078125, "logps/rejected": -576.2950439453125, "loss": 0.0069, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1403292417526245, "rewards/margins": 30.873641967773438, "rewards/rejected": -29.73331642150879, "step": 5820 }, { "epoch": 1.98, "learning_rate": 1.885937303285912e-07, "logits/chosen": -2.9758176803588867, "logits/rejected": -2.707080841064453, "logps/chosen": -190.8971405029297, "logps/rejected": -773.41015625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.32652747631073, "rewards/margins": 33.95763397216797, "rewards/rejected": -32.631103515625, "step": 5830 }, { "epoch": 1.99, "learning_rate": 1.8796424524738764e-07, "logits/chosen": -2.97340726852417, "logits/rejected": -2.754401922225952, "logps/chosen": -269.1912841796875, "logps/rejected": -703.2109375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.8675940036773682, "rewards/margins": 36.85981750488281, "rewards/rejected": -34.992225646972656, "step": 5840 }, { "epoch": 1.99, "learning_rate": 1.8733476016618406e-07, "logits/chosen": -2.906461000442505, "logits/rejected": -2.73972749710083, "logps/chosen": -240.800537109375, "logps/rejected": -732.9202880859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.8957122564315796, "rewards/margins": 35.121116638183594, "rewards/rejected": -33.225406646728516, "step": 5850 }, { "epoch": 1.99, "learning_rate": 1.8670527508498048e-07, "logits/chosen": -2.9235737323760986, "logits/rejected": -2.7546324729919434, "logps/chosen": -201.50439453125, "logps/rejected": -941.4697265625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.052412748336792, "rewards/margins": 38.758296966552734, "rewards/rejected": -37.70588684082031, "step": 5860 }, { "epoch": 2.0, "learning_rate": 1.860757900037769e-07, "logits/chosen": -2.980717420578003, "logits/rejected": -2.779968738555908, "logps/chosen": -200.64306640625, "logps/rejected": -626.3231811523438, "loss": 0.0171, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4845117330551147, "rewards/margins": 33.3231315612793, "rewards/rejected": -31.8386173248291, "step": 5870 }, { "epoch": 2.0, "learning_rate": 1.8544630492257333e-07, "logits/chosen": -2.9368245601654053, "logits/rejected": -2.731804609298706, "logps/chosen": -194.9738006591797, "logps/rejected": -757.4953002929688, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 1.316489815711975, "rewards/margins": 31.71505355834961, "rewards/rejected": -30.3985652923584, "step": 5880 }, { "epoch": 2.0, "learning_rate": 1.8481681984136978e-07, "logits/chosen": -2.917823553085327, "logits/rejected": -2.759410858154297, "logps/chosen": -216.90316772460938, "logps/rejected": -582.5672607421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.0284442901611328, "rewards/margins": 31.268611907958984, "rewards/rejected": -30.240169525146484, "step": 5890 }, { "epoch": 2.01, "learning_rate": 1.8418733476016617e-07, "logits/chosen": -2.8762564659118652, "logits/rejected": -2.6749892234802246, "logps/chosen": -261.474853515625, "logps/rejected": -636.2882080078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.6503074169158936, "rewards/margins": 30.604434967041016, "rewards/rejected": -28.95412826538086, "step": 5900 }, { "epoch": 2.01, "eval_logits/chosen": -2.9806458950042725, "eval_logits/rejected": -2.791865110397339, "eval_logps/chosen": -249.49081420898438, "eval_logps/rejected": -720.9660034179688, "eval_loss": 0.0024441152345389128, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.3465689420700073, "eval_rewards/margins": 33.22563934326172, "eval_rewards/rejected": -31.879070281982422, "eval_runtime": 460.8842, "eval_samples_per_second": 20.613, "eval_steps_per_second": 0.644, "step": 5900 }, { "epoch": 2.01, "learning_rate": 1.835578496789626e-07, "logits/chosen": -2.8655407428741455, "logits/rejected": -2.767519950866699, "logps/chosen": -257.52349853515625, "logps/rejected": -716.7777099609375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.0493125915527344, "rewards/margins": 30.380319595336914, "rewards/rejected": -28.331012725830078, "step": 5910 }, { "epoch": 2.01, "learning_rate": 1.8292836459775904e-07, "logits/chosen": -2.819223165512085, "logits/rejected": -2.742497444152832, "logps/chosen": -311.83770751953125, "logps/rejected": -889.16943359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.4850528240203857, "rewards/margins": 37.37776184082031, "rewards/rejected": -35.89270782470703, "step": 5920 }, { "epoch": 2.02, "learning_rate": 1.8229887951655544e-07, "logits/chosen": -2.875936508178711, "logits/rejected": -2.718971014022827, "logps/chosen": -184.7015838623047, "logps/rejected": -721.0948486328125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.481442928314209, "rewards/margins": 32.925418853759766, "rewards/rejected": -31.443973541259766, "step": 5930 }, { "epoch": 2.02, "learning_rate": 1.8166939443535186e-07, "logits/chosen": -2.8755812644958496, "logits/rejected": -2.785675287246704, "logps/chosen": -266.2384338378906, "logps/rejected": -582.3145751953125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.134749174118042, "rewards/margins": 35.78017807006836, "rewards/rejected": -33.64543151855469, "step": 5940 }, { "epoch": 2.02, "learning_rate": 1.8103990935414829e-07, "logits/chosen": -2.9033710956573486, "logits/rejected": -2.7371506690979004, "logps/chosen": -204.21218872070312, "logps/rejected": -701.19970703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.096694827079773, "rewards/margins": 28.754989624023438, "rewards/rejected": -27.658294677734375, "step": 5950 }, { "epoch": 2.03, "learning_rate": 1.8041042427294474e-07, "logits/chosen": -2.8836519718170166, "logits/rejected": -2.801570177078247, "logps/chosen": -259.0220947265625, "logps/rejected": -501.26849365234375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.845820426940918, "rewards/margins": 32.068328857421875, "rewards/rejected": -31.222509384155273, "step": 5960 }, { "epoch": 2.03, "learning_rate": 1.7978093919174113e-07, "logits/chosen": -2.8323001861572266, "logits/rejected": -2.7539143562316895, "logps/chosen": -253.606689453125, "logps/rejected": -858.4147338867188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.6231054663658142, "rewards/margins": 34.01975631713867, "rewards/rejected": -33.39664840698242, "step": 5970 }, { "epoch": 2.03, "learning_rate": 1.7915145411053755e-07, "logits/chosen": -2.865943431854248, "logits/rejected": -2.714137315750122, "logps/chosen": -257.68511962890625, "logps/rejected": -802.4866333007812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.8411090970039368, "rewards/margins": 33.018760681152344, "rewards/rejected": -32.177650451660156, "step": 5980 }, { "epoch": 2.04, "learning_rate": 1.78521969029334e-07, "logits/chosen": -2.8496298789978027, "logits/rejected": -2.7569079399108887, "logps/chosen": -260.2218933105469, "logps/rejected": -880.4786987304688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.8645738363265991, "rewards/margins": 36.467803955078125, "rewards/rejected": -35.60322952270508, "step": 5990 }, { "epoch": 2.04, "learning_rate": 1.7789248394813043e-07, "logits/chosen": -2.838435649871826, "logits/rejected": -2.7610232830047607, "logps/chosen": -304.56866455078125, "logps/rejected": -638.4124145507812, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.980881929397583, "rewards/margins": 37.234413146972656, "rewards/rejected": -35.2535285949707, "step": 6000 }, { "epoch": 2.04, "eval_logits/chosen": -2.977421998977661, "eval_logits/rejected": -2.7884342670440674, "eval_logps/chosen": -250.50091552734375, "eval_logps/rejected": -735.6439819335938, "eval_loss": 0.002392939757555723, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.2455623149871826, "eval_rewards/margins": 34.59243392944336, "eval_rewards/rejected": -33.34687423706055, "eval_runtime": 461.7768, "eval_samples_per_second": 20.573, "eval_steps_per_second": 0.643, "step": 6000 }, { "epoch": 2.04, "learning_rate": 1.7726299886692682e-07, "logits/chosen": -2.9285895824432373, "logits/rejected": -2.7106502056121826, "logps/chosen": -228.721923828125, "logps/rejected": -688.5929565429688, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 0.8108099699020386, "rewards/margins": 35.09956359863281, "rewards/rejected": -34.288753509521484, "step": 6010 }, { "epoch": 2.05, "learning_rate": 1.7663351378572327e-07, "logits/chosen": -3.0028462409973145, "logits/rejected": -2.7727208137512207, "logps/chosen": -203.9446563720703, "logps/rejected": -1001.2998046875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.6054173111915588, "rewards/margins": 42.68828201293945, "rewards/rejected": -42.082862854003906, "step": 6020 }, { "epoch": 2.05, "learning_rate": 1.760040287045197e-07, "logits/chosen": -2.8452820777893066, "logits/rejected": -2.784881114959717, "logps/chosen": -397.2545471191406, "logps/rejected": -723.3806762695312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.17479096353054047, "rewards/margins": 35.34383773803711, "rewards/rejected": -35.16904830932617, "step": 6030 }, { "epoch": 2.05, "learning_rate": 1.7537454362331612e-07, "logits/chosen": -2.826493740081787, "logits/rejected": -2.723268508911133, "logps/chosen": -251.6018829345703, "logps/rejected": -688.95849609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.9028065800666809, "rewards/margins": 35.117591857910156, "rewards/rejected": -34.214786529541016, "step": 6040 }, { "epoch": 2.06, "learning_rate": 1.7474505854211254e-07, "logits/chosen": -2.800159454345703, "logits/rejected": -2.8392751216888428, "logps/chosen": -367.2532958984375, "logps/rejected": -761.01611328125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 1.2024954557418823, "rewards/margins": 40.47967529296875, "rewards/rejected": -39.277183532714844, "step": 6050 }, { "epoch": 2.06, "learning_rate": 1.7411557346090896e-07, "logits/chosen": -2.7597174644470215, "logits/rejected": -2.7971949577331543, "logps/chosen": -246.7077178955078, "logps/rejected": -677.0889892578125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.5760828256607056, "rewards/margins": 37.311851501464844, "rewards/rejected": -36.73577117919922, "step": 6060 }, { "epoch": 2.06, "learning_rate": 1.7348608837970539e-07, "logits/chosen": -2.891296863555908, "logits/rejected": -2.7776036262512207, "logps/chosen": -250.755615234375, "logps/rejected": -837.9459228515625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.6177400350570679, "rewards/margins": 39.81981658935547, "rewards/rejected": -38.20207977294922, "step": 6070 }, { "epoch": 2.07, "learning_rate": 1.7285660329850184e-07, "logits/chosen": -2.8535804748535156, "logits/rejected": -2.8270223140716553, "logps/chosen": -203.81573486328125, "logps/rejected": -818.34228515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.9415315389633179, "rewards/margins": 44.83909225463867, "rewards/rejected": -43.897560119628906, "step": 6080 }, { "epoch": 2.07, "learning_rate": 1.7222711821729823e-07, "logits/chosen": -2.8603405952453613, "logits/rejected": -2.7556862831115723, "logps/chosen": -192.4033966064453, "logps/rejected": -924.2728271484375, "loss": 0.0041, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.921868622303009, "rewards/margins": 41.85797882080078, "rewards/rejected": -40.93611145019531, "step": 6090 }, { "epoch": 2.07, "learning_rate": 1.7159763313609465e-07, "logits/chosen": -2.90516996383667, "logits/rejected": -2.817859649658203, "logps/chosen": -268.741943359375, "logps/rejected": -707.1231689453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.6274124979972839, "rewards/margins": 38.91834259033203, "rewards/rejected": -38.290924072265625, "step": 6100 }, { "epoch": 2.07, "eval_logits/chosen": -2.9850845336914062, "eval_logits/rejected": -2.803882598876953, "eval_logps/chosen": -254.84913635253906, "eval_logps/rejected": -774.6012573242188, "eval_loss": 0.005126514937728643, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.8107330799102783, "eval_rewards/margins": 38.0533332824707, "eval_rewards/rejected": -37.24260330200195, "eval_runtime": 461.0863, "eval_samples_per_second": 20.604, "eval_steps_per_second": 0.644, "step": 6100 }, { "epoch": 2.08, "learning_rate": 1.709681480548911e-07, "logits/chosen": -2.878788471221924, "logits/rejected": -2.8560779094696045, "logps/chosen": -196.96142578125, "logps/rejected": -660.3651123046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.5945838689804077, "rewards/margins": 37.3062629699707, "rewards/rejected": -36.71167755126953, "step": 6110 }, { "epoch": 2.08, "learning_rate": 1.7033866297368753e-07, "logits/chosen": -2.87274169921875, "logits/rejected": -2.787768840789795, "logps/chosen": -276.7560119628906, "logps/rejected": -821.8802490234375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.0633617639541626, "rewards/margins": 39.698387145996094, "rewards/rejected": -38.63502502441406, "step": 6120 }, { "epoch": 2.08, "learning_rate": 1.6970917789248392e-07, "logits/chosen": -2.9591166973114014, "logits/rejected": -2.8078365325927734, "logps/chosen": -237.8144073486328, "logps/rejected": -893.18359375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.30371737480163574, "rewards/margins": 37.66729736328125, "rewards/rejected": -37.363582611083984, "step": 6130 }, { "epoch": 2.09, "learning_rate": 1.6907969281128037e-07, "logits/chosen": -2.854177474975586, "logits/rejected": -2.8890793323516846, "logps/chosen": -322.14764404296875, "logps/rejected": -690.2075805664062, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.9070094227790833, "rewards/margins": 34.909423828125, "rewards/rejected": -34.002418518066406, "step": 6140 }, { "epoch": 2.09, "learning_rate": 1.684502077300768e-07, "logits/chosen": -2.899559736251831, "logits/rejected": -2.7852818965911865, "logps/chosen": -272.46978759765625, "logps/rejected": -726.4034423828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.2762537002563477, "rewards/margins": 35.512123107910156, "rewards/rejected": -34.235870361328125, "step": 6150 }, { "epoch": 2.09, "learning_rate": 1.678207226488732e-07, "logits/chosen": -2.8938064575195312, "logits/rejected": -2.7758400440216064, "logps/chosen": -252.83889770507812, "logps/rejected": -655.7871704101562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.5005686283111572, "rewards/margins": 33.911659240722656, "rewards/rejected": -32.41109085083008, "step": 6160 }, { "epoch": 2.1, "learning_rate": 1.6719123756766964e-07, "logits/chosen": -2.9249441623687744, "logits/rejected": -2.7524750232696533, "logps/chosen": -271.9706726074219, "logps/rejected": -903.2515869140625, "loss": 0.0013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4147192239761353, "rewards/margins": 39.80329132080078, "rewards/rejected": -38.388572692871094, "step": 6170 }, { "epoch": 2.1, "learning_rate": 1.6656175248646606e-07, "logits/chosen": -2.843553066253662, "logits/rejected": -2.8588473796844482, "logps/chosen": -225.41970825195312, "logps/rejected": -720.2731323242188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.1085476875305176, "rewards/margins": 36.435401916503906, "rewards/rejected": -35.32685089111328, "step": 6180 }, { "epoch": 2.1, "learning_rate": 1.6593226740526249e-07, "logits/chosen": -2.9223110675811768, "logits/rejected": -2.7816576957702637, "logps/chosen": -181.74856567382812, "logps/rejected": -621.9804077148438, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 0.8904901742935181, "rewards/margins": 36.559539794921875, "rewards/rejected": -35.66904830932617, "step": 6190 }, { "epoch": 2.11, "learning_rate": 1.653027823240589e-07, "logits/chosen": -2.899003028869629, "logits/rejected": -2.7001049518585205, "logps/chosen": -200.4274139404297, "logps/rejected": -806.3859252929688, "loss": 0.0025, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.34807711839675903, "rewards/margins": 36.51169204711914, "rewards/rejected": -36.163612365722656, "step": 6200 }, { "epoch": 2.11, "eval_logits/chosen": -2.964122772216797, "eval_logits/rejected": -2.7910373210906982, "eval_logps/chosen": -250.85247802734375, "eval_logps/rejected": -729.1439819335938, "eval_loss": 0.005295043345540762, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.2104049921035767, "eval_rewards/margins": 33.907264709472656, "eval_rewards/rejected": -32.69685745239258, "eval_runtime": 461.6848, "eval_samples_per_second": 20.577, "eval_steps_per_second": 0.643, "step": 6200 }, { "epoch": 2.11, "learning_rate": 1.6467329724285533e-07, "logits/chosen": -2.909715175628662, "logits/rejected": -2.7117249965667725, "logps/chosen": -212.5810546875, "logps/rejected": -819.9437255859375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.593856930732727, "rewards/margins": 31.583593368530273, "rewards/rejected": -30.989736557006836, "step": 6210 }, { "epoch": 2.11, "learning_rate": 1.6404381216165175e-07, "logits/chosen": -2.8796467781066895, "logits/rejected": -2.7389557361602783, "logps/chosen": -203.38412475585938, "logps/rejected": -694.8117065429688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.9334151148796082, "rewards/margins": 35.621917724609375, "rewards/rejected": -34.68850326538086, "step": 6220 }, { "epoch": 2.12, "learning_rate": 1.634143270804482e-07, "logits/chosen": -2.944577693939209, "logits/rejected": -2.7211403846740723, "logps/chosen": -233.1695098876953, "logps/rejected": -642.7702026367188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.2761809825897217, "rewards/margins": 31.828277587890625, "rewards/rejected": -30.552099227905273, "step": 6230 }, { "epoch": 2.12, "learning_rate": 1.627848419992446e-07, "logits/chosen": -2.7696022987365723, "logits/rejected": -2.8079540729522705, "logps/chosen": -312.8160705566406, "logps/rejected": -932.4901123046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.1353214979171753, "rewards/margins": 39.102149963378906, "rewards/rejected": -37.966827392578125, "step": 6240 }, { "epoch": 2.12, "learning_rate": 1.6215535691804102e-07, "logits/chosen": -2.8144540786743164, "logits/rejected": -2.6835272312164307, "logps/chosen": -249.94577026367188, "logps/rejected": -706.3253784179688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.9098288416862488, "rewards/margins": 38.05469512939453, "rewards/rejected": -37.14486312866211, "step": 6250 }, { "epoch": 2.13, "learning_rate": 1.6152587183683747e-07, "logits/chosen": -2.946385622024536, "logits/rejected": -2.781968593597412, "logps/chosen": -201.70193481445312, "logps/rejected": -842.0213623046875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.6692003607749939, "rewards/margins": 37.30356979370117, "rewards/rejected": -36.63437271118164, "step": 6260 }, { "epoch": 2.13, "learning_rate": 1.608963867556339e-07, "logits/chosen": -2.9085464477539062, "logits/rejected": -2.8225016593933105, "logps/chosen": -206.59738159179688, "logps/rejected": -620.7462768554688, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.3949991762638092, "rewards/margins": 32.083953857421875, "rewards/rejected": -31.688955307006836, "step": 6270 }, { "epoch": 2.13, "learning_rate": 1.602669016744303e-07, "logits/chosen": -2.8093390464782715, "logits/rejected": -2.7728271484375, "logps/chosen": -321.5029602050781, "logps/rejected": -992.2282104492188, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.4271771907806396, "rewards/margins": 35.46192169189453, "rewards/rejected": -34.03474807739258, "step": 6280 }, { "epoch": 2.14, "learning_rate": 1.5963741659322674e-07, "logits/chosen": -2.8791327476501465, "logits/rejected": -2.6643471717834473, "logps/chosen": -213.18753051757812, "logps/rejected": -811.97265625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.429626226425171, "rewards/margins": 35.81269073486328, "rewards/rejected": -34.383060455322266, "step": 6290 }, { "epoch": 2.14, "learning_rate": 1.5900793151202316e-07, "logits/chosen": -2.863936185836792, "logits/rejected": -2.8204338550567627, "logps/chosen": -194.96910095214844, "logps/rejected": -736.7196044921875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.1715142726898193, "rewards/margins": 30.449565887451172, "rewards/rejected": -29.27805519104004, "step": 6300 }, { "epoch": 2.14, "eval_logits/chosen": -2.972797155380249, "eval_logits/rejected": -2.7886269092559814, "eval_logps/chosen": -247.23316955566406, "eval_logps/rejected": -709.8519287109375, "eval_loss": 0.004952012095600367, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.572334885597229, "eval_rewards/margins": 32.339996337890625, "eval_rewards/rejected": -30.767656326293945, "eval_runtime": 461.929, "eval_samples_per_second": 20.566, "eval_steps_per_second": 0.643, "step": 6300 }, { "epoch": 2.14, "learning_rate": 1.5837844643081959e-07, "logits/chosen": -2.957728624343872, "logits/rejected": -2.7860829830169678, "logps/chosen": -184.76364135742188, "logps/rejected": -890.0479736328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.1737959384918213, "rewards/margins": 35.284515380859375, "rewards/rejected": -34.110721588134766, "step": 6310 }, { "epoch": 2.15, "learning_rate": 1.57748961349616e-07, "logits/chosen": -2.92635178565979, "logits/rejected": -2.780294179916382, "logps/chosen": -260.75494384765625, "logps/rejected": -960.25390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.6903190612792969, "rewards/margins": 38.267059326171875, "rewards/rejected": -36.576744079589844, "step": 6320 }, { "epoch": 2.15, "learning_rate": 1.5711947626841243e-07, "logits/chosen": -2.779329299926758, "logits/rejected": -2.7661843299865723, "logps/chosen": -345.35791015625, "logps/rejected": -577.3766479492188, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.922202706336975, "rewards/margins": 31.3266544342041, "rewards/rejected": -29.40445327758789, "step": 6330 }, { "epoch": 2.15, "learning_rate": 1.5648999118720885e-07, "logits/chosen": -2.8253235816955566, "logits/rejected": -2.7807722091674805, "logps/chosen": -271.3294982910156, "logps/rejected": -651.1011352539062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.7462135553359985, "rewards/margins": 36.773597717285156, "rewards/rejected": -35.027381896972656, "step": 6340 }, { "epoch": 2.16, "learning_rate": 1.558605061060053e-07, "logits/chosen": -2.8480679988861084, "logits/rejected": -2.734166383743286, "logps/chosen": -216.5483856201172, "logps/rejected": -795.8843994140625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 1.5268714427947998, "rewards/margins": 39.908851623535156, "rewards/rejected": -38.381980895996094, "step": 6350 }, { "epoch": 2.16, "learning_rate": 1.552310210248017e-07, "logits/chosen": -2.816094398498535, "logits/rejected": -2.7034218311309814, "logps/chosen": -310.57305908203125, "logps/rejected": -862.2596435546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.2796406745910645, "rewards/margins": 35.35778045654297, "rewards/rejected": -33.07814025878906, "step": 6360 }, { "epoch": 2.17, "learning_rate": 1.5460153594359812e-07, "logits/chosen": -2.7915420532226562, "logits/rejected": -2.671781539916992, "logps/chosen": -311.2373046875, "logps/rejected": -745.3869018554688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.031716823577881, "rewards/margins": 34.328250885009766, "rewards/rejected": -32.296531677246094, "step": 6370 }, { "epoch": 2.17, "learning_rate": 1.5397205086239457e-07, "logits/chosen": -2.8571438789367676, "logits/rejected": -2.6755595207214355, "logps/chosen": -246.83767700195312, "logps/rejected": -844.0206909179688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.091219186782837, "rewards/margins": 38.183815002441406, "rewards/rejected": -36.092594146728516, "step": 6380 }, { "epoch": 2.17, "learning_rate": 1.5334256578119097e-07, "logits/chosen": -2.900085926055908, "logits/rejected": -2.7091336250305176, "logps/chosen": -258.3498229980469, "logps/rejected": -711.0010986328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.371865749359131, "rewards/margins": 32.45520782470703, "rewards/rejected": -30.083343505859375, "step": 6390 }, { "epoch": 2.18, "learning_rate": 1.527130806999874e-07, "logits/chosen": -2.8468496799468994, "logits/rejected": -2.7841804027557373, "logps/chosen": -317.0931091308594, "logps/rejected": -583.2691650390625, "loss": 0.0062, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2529497146606445, "rewards/margins": 32.34334945678711, "rewards/rejected": -30.090396881103516, "step": 6400 }, { "epoch": 2.18, "eval_logits/chosen": -2.953019618988037, "eval_logits/rejected": -2.7688770294189453, "eval_logps/chosen": -252.5022430419922, "eval_logps/rejected": -726.55029296875, "eval_loss": 0.006201328244060278, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 1.04542875289917, "eval_rewards/margins": 33.48291778564453, "eval_rewards/rejected": -32.4374885559082, "eval_runtime": 461.1268, "eval_samples_per_second": 20.602, "eval_steps_per_second": 0.644, "step": 6400 }, { "epoch": 2.18, "learning_rate": 1.5208359561878384e-07, "logits/chosen": -2.826190948486328, "logits/rejected": -2.7620136737823486, "logps/chosen": -251.93191528320312, "logps/rejected": -669.4815673828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.1571775674819946, "rewards/margins": 32.933082580566406, "rewards/rejected": -31.77590560913086, "step": 6410 }, { "epoch": 2.18, "learning_rate": 1.5145411053758026e-07, "logits/chosen": -2.794912815093994, "logits/rejected": -2.849370241165161, "logps/chosen": -252.63980102539062, "logps/rejected": -724.2649536132812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.4321998357772827, "rewards/margins": 37.97924041748047, "rewards/rejected": -36.54704284667969, "step": 6420 }, { "epoch": 2.19, "learning_rate": 1.5082462545637666e-07, "logits/chosen": -2.9952681064605713, "logits/rejected": -2.7988531589508057, "logps/chosen": -204.57943725585938, "logps/rejected": -589.1400146484375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.7278350591659546, "rewards/margins": 33.63867950439453, "rewards/rejected": -32.91083908081055, "step": 6430 }, { "epoch": 2.19, "learning_rate": 1.501951403751731e-07, "logits/chosen": -2.852402925491333, "logits/rejected": -2.759084463119507, "logps/chosen": -193.99783325195312, "logps/rejected": -740.3551025390625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.8674389123916626, "rewards/margins": 35.4468994140625, "rewards/rejected": -33.57946014404297, "step": 6440 }, { "epoch": 2.19, "learning_rate": 1.4956565529396953e-07, "logits/chosen": -2.866487979888916, "logits/rejected": -2.809661388397217, "logps/chosen": -206.09365844726562, "logps/rejected": -697.0120849609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.6737501621246338, "rewards/margins": 34.778953552246094, "rewards/rejected": -33.10520553588867, "step": 6450 }, { "epoch": 2.2, "learning_rate": 1.4893617021276595e-07, "logits/chosen": -2.837038516998291, "logits/rejected": -2.690638542175293, "logps/chosen": -210.99221801757812, "logps/rejected": -800.6724243164062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.3208402395248413, "rewards/margins": 36.44570541381836, "rewards/rejected": -35.12486267089844, "step": 6460 }, { "epoch": 2.2, "learning_rate": 1.4830668513156238e-07, "logits/chosen": -2.99686861038208, "logits/rejected": -2.806051731109619, "logps/chosen": -177.14239501953125, "logps/rejected": -511.55767822265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.5078082084655762, "rewards/margins": 33.12656784057617, "rewards/rejected": -31.618755340576172, "step": 6470 }, { "epoch": 2.2, "learning_rate": 1.476772000503588e-07, "logits/chosen": -2.908655881881714, "logits/rejected": -2.6778323650360107, "logps/chosen": -198.29550170898438, "logps/rejected": -642.8115234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.574456810951233, "rewards/margins": 32.94312286376953, "rewards/rejected": -31.368667602539062, "step": 6480 }, { "epoch": 2.21, "learning_rate": 1.4704771496915522e-07, "logits/chosen": -2.816061496734619, "logits/rejected": -2.7578587532043457, "logps/chosen": -209.73300170898438, "logps/rejected": -681.6697998046875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 1.4110932350158691, "rewards/margins": 37.92109680175781, "rewards/rejected": -36.51000213623047, "step": 6490 }, { "epoch": 2.21, "learning_rate": 1.4641822988795167e-07, "logits/chosen": -2.8126578330993652, "logits/rejected": -2.747807264328003, "logps/chosen": -198.5749969482422, "logps/rejected": -696.9048461914062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.1815764904022217, "rewards/margins": 29.548873901367188, "rewards/rejected": -28.367298126220703, "step": 6500 }, { "epoch": 2.21, "eval_logits/chosen": -2.9271626472473145, "eval_logits/rejected": -2.745046377182007, "eval_logps/chosen": -249.99899291992188, "eval_logps/rejected": -730.9217529296875, "eval_loss": 0.005013682879507542, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.2957507371902466, "eval_rewards/margins": 34.170387268066406, "eval_rewards/rejected": -32.8746337890625, "eval_runtime": 462.4803, "eval_samples_per_second": 20.541, "eval_steps_per_second": 0.642, "step": 6500 }, { "epoch": 2.21, "learning_rate": 1.4578874480674807e-07, "logits/chosen": -2.8538568019866943, "logits/rejected": -2.742309093475342, "logps/chosen": -296.4598693847656, "logps/rejected": -683.8643798828125, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9654844999313354, "rewards/margins": 34.71126937866211, "rewards/rejected": -33.745784759521484, "step": 6510 }, { "epoch": 2.22, "learning_rate": 1.451592597255445e-07, "logits/chosen": -2.724020481109619, "logits/rejected": -2.7483139038085938, "logps/chosen": -290.93194580078125, "logps/rejected": -668.3776245117188, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.045581579208374, "rewards/margins": 33.491546630859375, "rewards/rejected": -31.445964813232422, "step": 6520 }, { "epoch": 2.22, "learning_rate": 1.4452977464434094e-07, "logits/chosen": -2.8333933353424072, "logits/rejected": -2.666412591934204, "logps/chosen": -235.8727264404297, "logps/rejected": -668.5108032226562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.8813183307647705, "rewards/margins": 30.990671157836914, "rewards/rejected": -29.10935401916504, "step": 6530 }, { "epoch": 2.22, "learning_rate": 1.4390028956313736e-07, "logits/chosen": -2.9098212718963623, "logits/rejected": -2.8171534538269043, "logps/chosen": -195.0373992919922, "logps/rejected": -742.1309814453125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.1326677799224854, "rewards/margins": 36.800025939941406, "rewards/rejected": -34.667354583740234, "step": 6540 }, { "epoch": 2.23, "learning_rate": 1.4327080448193376e-07, "logits/chosen": -2.805898427963257, "logits/rejected": -2.73037052154541, "logps/chosen": -367.656005859375, "logps/rejected": -641.3599243164062, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5067262649536133, "rewards/margins": 32.68325424194336, "rewards/rejected": -31.176528930664062, "step": 6550 }, { "epoch": 2.23, "learning_rate": 1.426413194007302e-07, "logits/chosen": -2.948568820953369, "logits/rejected": -2.814675807952881, "logps/chosen": -201.19239807128906, "logps/rejected": -669.7403564453125, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 2.4859707355499268, "rewards/margins": 33.071434020996094, "rewards/rejected": -30.585460662841797, "step": 6560 }, { "epoch": 2.23, "learning_rate": 1.4201183431952663e-07, "logits/chosen": -2.912996292114258, "logits/rejected": -2.746863842010498, "logps/chosen": -256.2448425292969, "logps/rejected": -900.3212890625, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 1.700669527053833, "rewards/margins": 32.63864517211914, "rewards/rejected": -30.937976837158203, "step": 6570 }, { "epoch": 2.24, "learning_rate": 1.4138234923832303e-07, "logits/chosen": -2.8576042652130127, "logits/rejected": -2.7083358764648438, "logps/chosen": -255.2916717529297, "logps/rejected": -713.992431640625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.374990224838257, "rewards/margins": 31.791086196899414, "rewards/rejected": -29.416095733642578, "step": 6580 }, { "epoch": 2.24, "learning_rate": 1.4075286415711948e-07, "logits/chosen": -2.930135726928711, "logits/rejected": -2.8600940704345703, "logps/chosen": -259.8876647949219, "logps/rejected": -598.64501953125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.9909387826919556, "rewards/margins": 32.24399185180664, "rewards/rejected": -30.2530517578125, "step": 6590 }, { "epoch": 2.24, "learning_rate": 1.401233790759159e-07, "logits/chosen": -2.8502655029296875, "logits/rejected": -2.754793882369995, "logps/chosen": -291.9371337890625, "logps/rejected": -664.8497314453125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.6366865634918213, "rewards/margins": 29.300161361694336, "rewards/rejected": -27.66347312927246, "step": 6600 }, { "epoch": 2.24, "eval_logits/chosen": -2.967190980911255, "eval_logits/rejected": -2.7934722900390625, "eval_logps/chosen": -244.50128173828125, "eval_logps/rejected": -699.8776245117188, "eval_loss": 0.0034650887828320265, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.845523715019226, "eval_rewards/margins": 31.615753173828125, "eval_rewards/rejected": -29.770227432250977, "eval_runtime": 462.4318, "eval_samples_per_second": 20.544, "eval_steps_per_second": 0.642, "step": 6600 }, { "epoch": 2.25, "learning_rate": 1.3949389399471232e-07, "logits/chosen": -2.926976203918457, "logits/rejected": -2.819268226623535, "logps/chosen": -185.74819946289062, "logps/rejected": -717.5703735351562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.1767642498016357, "rewards/margins": 34.659889221191406, "rewards/rejected": -32.483123779296875, "step": 6610 }, { "epoch": 2.25, "learning_rate": 1.3886440891350874e-07, "logits/chosen": -2.8294992446899414, "logits/rejected": -2.8871030807495117, "logps/chosen": -259.6488037109375, "logps/rejected": -596.0245971679688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.3721041679382324, "rewards/margins": 35.177635192871094, "rewards/rejected": -32.80553436279297, "step": 6620 }, { "epoch": 2.25, "learning_rate": 1.3823492383230517e-07, "logits/chosen": -2.9230117797851562, "logits/rejected": -2.813938617706299, "logps/chosen": -195.14869689941406, "logps/rejected": -792.7903442382812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.3939220905303955, "rewards/margins": 34.775917053222656, "rewards/rejected": -32.381996154785156, "step": 6630 }, { "epoch": 2.26, "learning_rate": 1.376054387511016e-07, "logits/chosen": -2.891641616821289, "logits/rejected": -2.8508944511413574, "logps/chosen": -214.0260772705078, "logps/rejected": -702.675537109375, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5022273063659668, "rewards/margins": 32.760799407958984, "rewards/rejected": -31.25857162475586, "step": 6640 }, { "epoch": 2.26, "learning_rate": 1.36975953669898e-07, "logits/chosen": -2.8298850059509277, "logits/rejected": -2.7986233234405518, "logps/chosen": -239.6798095703125, "logps/rejected": -589.5087280273438, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.6225337982177734, "rewards/margins": 31.002025604248047, "rewards/rejected": -29.37949562072754, "step": 6650 }, { "epoch": 2.26, "learning_rate": 1.3634646858869444e-07, "logits/chosen": -2.898169994354248, "logits/rejected": -2.7683115005493164, "logps/chosen": -207.80050659179688, "logps/rejected": -599.37841796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.806452751159668, "rewards/margins": 35.48632049560547, "rewards/rejected": -33.67986297607422, "step": 6660 }, { "epoch": 2.27, "learning_rate": 1.3571698350749086e-07, "logits/chosen": -2.8748412132263184, "logits/rejected": -2.8128647804260254, "logps/chosen": -248.04281616210938, "logps/rejected": -705.0379638671875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 2.3167214393615723, "rewards/margins": 34.01567077636719, "rewards/rejected": -31.698949813842773, "step": 6670 }, { "epoch": 2.27, "learning_rate": 1.3508749842628728e-07, "logits/chosen": -2.8343710899353027, "logits/rejected": -2.767008066177368, "logps/chosen": -236.0721435546875, "logps/rejected": -641.6688842773438, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.8973774909973145, "rewards/margins": 27.978378295898438, "rewards/rejected": -25.080997467041016, "step": 6680 }, { "epoch": 2.27, "learning_rate": 1.3445801334508373e-07, "logits/chosen": -2.7863447666168213, "logits/rejected": -2.8104686737060547, "logps/chosen": -296.5308532714844, "logps/rejected": -492.69140625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.1710474491119385, "rewards/margins": 30.84963035583496, "rewards/rejected": -28.6785831451416, "step": 6690 }, { "epoch": 2.28, "learning_rate": 1.3382852826388013e-07, "logits/chosen": -2.8803343772888184, "logits/rejected": -2.7430243492126465, "logps/chosen": -196.5672607421875, "logps/rejected": -601.2169189453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.960148811340332, "rewards/margins": 30.1269588470459, "rewards/rejected": -28.16680908203125, "step": 6700 }, { "epoch": 2.28, "eval_logits/chosen": -2.980076789855957, "eval_logits/rejected": -2.8073980808258057, "eval_logps/chosen": -242.7506866455078, "eval_logps/rejected": -702.1669311523438, "eval_loss": 0.004025810863822699, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 2.020583152770996, "eval_rewards/margins": 32.01973342895508, "eval_rewards/rejected": -29.999155044555664, "eval_runtime": 462.2364, "eval_samples_per_second": 20.552, "eval_steps_per_second": 0.643, "step": 6700 }, { "epoch": 2.28, "learning_rate": 1.3319904318267655e-07, "logits/chosen": -2.8463196754455566, "logits/rejected": -2.7651925086975098, "logps/chosen": -242.87747192382812, "logps/rejected": -743.3735961914062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.275507926940918, "rewards/margins": 32.503204345703125, "rewards/rejected": -30.22769546508789, "step": 6710 }, { "epoch": 2.28, "learning_rate": 1.32569558101473e-07, "logits/chosen": -2.8666844367980957, "logits/rejected": -2.828458786010742, "logps/chosen": -205.630859375, "logps/rejected": -677.1856079101562, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 2.0754740238189697, "rewards/margins": 30.86680030822754, "rewards/rejected": -28.79132652282715, "step": 6720 }, { "epoch": 2.29, "learning_rate": 1.3194007302026942e-07, "logits/chosen": -2.8675484657287598, "logits/rejected": -2.8063743114471436, "logps/chosen": -230.78573608398438, "logps/rejected": -613.0828857421875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.2249460220336914, "rewards/margins": 33.10938262939453, "rewards/rejected": -30.884435653686523, "step": 6730 }, { "epoch": 2.29, "learning_rate": 1.3131058793906582e-07, "logits/chosen": -2.7744717597961426, "logits/rejected": -2.7267651557922363, "logps/chosen": -296.0487365722656, "logps/rejected": -657.561767578125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.438033103942871, "rewards/margins": 32.15264129638672, "rewards/rejected": -29.714611053466797, "step": 6740 }, { "epoch": 2.29, "learning_rate": 1.3068110285786227e-07, "logits/chosen": -2.862522840499878, "logits/rejected": -2.713259220123291, "logps/chosen": -238.83340454101562, "logps/rejected": -594.6341552734375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.368257761001587, "rewards/margins": 30.621322631835938, "rewards/rejected": -28.253063201904297, "step": 6750 }, { "epoch": 2.3, "learning_rate": 1.300516177766587e-07, "logits/chosen": -2.9005699157714844, "logits/rejected": -2.7510528564453125, "logps/chosen": -233.699462890625, "logps/rejected": -656.0194091796875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 2.1575751304626465, "rewards/margins": 33.58635330200195, "rewards/rejected": -31.428781509399414, "step": 6760 }, { "epoch": 2.3, "learning_rate": 1.294221326954551e-07, "logits/chosen": -2.8968067169189453, "logits/rejected": -2.7885546684265137, "logps/chosen": -259.97833251953125, "logps/rejected": -972.9719848632812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.5536123514175415, "rewards/margins": 45.83961868286133, "rewards/rejected": -44.28600311279297, "step": 6770 }, { "epoch": 2.3, "learning_rate": 1.2879264761425154e-07, "logits/chosen": -2.932699203491211, "logits/rejected": -2.8351573944091797, "logps/chosen": -211.8537139892578, "logps/rejected": -625.6563720703125, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": 1.3130404949188232, "rewards/margins": 37.16905212402344, "rewards/rejected": -35.85601043701172, "step": 6780 }, { "epoch": 2.31, "learning_rate": 1.2816316253304796e-07, "logits/chosen": -2.9588351249694824, "logits/rejected": -2.7706549167633057, "logps/chosen": -174.28515625, "logps/rejected": -673.1832275390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.457035779953003, "rewards/margins": 35.412818908691406, "rewards/rejected": -33.95578384399414, "step": 6790 }, { "epoch": 2.31, "learning_rate": 1.2753367745184438e-07, "logits/chosen": -2.860121488571167, "logits/rejected": -2.7706971168518066, "logps/chosen": -258.8064270019531, "logps/rejected": -554.7119750976562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.8876125812530518, "rewards/margins": 32.657413482666016, "rewards/rejected": -30.769800186157227, "step": 6800 }, { "epoch": 2.31, "eval_logits/chosen": -2.9749245643615723, "eval_logits/rejected": -2.7943482398986816, "eval_logps/chosen": -247.71817016601562, "eval_logps/rejected": -738.4593505859375, "eval_loss": 0.004223205149173737, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.523833990097046, "eval_rewards/margins": 35.1522331237793, "eval_rewards/rejected": -33.628395080566406, "eval_runtime": 462.24, "eval_samples_per_second": 20.552, "eval_steps_per_second": 0.643, "step": 6800 }, { "epoch": 2.31, "learning_rate": 1.2690419237064083e-07, "logits/chosen": -2.7946372032165527, "logits/rejected": -2.734610080718994, "logps/chosen": -369.85418701171875, "logps/rejected": -776.8677978515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.6659796237945557, "rewards/margins": 35.6276741027832, "rewards/rejected": -33.961692810058594, "step": 6810 }, { "epoch": 2.32, "learning_rate": 1.2627470728943723e-07, "logits/chosen": -2.878134250640869, "logits/rejected": -2.667599678039551, "logps/chosen": -331.798828125, "logps/rejected": -825.4573974609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.0393176078796387, "rewards/margins": 35.9044189453125, "rewards/rejected": -33.8651008605957, "step": 6820 }, { "epoch": 2.32, "learning_rate": 1.2564522220823365e-07, "logits/chosen": -2.891584873199463, "logits/rejected": -2.7043325901031494, "logps/chosen": -261.2659606933594, "logps/rejected": -798.1756591796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.106187105178833, "rewards/margins": 37.21535110473633, "rewards/rejected": -35.10916519165039, "step": 6830 }, { "epoch": 2.32, "learning_rate": 1.250157371270301e-07, "logits/chosen": -2.8865151405334473, "logits/rejected": -2.7472994327545166, "logps/chosen": -263.88555908203125, "logps/rejected": -658.2882690429688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.4573724269866943, "rewards/margins": 37.42094039916992, "rewards/rejected": -35.96356964111328, "step": 6840 }, { "epoch": 2.33, "learning_rate": 1.243862520458265e-07, "logits/chosen": -2.9648311138153076, "logits/rejected": -2.7427048683166504, "logps/chosen": -246.5808868408203, "logps/rejected": -618.0371704101562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.4643669128417969, "rewards/margins": 33.96628952026367, "rewards/rejected": -32.501922607421875, "step": 6850 }, { "epoch": 2.33, "learning_rate": 1.2375676696462294e-07, "logits/chosen": -2.83660888671875, "logits/rejected": -2.75048828125, "logps/chosen": -275.4791564941406, "logps/rejected": -844.1605224609375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.5744662284851074, "rewards/margins": 35.16064453125, "rewards/rejected": -34.586181640625, "step": 6860 }, { "epoch": 2.34, "learning_rate": 1.2312728188341934e-07, "logits/chosen": -2.877854585647583, "logits/rejected": -2.7021849155426025, "logps/chosen": -223.7528533935547, "logps/rejected": -814.8102416992188, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.129342794418335, "rewards/margins": 40.59099578857422, "rewards/rejected": -39.46165466308594, "step": 6870 }, { "epoch": 2.34, "learning_rate": 1.224977968022158e-07, "logits/chosen": -2.8396759033203125, "logits/rejected": -2.791139841079712, "logps/chosen": -305.5686340332031, "logps/rejected": -718.57470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.7580264806747437, "rewards/margins": 38.50865936279297, "rewards/rejected": -36.75062942504883, "step": 6880 }, { "epoch": 2.34, "learning_rate": 1.218683117210122e-07, "logits/chosen": -2.9255530834198, "logits/rejected": -2.7770631313323975, "logps/chosen": -196.04710388183594, "logps/rejected": -817.8995361328125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.7630962133407593, "rewards/margins": 35.8356819152832, "rewards/rejected": -34.07258605957031, "step": 6890 }, { "epoch": 2.35, "learning_rate": 1.2123882663980863e-07, "logits/chosen": -2.803342819213867, "logits/rejected": -2.7385268211364746, "logps/chosen": -282.10919189453125, "logps/rejected": -816.6295166015625, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": 1.0018178224563599, "rewards/margins": 31.63300132751465, "rewards/rejected": -30.63118553161621, "step": 6900 }, { "epoch": 2.35, "eval_logits/chosen": -2.9606316089630127, "eval_logits/rejected": -2.777897834777832, "eval_logps/chosen": -255.3953857421875, "eval_logps/rejected": -754.1393432617188, "eval_loss": 0.003949255682528019, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.7561129927635193, "eval_rewards/margins": 35.9525146484375, "eval_rewards/rejected": -35.1963996887207, "eval_runtime": 461.7875, "eval_samples_per_second": 20.572, "eval_steps_per_second": 0.643, "step": 6900 }, { "epoch": 2.35, "learning_rate": 1.2060934155860506e-07, "logits/chosen": -2.8755500316619873, "logits/rejected": -2.7596487998962402, "logps/chosen": -222.6465301513672, "logps/rejected": -664.4992065429688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.554885745048523, "rewards/margins": 35.28291320800781, "rewards/rejected": -34.72802734375, "step": 6910 }, { "epoch": 2.35, "learning_rate": 1.1997985647740148e-07, "logits/chosen": -2.9587173461914062, "logits/rejected": -2.6760454177856445, "logps/chosen": -212.9420623779297, "logps/rejected": -724.3406982421875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.4477721452713013, "rewards/margins": 33.811851501464844, "rewards/rejected": -32.364078521728516, "step": 6920 }, { "epoch": 2.36, "learning_rate": 1.193503713961979e-07, "logits/chosen": -2.7842495441436768, "logits/rejected": -2.65433931350708, "logps/chosen": -355.6415710449219, "logps/rejected": -989.3406982421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.1173064708709717, "rewards/margins": 38.22886657714844, "rewards/rejected": -36.1115608215332, "step": 6930 }, { "epoch": 2.36, "learning_rate": 1.1872088631499433e-07, "logits/chosen": -2.9348652362823486, "logits/rejected": -2.7323825359344482, "logps/chosen": -235.4287567138672, "logps/rejected": -834.8053588867188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.48224806785583496, "rewards/margins": 39.8243408203125, "rewards/rejected": -39.34209060668945, "step": 6940 }, { "epoch": 2.36, "learning_rate": 1.1809140123379076e-07, "logits/chosen": -2.8480896949768066, "logits/rejected": -2.7182576656341553, "logps/chosen": -226.03060913085938, "logps/rejected": -774.3438720703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.9855741262435913, "rewards/margins": 33.30661392211914, "rewards/rejected": -31.321033477783203, "step": 6950 }, { "epoch": 2.37, "learning_rate": 1.1746191615258717e-07, "logits/chosen": -2.8448469638824463, "logits/rejected": -2.788433790206909, "logps/chosen": -210.885009765625, "logps/rejected": -797.6231689453125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.4180879592895508, "rewards/margins": 37.67543029785156, "rewards/rejected": -36.25734329223633, "step": 6960 }, { "epoch": 2.37, "learning_rate": 1.1683243107138361e-07, "logits/chosen": -2.837432384490967, "logits/rejected": -2.7519171237945557, "logps/chosen": -272.05780029296875, "logps/rejected": -733.4521484375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.8562266826629639, "rewards/margins": 41.41218566894531, "rewards/rejected": -39.55595397949219, "step": 6970 }, { "epoch": 2.37, "learning_rate": 1.1620294599018003e-07, "logits/chosen": -2.8522043228149414, "logits/rejected": -2.7312769889831543, "logps/chosen": -213.55044555664062, "logps/rejected": -670.4902954101562, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.6487449407577515, "rewards/margins": 35.915889739990234, "rewards/rejected": -35.267147064208984, "step": 6980 }, { "epoch": 2.38, "learning_rate": 1.1557346090897645e-07, "logits/chosen": -2.826024293899536, "logits/rejected": -2.7452285289764404, "logps/chosen": -195.41445922851562, "logps/rejected": -874.2907104492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.6810390949249268, "rewards/margins": 34.35312271118164, "rewards/rejected": -32.672080993652344, "step": 6990 }, { "epoch": 2.38, "learning_rate": 1.1494397582777288e-07, "logits/chosen": -2.969984531402588, "logits/rejected": -2.691690444946289, "logps/chosen": -192.30917358398438, "logps/rejected": -735.1781616210938, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.1726291179656982, "rewards/margins": 30.258869171142578, "rewards/rejected": -29.086238861083984, "step": 7000 }, { "epoch": 2.38, "eval_logits/chosen": -2.9533886909484863, "eval_logits/rejected": -2.7762296199798584, "eval_logps/chosen": -251.5618133544922, "eval_logps/rejected": -721.62548828125, "eval_loss": 0.0038166262675076723, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.1394697427749634, "eval_rewards/margins": 33.0844841003418, "eval_rewards/rejected": -31.945016860961914, "eval_runtime": 461.4071, "eval_samples_per_second": 20.589, "eval_steps_per_second": 0.644, "step": 7000 }, { "epoch": 2.38, "learning_rate": 1.1431449074656931e-07, "logits/chosen": -2.7527425289154053, "logits/rejected": -2.7223024368286133, "logps/chosen": -375.702880859375, "logps/rejected": -582.9882202148438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.7554123401641846, "rewards/margins": 32.07166290283203, "rewards/rejected": -30.316247940063477, "step": 7010 }, { "epoch": 2.39, "learning_rate": 1.1368500566536572e-07, "logits/chosen": -2.824134349822998, "logits/rejected": -2.604318141937256, "logps/chosen": -325.3201904296875, "logps/rejected": -778.512451171875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.90605890750885, "rewards/margins": 32.10724639892578, "rewards/rejected": -30.201183319091797, "step": 7020 }, { "epoch": 2.39, "learning_rate": 1.1305552058416214e-07, "logits/chosen": -2.9170992374420166, "logits/rejected": -2.7739944458007812, "logps/chosen": -216.15518188476562, "logps/rejected": -757.1286010742188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3385379314422607, "rewards/margins": 35.64634704589844, "rewards/rejected": -34.30780792236328, "step": 7030 }, { "epoch": 2.39, "learning_rate": 1.1242603550295858e-07, "logits/chosen": -2.950968027114868, "logits/rejected": -2.739314317703247, "logps/chosen": -194.66650390625, "logps/rejected": -838.9270629882812, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.4790537357330322, "rewards/margins": 38.41047286987305, "rewards/rejected": -36.931419372558594, "step": 7040 }, { "epoch": 2.4, "learning_rate": 1.1179655042175499e-07, "logits/chosen": -2.81681227684021, "logits/rejected": -2.712296962738037, "logps/chosen": -223.7734375, "logps/rejected": -586.4120483398438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.2764442265033722, "rewards/margins": 30.01479148864746, "rewards/rejected": -30.29123878479004, "step": 7050 }, { "epoch": 2.4, "learning_rate": 1.1116706534055143e-07, "logits/chosen": -2.89819073677063, "logits/rejected": -2.701070547103882, "logps/chosen": -261.9802551269531, "logps/rejected": -733.7467651367188, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.7233132123947144, "rewards/margins": 38.041664123535156, "rewards/rejected": -37.31835174560547, "step": 7060 }, { "epoch": 2.4, "learning_rate": 1.1053758025934785e-07, "logits/chosen": -2.9158082008361816, "logits/rejected": -2.7507033348083496, "logps/chosen": -206.71853637695312, "logps/rejected": -894.9440307617188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.9987127184867859, "rewards/margins": 34.97980880737305, "rewards/rejected": -33.98109436035156, "step": 7070 }, { "epoch": 2.41, "learning_rate": 1.0990809517814427e-07, "logits/chosen": -2.9142837524414062, "logits/rejected": -2.778026580810547, "logps/chosen": -259.2138977050781, "logps/rejected": -590.9222412109375, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": 0.613023042678833, "rewards/margins": 30.991968154907227, "rewards/rejected": -30.37894058227539, "step": 7080 }, { "epoch": 2.41, "learning_rate": 1.092786100969407e-07, "logits/chosen": -2.8467581272125244, "logits/rejected": -2.692148208618164, "logps/chosen": -212.12490844726562, "logps/rejected": -727.260498046875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.7248979806900024, "rewards/margins": 35.2900505065918, "rewards/rejected": -33.56515121459961, "step": 7090 }, { "epoch": 2.41, "learning_rate": 1.0864912501573713e-07, "logits/chosen": -2.8422183990478516, "logits/rejected": -2.6257264614105225, "logps/chosen": -200.8089141845703, "logps/rejected": -612.3531494140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.5693346261978149, "rewards/margins": 27.619197845458984, "rewards/rejected": -27.04986572265625, "step": 7100 }, { "epoch": 2.41, "eval_logits/chosen": -2.95153546333313, "eval_logits/rejected": -2.7624027729034424, "eval_logps/chosen": -250.61111450195312, "eval_logps/rejected": -745.656982421875, "eval_loss": 0.00404700729995966, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.2345402240753174, "eval_rewards/margins": 35.58269500732422, "eval_rewards/rejected": -34.3481559753418, "eval_runtime": 462.0442, "eval_samples_per_second": 20.561, "eval_steps_per_second": 0.643, "step": 7100 }, { "epoch": 2.42, "learning_rate": 1.0801963993453354e-07, "logits/chosen": -2.989555835723877, "logits/rejected": -2.7749760150909424, "logps/chosen": -209.5009002685547, "logps/rejected": -579.4617919921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.1281912326812744, "rewards/margins": 35.204437255859375, "rewards/rejected": -34.07624435424805, "step": 7110 }, { "epoch": 2.42, "learning_rate": 1.0739015485332998e-07, "logits/chosen": -2.7728326320648193, "logits/rejected": -2.721867799758911, "logps/chosen": -298.6146545410156, "logps/rejected": -803.2979736328125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.4211173057556152, "rewards/margins": 36.10731887817383, "rewards/rejected": -34.68620300292969, "step": 7120 }, { "epoch": 2.42, "learning_rate": 1.067606697721264e-07, "logits/chosen": -2.8334362506866455, "logits/rejected": -2.680701494216919, "logps/chosen": -279.81378173828125, "logps/rejected": -669.5172729492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.21878190338611603, "rewards/margins": 33.8343391418457, "rewards/rejected": -33.61555862426758, "step": 7130 }, { "epoch": 2.43, "learning_rate": 1.0613118469092282e-07, "logits/chosen": -2.9019999504089355, "logits/rejected": -2.6161866188049316, "logps/chosen": -253.5339813232422, "logps/rejected": -1072.937744140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.8699172735214233, "rewards/margins": 36.23839569091797, "rewards/rejected": -34.36847686767578, "step": 7140 }, { "epoch": 2.43, "learning_rate": 1.0550169960971924e-07, "logits/chosen": -2.8552181720733643, "logits/rejected": -2.720036745071411, "logps/chosen": -287.0708312988281, "logps/rejected": -857.0642700195312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.225647449493408, "rewards/margins": 37.789974212646484, "rewards/rejected": -34.5643310546875, "step": 7150 }, { "epoch": 2.43, "learning_rate": 1.0487221452851568e-07, "logits/chosen": -2.8624091148376465, "logits/rejected": -2.7015230655670166, "logps/chosen": -240.4618682861328, "logps/rejected": -857.6585083007812, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.1447405815124512, "rewards/margins": 33.89751434326172, "rewards/rejected": -32.752777099609375, "step": 7160 }, { "epoch": 2.44, "learning_rate": 1.0424272944731209e-07, "logits/chosen": -2.8212456703186035, "logits/rejected": -2.7755770683288574, "logps/chosen": -272.45330810546875, "logps/rejected": -841.7190551757812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.9376089572906494, "rewards/margins": 35.578269958496094, "rewards/rejected": -33.64065933227539, "step": 7170 }, { "epoch": 2.44, "learning_rate": 1.0361324436610853e-07, "logits/chosen": -2.861576557159424, "logits/rejected": -2.853471279144287, "logps/chosen": -258.7378234863281, "logps/rejected": -701.6221923828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.7719053030014038, "rewards/margins": 40.149166107177734, "rewards/rejected": -38.37725830078125, "step": 7180 }, { "epoch": 2.44, "learning_rate": 1.0298375928490494e-07, "logits/chosen": -2.8291234970092773, "logits/rejected": -2.7765250205993652, "logps/chosen": -189.64999389648438, "logps/rejected": -767.1797485351562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.2078698873519897, "rewards/margins": 35.883583068847656, "rewards/rejected": -34.67571258544922, "step": 7190 }, { "epoch": 2.45, "learning_rate": 1.0235427420370137e-07, "logits/chosen": -2.9084620475769043, "logits/rejected": -2.683842897415161, "logps/chosen": -213.56723022460938, "logps/rejected": -655.2828979492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.7238308191299438, "rewards/margins": 30.63381004333496, "rewards/rejected": -28.90997886657715, "step": 7200 }, { "epoch": 2.45, "eval_logits/chosen": -2.9635136127471924, "eval_logits/rejected": -2.7747459411621094, "eval_logps/chosen": -246.9364776611328, "eval_logps/rejected": -723.669677734375, "eval_loss": 0.0034322983119636774, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.6020042896270752, "eval_rewards/margins": 33.75143814086914, "eval_rewards/rejected": -32.14942932128906, "eval_runtime": 461.7296, "eval_samples_per_second": 20.575, "eval_steps_per_second": 0.643, "step": 7200 }, { "epoch": 2.45, "learning_rate": 1.017247891224978e-07, "logits/chosen": -2.9878621101379395, "logits/rejected": -2.785412549972534, "logps/chosen": -197.89822387695312, "logps/rejected": -516.4476928710938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.732309341430664, "rewards/margins": 31.4769229888916, "rewards/rejected": -29.744613647460938, "step": 7210 }, { "epoch": 2.45, "learning_rate": 1.0109530404129422e-07, "logits/chosen": -2.901970386505127, "logits/rejected": -2.7656936645507812, "logps/chosen": -267.3144836425781, "logps/rejected": -641.322021484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.5833483934402466, "rewards/margins": 35.35906219482422, "rewards/rejected": -33.77570724487305, "step": 7220 }, { "epoch": 2.46, "learning_rate": 1.0046581896009064e-07, "logits/chosen": -2.8694534301757812, "logits/rejected": -2.757223606109619, "logps/chosen": -245.86709594726562, "logps/rejected": -710.860107421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.1430171728134155, "rewards/margins": 35.51673126220703, "rewards/rejected": -34.37370681762695, "step": 7230 }, { "epoch": 2.46, "learning_rate": 9.983633387888708e-08, "logits/chosen": -2.900573253631592, "logits/rejected": -2.7112362384796143, "logps/chosen": -253.2101593017578, "logps/rejected": -903.4608154296875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.5164027214050293, "rewards/margins": 35.865013122558594, "rewards/rejected": -34.3486213684082, "step": 7240 }, { "epoch": 2.46, "learning_rate": 9.920684879768348e-08, "logits/chosen": -2.883937120437622, "logits/rejected": -2.7085719108581543, "logps/chosen": -289.17596435546875, "logps/rejected": -729.0779418945312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7090067863464355, "rewards/margins": 30.788379669189453, "rewards/rejected": -28.07937240600586, "step": 7250 }, { "epoch": 2.47, "learning_rate": 9.857736371647991e-08, "logits/chosen": -2.8483545780181885, "logits/rejected": -2.7901923656463623, "logps/chosen": -327.535400390625, "logps/rejected": -783.3504028320312, "loss": 0.0025, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8648250102996826, "rewards/margins": 34.592803955078125, "rewards/rejected": -32.72798156738281, "step": 7260 }, { "epoch": 2.47, "learning_rate": 9.794787863527634e-08, "logits/chosen": -2.839839458465576, "logits/rejected": -2.75819993019104, "logps/chosen": -322.12139892578125, "logps/rejected": -518.5718383789062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 2.304831027984619, "rewards/margins": 33.19840621948242, "rewards/rejected": -30.893579483032227, "step": 7270 }, { "epoch": 2.47, "learning_rate": 9.731839355407275e-08, "logits/chosen": -2.8968801498413086, "logits/rejected": -2.670220375061035, "logps/chosen": -255.6337432861328, "logps/rejected": -551.29931640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.439632534980774, "rewards/margins": 30.470157623291016, "rewards/rejected": -29.030527114868164, "step": 7280 }, { "epoch": 2.48, "learning_rate": 9.668890847286919e-08, "logits/chosen": -2.8631632328033447, "logits/rejected": -2.757976531982422, "logps/chosen": -306.1112365722656, "logps/rejected": -560.8742065429688, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 2.7616422176361084, "rewards/margins": 32.03538513183594, "rewards/rejected": -29.27374267578125, "step": 7290 }, { "epoch": 2.48, "learning_rate": 9.605942339166561e-08, "logits/chosen": -2.9146995544433594, "logits/rejected": -2.6166672706604004, "logps/chosen": -239.5911407470703, "logps/rejected": -961.2615356445312, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 2.23673415184021, "rewards/margins": 35.936248779296875, "rewards/rejected": -33.69951629638672, "step": 7300 }, { "epoch": 2.48, "eval_logits/chosen": -2.955641269683838, "eval_logits/rejected": -2.7679035663604736, "eval_logps/chosen": -246.64295959472656, "eval_logps/rejected": -723.46728515625, "eval_loss": 0.0036121748853474855, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.6313525438308716, "eval_rewards/margins": 33.76054000854492, "eval_rewards/rejected": -32.129188537597656, "eval_runtime": 461.9047, "eval_samples_per_second": 20.567, "eval_steps_per_second": 0.643, "step": 7300 }, { "epoch": 2.48, "learning_rate": 9.542993831046203e-08, "logits/chosen": -2.7836217880249023, "logits/rejected": -2.741434097290039, "logps/chosen": -253.1768341064453, "logps/rejected": -609.7213134765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.5441644191741943, "rewards/margins": 29.044719696044922, "rewards/rejected": -27.500558853149414, "step": 7310 }, { "epoch": 2.49, "learning_rate": 9.480045322925846e-08, "logits/chosen": -2.8722763061523438, "logits/rejected": -2.8363466262817383, "logps/chosen": -257.98944091796875, "logps/rejected": -749.8233642578125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.2243343591690063, "rewards/margins": 34.42903137207031, "rewards/rejected": -33.204689025878906, "step": 7320 }, { "epoch": 2.49, "learning_rate": 9.41709681480549e-08, "logits/chosen": -2.8483028411865234, "logits/rejected": -2.8284692764282227, "logps/chosen": -189.8106689453125, "logps/rejected": -665.6820068359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.0484671592712402, "rewards/margins": 36.00248718261719, "rewards/rejected": -33.954017639160156, "step": 7330 }, { "epoch": 2.49, "learning_rate": 9.35414830668513e-08, "logits/chosen": -2.9139206409454346, "logits/rejected": -2.730468273162842, "logps/chosen": -173.9368438720703, "logps/rejected": -727.186767578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.2355272769927979, "rewards/margins": 32.55491638183594, "rewards/rejected": -31.319387435913086, "step": 7340 }, { "epoch": 2.5, "learning_rate": 9.291199798564774e-08, "logits/chosen": -2.9307892322540283, "logits/rejected": -2.7155587673187256, "logps/chosen": -205.4160614013672, "logps/rejected": -727.6798706054688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.196798801422119, "rewards/margins": 33.27196502685547, "rewards/rejected": -31.07516860961914, "step": 7350 }, { "epoch": 2.5, "learning_rate": 9.228251290444416e-08, "logits/chosen": -2.898782253265381, "logits/rejected": -2.6827712059020996, "logps/chosen": -252.8424835205078, "logps/rejected": -718.3244018554688, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.7688614130020142, "rewards/margins": 31.635894775390625, "rewards/rejected": -29.867029190063477, "step": 7360 }, { "epoch": 2.51, "learning_rate": 9.165302782324058e-08, "logits/chosen": -2.862764596939087, "logits/rejected": -2.7372021675109863, "logps/chosen": -212.2377471923828, "logps/rejected": -730.7447509765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.339857578277588, "rewards/margins": 35.01812744140625, "rewards/rejected": -33.67827224731445, "step": 7370 }, { "epoch": 2.51, "learning_rate": 9.102354274203701e-08, "logits/chosen": -2.9186649322509766, "logits/rejected": -2.708289623260498, "logps/chosen": -241.11477661132812, "logps/rejected": -769.4359130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.5188145637512207, "rewards/margins": 37.06493377685547, "rewards/rejected": -35.54612350463867, "step": 7380 }, { "epoch": 2.51, "learning_rate": 9.039405766083344e-08, "logits/chosen": -2.847299575805664, "logits/rejected": -2.6940419673919678, "logps/chosen": -210.3753662109375, "logps/rejected": -863.38037109375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.46426454186439514, "rewards/margins": 32.90520095825195, "rewards/rejected": -32.44093322753906, "step": 7390 }, { "epoch": 2.52, "learning_rate": 8.976457257962985e-08, "logits/chosen": -2.9129178524017334, "logits/rejected": -2.725386381149292, "logps/chosen": -208.5717315673828, "logps/rejected": -761.41552734375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.2757318019866943, "rewards/margins": 32.52819061279297, "rewards/rejected": -31.252466201782227, "step": 7400 }, { "epoch": 2.52, "eval_logits/chosen": -2.962887763977051, "eval_logits/rejected": -2.7721610069274902, "eval_logps/chosen": -248.1788787841797, "eval_logps/rejected": -737.426025390625, "eval_loss": 0.003546712687239051, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.4777637720108032, "eval_rewards/margins": 35.00283432006836, "eval_rewards/rejected": -33.52507400512695, "eval_runtime": 461.9904, "eval_samples_per_second": 20.563, "eval_steps_per_second": 0.643, "step": 7400 }, { "epoch": 2.52, "learning_rate": 8.913508749842629e-08, "logits/chosen": -2.8658292293548584, "logits/rejected": -2.7712082862854004, "logps/chosen": -203.25149536132812, "logps/rejected": -712.8258666992188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.2116005420684814, "rewards/margins": 34.32830047607422, "rewards/rejected": -33.11669921875, "step": 7410 }, { "epoch": 2.52, "learning_rate": 8.850560241722271e-08, "logits/chosen": -2.8211684226989746, "logits/rejected": -2.6288390159606934, "logps/chosen": -260.2122497558594, "logps/rejected": -699.3378295898438, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.1009926795959473, "rewards/margins": 30.824275970458984, "rewards/rejected": -29.723285675048828, "step": 7420 }, { "epoch": 2.53, "learning_rate": 8.787611733601913e-08, "logits/chosen": -2.7467024326324463, "logits/rejected": -2.7833166122436523, "logps/chosen": -361.0596618652344, "logps/rejected": -564.9066162109375, "loss": 0.004, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9736824035644531, "rewards/margins": 35.389312744140625, "rewards/rejected": -33.41563034057617, "step": 7430 }, { "epoch": 2.53, "learning_rate": 8.724663225481556e-08, "logits/chosen": -2.9154953956604004, "logits/rejected": -2.788863182067871, "logps/chosen": -246.64834594726562, "logps/rejected": -835.2634887695312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.8881199359893799, "rewards/margins": 35.76900863647461, "rewards/rejected": -33.880889892578125, "step": 7440 }, { "epoch": 2.53, "learning_rate": 8.6617147173612e-08, "logits/chosen": -2.8674702644348145, "logits/rejected": -2.719933271408081, "logps/chosen": -202.82200622558594, "logps/rejected": -861.4627075195312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.9311338663101196, "rewards/margins": 36.43107223510742, "rewards/rejected": -34.49993896484375, "step": 7450 }, { "epoch": 2.54, "learning_rate": 8.59876620924084e-08, "logits/chosen": -2.838120937347412, "logits/rejected": -2.758655071258545, "logps/chosen": -372.70684814453125, "logps/rejected": -760.6192626953125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.2844078540802, "rewards/margins": 33.54411315917969, "rewards/rejected": -31.25970458984375, "step": 7460 }, { "epoch": 2.54, "learning_rate": 8.535817701120483e-08, "logits/chosen": -2.881722927093506, "logits/rejected": -2.7332863807678223, "logps/chosen": -209.82485961914062, "logps/rejected": -859.8692626953125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.223785877227783, "rewards/margins": 31.529964447021484, "rewards/rejected": -29.30617904663086, "step": 7470 }, { "epoch": 2.54, "learning_rate": 8.472869193000126e-08, "logits/chosen": -2.726494550704956, "logits/rejected": -2.826253890991211, "logps/chosen": -390.6224365234375, "logps/rejected": -751.8790283203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.9882179498672485, "rewards/margins": 35.86880874633789, "rewards/rejected": -33.88058853149414, "step": 7480 }, { "epoch": 2.55, "learning_rate": 8.409920684879767e-08, "logits/chosen": -2.823176622390747, "logits/rejected": -2.7413859367370605, "logps/chosen": -251.97366333007812, "logps/rejected": -697.8931884765625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.9913427829742432, "rewards/margins": 33.10942459106445, "rewards/rejected": -31.11808204650879, "step": 7490 }, { "epoch": 2.55, "learning_rate": 8.346972176759411e-08, "logits/chosen": -2.8623874187469482, "logits/rejected": -2.754727840423584, "logps/chosen": -360.556640625, "logps/rejected": -788.0841674804688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.1576857566833496, "rewards/margins": 35.63214111328125, "rewards/rejected": -34.474456787109375, "step": 7500 }, { "epoch": 2.55, "eval_logits/chosen": -2.9806020259857178, "eval_logits/rejected": -2.7915842533111572, "eval_logps/chosen": -245.9751739501953, "eval_logps/rejected": -728.8140258789062, "eval_loss": 0.003388113807886839, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.6981340646743774, "eval_rewards/margins": 34.36199951171875, "eval_rewards/rejected": -32.66386413574219, "eval_runtime": 460.9302, "eval_samples_per_second": 20.61, "eval_steps_per_second": 0.644, "step": 7500 }, { "epoch": 2.55, "learning_rate": 8.284023668639053e-08, "logits/chosen": -2.8245668411254883, "logits/rejected": -2.714792490005493, "logps/chosen": -356.82037353515625, "logps/rejected": -586.7261352539062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.2216227054595947, "rewards/margins": 28.455944061279297, "rewards/rejected": -26.234323501586914, "step": 7510 }, { "epoch": 2.56, "learning_rate": 8.221075160518695e-08, "logits/chosen": -2.8163273334503174, "logits/rejected": -2.6320555210113525, "logps/chosen": -290.7445373535156, "logps/rejected": -657.8960571289062, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.6349328756332397, "rewards/margins": 28.144948959350586, "rewards/rejected": -26.5100154876709, "step": 7520 }, { "epoch": 2.56, "learning_rate": 8.158126652398338e-08, "logits/chosen": -2.8538668155670166, "logits/rejected": -2.7637343406677246, "logps/chosen": -303.41632080078125, "logps/rejected": -596.43212890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4906651973724365, "rewards/margins": 34.45391845703125, "rewards/rejected": -31.9632511138916, "step": 7530 }, { "epoch": 2.56, "learning_rate": 8.09517814427798e-08, "logits/chosen": -2.899174690246582, "logits/rejected": -2.770847797393799, "logps/chosen": -190.26593017578125, "logps/rejected": -695.39599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.8257381916046143, "rewards/margins": 34.00194549560547, "rewards/rejected": -32.17620086669922, "step": 7540 }, { "epoch": 2.57, "learning_rate": 8.032229636157622e-08, "logits/chosen": -2.8691365718841553, "logits/rejected": -2.7192864418029785, "logps/chosen": -181.1930694580078, "logps/rejected": -866.8465576171875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 1.8788772821426392, "rewards/margins": 34.857215881347656, "rewards/rejected": -32.97834014892578, "step": 7550 }, { "epoch": 2.57, "learning_rate": 7.969281128037266e-08, "logits/chosen": -2.9226157665252686, "logits/rejected": -2.8280673027038574, "logps/chosen": -209.9813995361328, "logps/rejected": -640.6646118164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.4108762741088867, "rewards/margins": 31.19570541381836, "rewards/rejected": -29.78483009338379, "step": 7560 }, { "epoch": 2.57, "learning_rate": 7.906332619916907e-08, "logits/chosen": -2.8823554515838623, "logits/rejected": -2.7710671424865723, "logps/chosen": -267.537109375, "logps/rejected": -731.9823608398438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.1326946020126343, "rewards/margins": 32.28135299682617, "rewards/rejected": -31.14866065979004, "step": 7570 }, { "epoch": 2.58, "learning_rate": 7.84338411179655e-08, "logits/chosen": -2.8676023483276367, "logits/rejected": -2.680656909942627, "logps/chosen": -279.1723937988281, "logps/rejected": -765.1873168945312, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 1.8172576427459717, "rewards/margins": 38.70576095581055, "rewards/rejected": -36.88850021362305, "step": 7580 }, { "epoch": 2.58, "learning_rate": 7.780435603676193e-08, "logits/chosen": -2.9320082664489746, "logits/rejected": -2.817993402481079, "logps/chosen": -191.81338500976562, "logps/rejected": -736.7349853515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.358437418937683, "rewards/margins": 37.26475143432617, "rewards/rejected": -35.90631866455078, "step": 7590 }, { "epoch": 2.58, "learning_rate": 7.717487095555835e-08, "logits/chosen": -2.870086193084717, "logits/rejected": -2.8316028118133545, "logps/chosen": -213.04891967773438, "logps/rejected": -826.2672729492188, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.0381367206573486, "rewards/margins": 38.66090774536133, "rewards/rejected": -37.622772216796875, "step": 7600 }, { "epoch": 2.58, "eval_logits/chosen": -2.975843906402588, "eval_logits/rejected": -2.7888052463531494, "eval_logps/chosen": -245.88052368164062, "eval_logps/rejected": -729.0055541992188, "eval_loss": 0.003199763363227248, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.7076013088226318, "eval_rewards/margins": 34.390621185302734, "eval_rewards/rejected": -32.683021545410156, "eval_runtime": 461.8318, "eval_samples_per_second": 20.57, "eval_steps_per_second": 0.643, "step": 7600 }, { "epoch": 2.59, "learning_rate": 7.654538587435477e-08, "logits/chosen": -2.9359641075134277, "logits/rejected": -2.7478954792022705, "logps/chosen": -222.0003662109375, "logps/rejected": -683.6935424804688, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1966060400009155, "rewards/margins": 33.215641021728516, "rewards/rejected": -32.01903533935547, "step": 7610 }, { "epoch": 2.59, "learning_rate": 7.591590079315121e-08, "logits/chosen": -2.832313299179077, "logits/rejected": -2.759093761444092, "logps/chosen": -241.6915283203125, "logps/rejected": -686.8284912109375, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 1.4447554349899292, "rewards/margins": 30.330928802490234, "rewards/rejected": -28.886173248291016, "step": 7620 }, { "epoch": 2.59, "learning_rate": 7.528641571194762e-08, "logits/chosen": -2.8889105319976807, "logits/rejected": -2.7153854370117188, "logps/chosen": -186.6122283935547, "logps/rejected": -839.9580078125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.225599527359009, "rewards/margins": 34.77473449707031, "rewards/rejected": -32.549129486083984, "step": 7630 }, { "epoch": 2.6, "learning_rate": 7.465693063074405e-08, "logits/chosen": -2.851897954940796, "logits/rejected": -2.886849880218506, "logps/chosen": -267.0950012207031, "logps/rejected": -721.800048828125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.708082914352417, "rewards/margins": 35.50648880004883, "rewards/rejected": -33.798404693603516, "step": 7640 }, { "epoch": 2.6, "learning_rate": 7.402744554954048e-08, "logits/chosen": -2.8853766918182373, "logits/rejected": -2.8013031482696533, "logps/chosen": -252.66921997070312, "logps/rejected": -599.9445190429688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.360748767852783, "rewards/margins": 29.6185245513916, "rewards/rejected": -27.25777244567871, "step": 7650 }, { "epoch": 2.6, "learning_rate": 7.33979604683369e-08, "logits/chosen": -2.8861021995544434, "logits/rejected": -2.7406888008117676, "logps/chosen": -258.0936584472656, "logps/rejected": -596.8135375976562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.3771817684173584, "rewards/margins": 28.408737182617188, "rewards/rejected": -26.03155517578125, "step": 7660 }, { "epoch": 2.61, "learning_rate": 7.276847538713332e-08, "logits/chosen": -2.940986156463623, "logits/rejected": -2.7470431327819824, "logps/chosen": -278.822265625, "logps/rejected": -624.2587890625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.7470260858535767, "rewards/margins": 30.56268882751465, "rewards/rejected": -28.815664291381836, "step": 7670 }, { "epoch": 2.61, "learning_rate": 7.213899030592976e-08, "logits/chosen": -2.864551544189453, "logits/rejected": -2.7666144371032715, "logps/chosen": -265.721923828125, "logps/rejected": -554.2433471679688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.618726968765259, "rewards/margins": 27.209869384765625, "rewards/rejected": -24.59114646911621, "step": 7680 }, { "epoch": 2.61, "learning_rate": 7.150950522472617e-08, "logits/chosen": -2.9601664543151855, "logits/rejected": -2.7529773712158203, "logps/chosen": -260.0295104980469, "logps/rejected": -620.2147216796875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 2.6507606506347656, "rewards/margins": 33.92409896850586, "rewards/rejected": -31.27333641052246, "step": 7690 }, { "epoch": 2.62, "learning_rate": 7.088002014352259e-08, "logits/chosen": -2.810514450073242, "logits/rejected": -2.7307190895080566, "logps/chosen": -321.6409606933594, "logps/rejected": -662.0742797851562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.2467312812805176, "rewards/margins": 31.846202850341797, "rewards/rejected": -29.599477767944336, "step": 7700 }, { "epoch": 2.62, "eval_logits/chosen": -2.9656155109405518, "eval_logits/rejected": -2.783700704574585, "eval_logps/chosen": -242.3954315185547, "eval_logps/rejected": -700.0899047851562, "eval_loss": 0.003384356154128909, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 2.05610990524292, "eval_rewards/margins": 31.84757423400879, "eval_rewards/rejected": -29.791465759277344, "eval_runtime": 462.0039, "eval_samples_per_second": 20.563, "eval_steps_per_second": 0.643, "step": 7700 }, { "epoch": 2.62, "learning_rate": 7.025053506231903e-08, "logits/chosen": -2.827775478363037, "logits/rejected": -2.675283432006836, "logps/chosen": -234.77120971679688, "logps/rejected": -526.2034912109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.9131532907485962, "rewards/margins": 30.8258056640625, "rewards/rejected": -28.912649154663086, "step": 7710 }, { "epoch": 2.62, "learning_rate": 6.962104998111543e-08, "logits/chosen": -2.783626079559326, "logits/rejected": -2.8327949047088623, "logps/chosen": -224.01358032226562, "logps/rejected": -629.04833984375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.130934476852417, "rewards/margins": 31.484020233154297, "rewards/rejected": -29.353084564208984, "step": 7720 }, { "epoch": 2.63, "learning_rate": 6.899156489991187e-08, "logits/chosen": -2.8576273918151855, "logits/rejected": -2.8163819313049316, "logps/chosen": -230.7477569580078, "logps/rejected": -647.674072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.2242932319641113, "rewards/margins": 32.5281867980957, "rewards/rejected": -30.303897857666016, "step": 7730 }, { "epoch": 2.63, "learning_rate": 6.83620798187083e-08, "logits/chosen": -2.8221435546875, "logits/rejected": -2.7843406200408936, "logps/chosen": -302.7204895019531, "logps/rejected": -581.983154296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.5695157051086426, "rewards/margins": 30.563451766967773, "rewards/rejected": -27.993942260742188, "step": 7740 }, { "epoch": 2.63, "learning_rate": 6.773259473750472e-08, "logits/chosen": -2.96877384185791, "logits/rejected": -2.6815133094787598, "logps/chosen": -179.95159912109375, "logps/rejected": -645.2207641601562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.4195446968078613, "rewards/margins": 31.964548110961914, "rewards/rejected": -29.545001983642578, "step": 7750 }, { "epoch": 2.64, "learning_rate": 6.710310965630114e-08, "logits/chosen": -2.752136468887329, "logits/rejected": -2.70862078666687, "logps/chosen": -406.32769775390625, "logps/rejected": -729.5647583007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.097856044769287, "rewards/margins": 32.05318069458008, "rewards/rejected": -29.955318450927734, "step": 7760 }, { "epoch": 2.64, "learning_rate": 6.647362457509758e-08, "logits/chosen": -2.866553544998169, "logits/rejected": -2.7680437564849854, "logps/chosen": -249.7261199951172, "logps/rejected": -565.7576293945312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.496461272239685, "rewards/margins": 29.221481323242188, "rewards/rejected": -27.725025177001953, "step": 7770 }, { "epoch": 2.64, "learning_rate": 6.584413949389398e-08, "logits/chosen": -2.8272061347961426, "logits/rejected": -2.752551555633545, "logps/chosen": -314.1927490234375, "logps/rejected": -744.2816772460938, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.779463529586792, "rewards/margins": 31.075159072875977, "rewards/rejected": -29.29569435119629, "step": 7780 }, { "epoch": 2.65, "learning_rate": 6.521465441269042e-08, "logits/chosen": -2.9061553478240967, "logits/rejected": -2.726597547531128, "logps/chosen": -300.7236328125, "logps/rejected": -766.4381103515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.3516252040863037, "rewards/margins": 30.0067138671875, "rewards/rejected": -27.65509033203125, "step": 7790 }, { "epoch": 2.65, "learning_rate": 6.458516933148684e-08, "logits/chosen": -2.8620238304138184, "logits/rejected": -2.6686692237854004, "logps/chosen": -238.9864959716797, "logps/rejected": -746.80615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.1027491092681885, "rewards/margins": 31.016870498657227, "rewards/rejected": -28.91411781311035, "step": 7800 }, { "epoch": 2.65, "eval_logits/chosen": -2.961817979812622, "eval_logits/rejected": -2.7781989574432373, "eval_logps/chosen": -242.5819854736328, "eval_logps/rejected": -706.1458129882812, "eval_loss": 0.003279141616076231, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 2.0374555587768555, "eval_rewards/margins": 32.43450927734375, "eval_rewards/rejected": -30.397050857543945, "eval_runtime": 462.5321, "eval_samples_per_second": 20.539, "eval_steps_per_second": 0.642, "step": 7800 }, { "epoch": 2.65, "learning_rate": 6.395568425028327e-08, "logits/chosen": -2.8812954425811768, "logits/rejected": -2.703012466430664, "logps/chosen": -182.6671905517578, "logps/rejected": -906.060546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.9316184520721436, "rewards/margins": 36.83222198486328, "rewards/rejected": -34.900604248046875, "step": 7810 }, { "epoch": 2.66, "learning_rate": 6.332619916907969e-08, "logits/chosen": -2.8721272945404053, "logits/rejected": -2.823141574859619, "logps/chosen": -187.41921997070312, "logps/rejected": -757.3554077148438, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 2.137277364730835, "rewards/margins": 36.71207809448242, "rewards/rejected": -34.574806213378906, "step": 7820 }, { "epoch": 2.66, "learning_rate": 6.269671408787612e-08, "logits/chosen": -2.9182374477386475, "logits/rejected": -2.7441134452819824, "logps/chosen": -213.75363159179688, "logps/rejected": -827.5164184570312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.7667627334594727, "rewards/margins": 32.6684684753418, "rewards/rejected": -30.90171241760254, "step": 7830 }, { "epoch": 2.66, "learning_rate": 6.206722900667253e-08, "logits/chosen": -2.896608591079712, "logits/rejected": -2.7430830001831055, "logps/chosen": -256.9504699707031, "logps/rejected": -880.4461669921875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.8200105428695679, "rewards/margins": 31.402782440185547, "rewards/rejected": -29.5827693939209, "step": 7840 }, { "epoch": 2.67, "learning_rate": 6.143774392546897e-08, "logits/chosen": -2.8509395122528076, "logits/rejected": -2.695783853530884, "logps/chosen": -327.309814453125, "logps/rejected": -610.1741943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.7793450355529785, "rewards/margins": 33.20561218261719, "rewards/rejected": -30.4262638092041, "step": 7850 }, { "epoch": 2.67, "learning_rate": 6.080825884426539e-08, "logits/chosen": -2.8422911167144775, "logits/rejected": -2.8998265266418457, "logps/chosen": -259.82586669921875, "logps/rejected": -625.9583740234375, "loss": 0.0026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5995936393737793, "rewards/margins": 32.35774230957031, "rewards/rejected": -30.758148193359375, "step": 7860 }, { "epoch": 2.68, "learning_rate": 6.017877376306182e-08, "logits/chosen": -2.8806869983673096, "logits/rejected": -2.8387961387634277, "logps/chosen": -263.5113220214844, "logps/rejected": -695.5892333984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.4628822803497314, "rewards/margins": 37.64958953857422, "rewards/rejected": -36.18670654296875, "step": 7870 }, { "epoch": 2.68, "learning_rate": 5.954928868185824e-08, "logits/chosen": -2.7881393432617188, "logits/rejected": -2.6759397983551025, "logps/chosen": -324.4100036621094, "logps/rejected": -913.5089111328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.3475289344787598, "rewards/margins": 33.375404357910156, "rewards/rejected": -31.027873992919922, "step": 7880 }, { "epoch": 2.68, "learning_rate": 5.891980360065466e-08, "logits/chosen": -2.7498486042022705, "logits/rejected": -2.794663667678833, "logps/chosen": -301.72369384765625, "logps/rejected": -537.539794921875, "loss": 0.0023, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4914456605911255, "rewards/margins": 31.71088218688965, "rewards/rejected": -30.219436645507812, "step": 7890 }, { "epoch": 2.69, "learning_rate": 5.8290318519451084e-08, "logits/chosen": -2.925682783126831, "logits/rejected": -2.7502644062042236, "logps/chosen": -191.26950073242188, "logps/rejected": -568.9969482421875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.3215970993041992, "rewards/margins": 26.668567657470703, "rewards/rejected": -25.346969604492188, "step": 7900 }, { "epoch": 2.69, "eval_logits/chosen": -2.973884344100952, "eval_logits/rejected": -2.7837181091308594, "eval_logps/chosen": -244.2588653564453, "eval_logps/rejected": -713.432861328125, "eval_loss": 0.0030939257703721523, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.8697654008865356, "eval_rewards/margins": 32.99552536010742, "eval_rewards/rejected": -31.12575912475586, "eval_runtime": 463.2004, "eval_samples_per_second": 20.509, "eval_steps_per_second": 0.641, "step": 7900 }, { "epoch": 2.69, "learning_rate": 5.7660833438247514e-08, "logits/chosen": -2.917731285095215, "logits/rejected": -2.799001693725586, "logps/chosen": -261.0954284667969, "logps/rejected": -693.6121826171875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.7646353244781494, "rewards/margins": 33.04741668701172, "rewards/rejected": -31.282785415649414, "step": 7910 }, { "epoch": 2.69, "learning_rate": 5.7031348357043937e-08, "logits/chosen": -2.8706984519958496, "logits/rejected": -2.713573932647705, "logps/chosen": -213.04867553710938, "logps/rejected": -916.0791015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.963094711303711, "rewards/margins": 35.601112365722656, "rewards/rejected": -33.63801956176758, "step": 7920 }, { "epoch": 2.7, "learning_rate": 5.640186327584036e-08, "logits/chosen": -2.8877367973327637, "logits/rejected": -2.718595027923584, "logps/chosen": -233.1251220703125, "logps/rejected": -636.7473754882812, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.1000721454620361, "rewards/margins": 34.30464553833008, "rewards/rejected": -33.20457077026367, "step": 7930 }, { "epoch": 2.7, "learning_rate": 5.577237819463679e-08, "logits/chosen": -2.8754708766937256, "logits/rejected": -2.8534939289093018, "logps/chosen": -252.5353546142578, "logps/rejected": -553.8245849609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.6495370864868164, "rewards/margins": 35.70760726928711, "rewards/rejected": -34.05807113647461, "step": 7940 }, { "epoch": 2.7, "learning_rate": 5.514289311343321e-08, "logits/chosen": -2.809431314468384, "logits/rejected": -2.847320556640625, "logps/chosen": -293.93109130859375, "logps/rejected": -804.7247314453125, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 2.8012919425964355, "rewards/margins": 37.96441650390625, "rewards/rejected": -35.163124084472656, "step": 7950 }, { "epoch": 2.71, "learning_rate": 5.4513408032229634e-08, "logits/chosen": -2.850141763687134, "logits/rejected": -2.717120409011841, "logps/chosen": -301.227294921875, "logps/rejected": -888.9896240234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.126427173614502, "rewards/margins": 36.72178268432617, "rewards/rejected": -34.59535598754883, "step": 7960 }, { "epoch": 2.71, "learning_rate": 5.388392295102606e-08, "logits/chosen": -2.856905937194824, "logits/rejected": -2.7897603511810303, "logps/chosen": -245.76791381835938, "logps/rejected": -536.1624755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.729501724243164, "rewards/margins": 30.310199737548828, "rewards/rejected": -28.5806941986084, "step": 7970 }, { "epoch": 2.71, "learning_rate": 5.3254437869822486e-08, "logits/chosen": -2.8947901725769043, "logits/rejected": -2.746194839477539, "logps/chosen": -195.88357543945312, "logps/rejected": -653.6214599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.8187377452850342, "rewards/margins": 34.86467742919922, "rewards/rejected": -33.04594039916992, "step": 7980 }, { "epoch": 2.72, "learning_rate": 5.262495278861891e-08, "logits/chosen": -2.849066972732544, "logits/rejected": -2.6891274452209473, "logps/chosen": -186.70700073242188, "logps/rejected": -512.2257080078125, "loss": 0.0021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0710628032684326, "rewards/margins": 28.34885597229004, "rewards/rejected": -27.277795791625977, "step": 7990 }, { "epoch": 2.72, "learning_rate": 5.199546770741533e-08, "logits/chosen": -2.9111413955688477, "logits/rejected": -2.760098457336426, "logps/chosen": -198.2312469482422, "logps/rejected": -714.1201782226562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.1827255487442017, "rewards/margins": 33.96942901611328, "rewards/rejected": -32.786705017089844, "step": 8000 }, { "epoch": 2.72, "eval_logits/chosen": -2.9523746967315674, "eval_logits/rejected": -2.76188588142395, "eval_logps/chosen": -244.83216857910156, "eval_logps/rejected": -722.8104858398438, "eval_loss": 0.0029186487663537264, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.8124349117279053, "eval_rewards/margins": 33.87594985961914, "eval_rewards/rejected": -32.063514709472656, "eval_runtime": 463.4758, "eval_samples_per_second": 20.497, "eval_steps_per_second": 0.641, "step": 8000 }, { "epoch": 2.72, "learning_rate": 5.136598262621176e-08, "logits/chosen": -2.9415037631988525, "logits/rejected": -2.7271430492401123, "logps/chosen": -183.74893188476562, "logps/rejected": -642.0931396484375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.7032045125961304, "rewards/margins": 36.18794250488281, "rewards/rejected": -34.48473358154297, "step": 8010 }, { "epoch": 2.73, "learning_rate": 5.073649754500818e-08, "logits/chosen": -2.9826855659484863, "logits/rejected": -2.728407859802246, "logps/chosen": -205.96456909179688, "logps/rejected": -612.9322509765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.726447343826294, "rewards/margins": 35.132320404052734, "rewards/rejected": -33.40587615966797, "step": 8020 }, { "epoch": 2.73, "learning_rate": 5.01070124638046e-08, "logits/chosen": -2.8416223526000977, "logits/rejected": -2.762373447418213, "logps/chosen": -278.6443176269531, "logps/rejected": -753.6058959960938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.926841139793396, "rewards/margins": 37.72515106201172, "rewards/rejected": -35.798316955566406, "step": 8030 }, { "epoch": 2.73, "learning_rate": 4.947752738260103e-08, "logits/chosen": -2.8469138145446777, "logits/rejected": -2.659383535385132, "logps/chosen": -277.76739501953125, "logps/rejected": -820.9759521484375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.898606538772583, "rewards/margins": 36.009578704833984, "rewards/rejected": -34.11096954345703, "step": 8040 }, { "epoch": 2.74, "learning_rate": 4.884804230139745e-08, "logits/chosen": -2.913717269897461, "logits/rejected": -2.590941905975342, "logps/chosen": -190.384033203125, "logps/rejected": -919.0404052734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.040963888168335, "rewards/margins": 32.01324462890625, "rewards/rejected": -29.9722843170166, "step": 8050 }, { "epoch": 2.74, "learning_rate": 4.8218557220193875e-08, "logits/chosen": -2.6889381408691406, "logits/rejected": -2.621088743209839, "logps/chosen": -455.24053955078125, "logps/rejected": -740.9610595703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.0844435691833496, "rewards/margins": 32.413047790527344, "rewards/rejected": -30.328603744506836, "step": 8060 }, { "epoch": 2.74, "learning_rate": 4.7589072138990305e-08, "logits/chosen": -2.844010829925537, "logits/rejected": -2.731935977935791, "logps/chosen": -240.62875366210938, "logps/rejected": -628.2028198242188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3309201002120972, "rewards/margins": 33.94548797607422, "rewards/rejected": -32.61457061767578, "step": 8070 }, { "epoch": 2.75, "learning_rate": 4.695958705778673e-08, "logits/chosen": -2.795356035232544, "logits/rejected": -2.7213807106018066, "logps/chosen": -360.2414855957031, "logps/rejected": -488.0118713378906, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.238100051879883, "rewards/margins": 31.607147216796875, "rewards/rejected": -29.36904525756836, "step": 8080 }, { "epoch": 2.75, "learning_rate": 4.633010197658315e-08, "logits/chosen": -2.9043846130371094, "logits/rejected": -2.6883883476257324, "logps/chosen": -263.35760498046875, "logps/rejected": -616.5202026367188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.6883020401000977, "rewards/margins": 30.9733943939209, "rewards/rejected": -29.28508949279785, "step": 8090 }, { "epoch": 2.75, "learning_rate": 4.570061689537958e-08, "logits/chosen": -2.80082368850708, "logits/rejected": -2.698975086212158, "logps/chosen": -383.357666015625, "logps/rejected": -700.9032592773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.6076881885528564, "rewards/margins": 35.00193786621094, "rewards/rejected": -32.394248962402344, "step": 8100 }, { "epoch": 2.75, "eval_logits/chosen": -2.9517009258270264, "eval_logits/rejected": -2.759443759918213, "eval_logps/chosen": -245.44287109375, "eval_logps/rejected": -728.3179931640625, "eval_loss": 0.0028691969346255064, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.7513657808303833, "eval_rewards/margins": 34.36562728881836, "eval_rewards/rejected": -32.614261627197266, "eval_runtime": 462.6439, "eval_samples_per_second": 20.534, "eval_steps_per_second": 0.642, "step": 8100 }, { "epoch": 2.76, "learning_rate": 4.5071131814176e-08, "logits/chosen": -2.9013912677764893, "logits/rejected": -2.6237916946411133, "logps/chosen": -311.77569580078125, "logps/rejected": -716.3660888671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.0787107944488525, "rewards/margins": 31.411428451538086, "rewards/rejected": -29.332717895507812, "step": 8110 }, { "epoch": 2.76, "learning_rate": 4.4441646732972425e-08, "logits/chosen": -2.8182454109191895, "logits/rejected": -2.714923620223999, "logps/chosen": -201.95504760742188, "logps/rejected": -968.5489501953125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.6728975772857666, "rewards/margins": 38.2669677734375, "rewards/rejected": -36.59407043457031, "step": 8120 }, { "epoch": 2.76, "learning_rate": 4.3812161651768855e-08, "logits/chosen": -2.8762423992156982, "logits/rejected": -2.801011562347412, "logps/chosen": -204.72171020507812, "logps/rejected": -706.783203125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.8652136325836182, "rewards/margins": 36.24138259887695, "rewards/rejected": -34.37616729736328, "step": 8130 }, { "epoch": 2.77, "learning_rate": 4.318267657056528e-08, "logits/chosen": -2.74670147895813, "logits/rejected": -2.684847116470337, "logps/chosen": -279.9619445800781, "logps/rejected": -574.5152587890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.2464849948883057, "rewards/margins": 26.70419692993164, "rewards/rejected": -25.457712173461914, "step": 8140 }, { "epoch": 2.77, "learning_rate": 4.25531914893617e-08, "logits/chosen": -2.899589776992798, "logits/rejected": -2.7317843437194824, "logps/chosen": -204.38204956054688, "logps/rejected": -796.5444946289062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.0370306968688965, "rewards/margins": 35.522666931152344, "rewards/rejected": -33.48564147949219, "step": 8150 }, { "epoch": 2.77, "learning_rate": 4.192370640815812e-08, "logits/chosen": -2.758873462677002, "logits/rejected": -2.7521636486053467, "logps/chosen": -229.4162139892578, "logps/rejected": -628.3582763671875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.4046229124069214, "rewards/margins": 33.60283660888672, "rewards/rejected": -32.19821548461914, "step": 8160 }, { "epoch": 2.78, "learning_rate": 4.129422132695455e-08, "logits/chosen": -2.881314992904663, "logits/rejected": -2.7420051097869873, "logps/chosen": -249.81631469726562, "logps/rejected": -584.2086791992188, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.4726141691207886, "rewards/margins": 29.38981056213379, "rewards/rejected": -27.91719627380371, "step": 8170 }, { "epoch": 2.78, "learning_rate": 4.0664736245750975e-08, "logits/chosen": -2.871246337890625, "logits/rejected": -2.7974417209625244, "logps/chosen": -200.88021850585938, "logps/rejected": -600.2832641601562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.9458301067352295, "rewards/margins": 34.04649353027344, "rewards/rejected": -32.10066223144531, "step": 8180 }, { "epoch": 2.78, "learning_rate": 4.00352511645474e-08, "logits/chosen": -2.8207430839538574, "logits/rejected": -2.7224230766296387, "logps/chosen": -266.0521240234375, "logps/rejected": -696.4131469726562, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.4409301280975342, "rewards/margins": 33.28892517089844, "rewards/rejected": -31.84799575805664, "step": 8190 }, { "epoch": 2.79, "learning_rate": 3.940576608334383e-08, "logits/chosen": -2.837584972381592, "logits/rejected": -2.7460408210754395, "logps/chosen": -272.820068359375, "logps/rejected": -722.5364990234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.866076111793518, "rewards/margins": 35.569801330566406, "rewards/rejected": -33.70372772216797, "step": 8200 }, { "epoch": 2.79, "eval_logits/chosen": -2.9529659748077393, "eval_logits/rejected": -2.7605702877044678, "eval_logps/chosen": -245.90090942382812, "eval_logps/rejected": -733.0239868164062, "eval_loss": 0.0028625179547816515, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.7055586576461792, "eval_rewards/margins": 34.79042053222656, "eval_rewards/rejected": -33.08485794067383, "eval_runtime": 462.923, "eval_samples_per_second": 20.522, "eval_steps_per_second": 0.642, "step": 8200 }, { "epoch": 2.79, "learning_rate": 3.877628100214025e-08, "logits/chosen": -2.8431358337402344, "logits/rejected": -2.692789316177368, "logps/chosen": -253.12271118164062, "logps/rejected": -733.8238525390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.890484094619751, "rewards/margins": 35.519893646240234, "rewards/rejected": -33.62941360473633, "step": 8210 }, { "epoch": 2.79, "learning_rate": 3.814679592093667e-08, "logits/chosen": -2.8372626304626465, "logits/rejected": -2.768393039703369, "logps/chosen": -296.54766845703125, "logps/rejected": -935.0896606445312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.2551589012145996, "rewards/margins": 38.06463623046875, "rewards/rejected": -35.80947494506836, "step": 8220 }, { "epoch": 2.8, "learning_rate": 3.75173108397331e-08, "logits/chosen": -2.8285956382751465, "logits/rejected": -2.6880383491516113, "logps/chosen": -238.5934295654297, "logps/rejected": -804.083251953125, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 1.3455486297607422, "rewards/margins": 35.137413024902344, "rewards/rejected": -33.79186248779297, "step": 8230 }, { "epoch": 2.8, "learning_rate": 3.688782575852952e-08, "logits/chosen": -2.8728716373443604, "logits/rejected": -2.6623497009277344, "logps/chosen": -182.77908325195312, "logps/rejected": -545.1868286132812, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.7516419887542725, "rewards/margins": 29.533950805664062, "rewards/rejected": -27.78230857849121, "step": 8240 }, { "epoch": 2.8, "learning_rate": 3.625834067732594e-08, "logits/chosen": -2.7869961261749268, "logits/rejected": -2.634920835494995, "logps/chosen": -306.122802734375, "logps/rejected": -730.5965576171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.8775463104248047, "rewards/margins": 33.65734100341797, "rewards/rejected": -31.7797908782959, "step": 8250 }, { "epoch": 2.81, "learning_rate": 3.562885559612237e-08, "logits/chosen": -2.790224075317383, "logits/rejected": -2.7068870067596436, "logps/chosen": -280.0884094238281, "logps/rejected": -817.982666015625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.4231791496276855, "rewards/margins": 36.39927291870117, "rewards/rejected": -33.97609329223633, "step": 8260 }, { "epoch": 2.81, "learning_rate": 3.499937051491879e-08, "logits/chosen": -2.9327492713928223, "logits/rejected": -2.7168617248535156, "logps/chosen": -194.67178344726562, "logps/rejected": -662.565185546875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.127627968788147, "rewards/margins": 35.013309478759766, "rewards/rejected": -33.885684967041016, "step": 8270 }, { "epoch": 2.81, "learning_rate": 3.4369885433715216e-08, "logits/chosen": -2.934410810470581, "logits/rejected": -2.7781777381896973, "logps/chosen": -178.58929443359375, "logps/rejected": -552.3302612304688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.214401960372925, "rewards/margins": 32.13267517089844, "rewards/rejected": -29.91827392578125, "step": 8280 }, { "epoch": 2.82, "learning_rate": 3.3740400352511645e-08, "logits/chosen": -2.890235662460327, "logits/rejected": -2.7146573066711426, "logps/chosen": -181.51712036132812, "logps/rejected": -969.4215087890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3645102977752686, "rewards/margins": 36.79445266723633, "rewards/rejected": -35.42994689941406, "step": 8290 }, { "epoch": 2.82, "learning_rate": 3.311091527130807e-08, "logits/chosen": -2.895934820175171, "logits/rejected": -2.7098352909088135, "logps/chosen": -212.12545776367188, "logps/rejected": -578.0053100585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.6477187871932983, "rewards/margins": 34.006507873535156, "rewards/rejected": -32.358787536621094, "step": 8300 }, { "epoch": 2.82, "eval_logits/chosen": -2.9370880126953125, "eval_logits/rejected": -2.7437326908111572, "eval_logps/chosen": -246.60716247558594, "eval_logps/rejected": -730.386474609375, "eval_loss": 0.0029500466771423817, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.6349343061447144, "eval_rewards/margins": 34.4560432434082, "eval_rewards/rejected": -32.821109771728516, "eval_runtime": 462.8901, "eval_samples_per_second": 20.523, "eval_steps_per_second": 0.642, "step": 8300 }, { "epoch": 2.82, "learning_rate": 3.248143019010449e-08, "logits/chosen": -2.859182596206665, "logits/rejected": -2.7199881076812744, "logps/chosen": -183.3325958251953, "logps/rejected": -765.2219848632812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.5486308336257935, "rewards/margins": 33.019866943359375, "rewards/rejected": -31.471233367919922, "step": 8310 }, { "epoch": 2.83, "learning_rate": 3.1851945108900914e-08, "logits/chosen": -2.923696994781494, "logits/rejected": -2.6853814125061035, "logps/chosen": -257.73358154296875, "logps/rejected": -811.6209716796875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.248937726020813, "rewards/margins": 35.611331939697266, "rewards/rejected": -34.362388610839844, "step": 8320 }, { "epoch": 2.83, "learning_rate": 3.122246002769734e-08, "logits/chosen": -2.9014363288879395, "logits/rejected": -2.7109408378601074, "logps/chosen": -189.33616638183594, "logps/rejected": -644.8126831054688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.0130256414413452, "rewards/margins": 33.65305709838867, "rewards/rejected": -32.64003372192383, "step": 8330 }, { "epoch": 2.83, "learning_rate": 3.0592974946493766e-08, "logits/chosen": -2.8610825538635254, "logits/rejected": -2.7753965854644775, "logps/chosen": -228.6236572265625, "logps/rejected": -612.8883056640625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 2.5022826194763184, "rewards/margins": 35.31542205810547, "rewards/rejected": -32.813140869140625, "step": 8340 }, { "epoch": 2.84, "learning_rate": 2.996348986529019e-08, "logits/chosen": -2.8168563842773438, "logits/rejected": -2.6722679138183594, "logps/chosen": -322.066650390625, "logps/rejected": -654.7132568359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.1526522636413574, "rewards/margins": 37.03415298461914, "rewards/rejected": -34.881500244140625, "step": 8350 }, { "epoch": 2.84, "learning_rate": 2.9334004784086618e-08, "logits/chosen": -2.8888437747955322, "logits/rejected": -2.6385433673858643, "logps/chosen": -214.4794921875, "logps/rejected": -765.9920043945312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3708596229553223, "rewards/margins": 35.8171272277832, "rewards/rejected": -34.44626998901367, "step": 8360 }, { "epoch": 2.85, "learning_rate": 2.870451970288304e-08, "logits/chosen": -2.815093517303467, "logits/rejected": -2.6457314491271973, "logps/chosen": -308.3783874511719, "logps/rejected": -810.87744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.4664806127548218, "rewards/margins": 34.69248580932617, "rewards/rejected": -33.22600555419922, "step": 8370 }, { "epoch": 2.85, "learning_rate": 2.8075034621679467e-08, "logits/chosen": -2.83091402053833, "logits/rejected": -2.676267147064209, "logps/chosen": -361.5912170410156, "logps/rejected": -474.63916015625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 2.431955337524414, "rewards/margins": 28.1231632232666, "rewards/rejected": -25.691204071044922, "step": 8380 }, { "epoch": 2.85, "learning_rate": 2.744554954047589e-08, "logits/chosen": -2.859971523284912, "logits/rejected": -2.694551944732666, "logps/chosen": -189.6149444580078, "logps/rejected": -709.7062377929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.1697099208831787, "rewards/margins": 33.811119079589844, "rewards/rejected": -32.64141082763672, "step": 8390 }, { "epoch": 2.86, "learning_rate": 2.6816064459272312e-08, "logits/chosen": -2.7933144569396973, "logits/rejected": -2.6871895790100098, "logps/chosen": -365.2389831542969, "logps/rejected": -730.922119140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.9460251331329346, "rewards/margins": 32.6923828125, "rewards/rejected": -30.74635887145996, "step": 8400 }, { "epoch": 2.86, "eval_logits/chosen": -2.93858003616333, "eval_logits/rejected": -2.7437851428985596, "eval_logps/chosen": -247.0050506591797, "eval_logps/rejected": -731.6737670898438, "eval_loss": 0.0029253766406327486, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.5951491594314575, "eval_rewards/margins": 34.544986724853516, "eval_rewards/rejected": -32.9498405456543, "eval_runtime": 463.0416, "eval_samples_per_second": 20.517, "eval_steps_per_second": 0.641, "step": 8400 }, { "epoch": 2.86, "learning_rate": 2.618657937806874e-08, "logits/chosen": -2.8827600479125977, "logits/rejected": -2.7538294792175293, "logps/chosen": -253.6561737060547, "logps/rejected": -666.5469970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.9339252710342407, "rewards/margins": 32.07635498046875, "rewards/rejected": -30.142431259155273, "step": 8410 }, { "epoch": 2.86, "learning_rate": 2.555709429686516e-08, "logits/chosen": -2.888822555541992, "logits/rejected": -2.7079339027404785, "logps/chosen": -332.5297546386719, "logps/rejected": -697.7379150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.9848358631134033, "rewards/margins": 33.17619705200195, "rewards/rejected": -31.191356658935547, "step": 8420 }, { "epoch": 2.87, "learning_rate": 2.4927609215661587e-08, "logits/chosen": -2.924531936645508, "logits/rejected": -2.7056684494018555, "logps/chosen": -199.9510498046875, "logps/rejected": -572.1231079101562, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 1.3454816341400146, "rewards/margins": 34.673255920410156, "rewards/rejected": -33.3277702331543, "step": 8430 }, { "epoch": 2.87, "learning_rate": 2.4298124134458013e-08, "logits/chosen": -2.928957462310791, "logits/rejected": -2.743579387664795, "logps/chosen": -202.9106903076172, "logps/rejected": -774.45166015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.139336347579956, "rewards/margins": 34.214088439941406, "rewards/rejected": -33.07474899291992, "step": 8440 }, { "epoch": 2.87, "learning_rate": 2.3668639053254436e-08, "logits/chosen": -2.801884889602661, "logits/rejected": -2.736821413040161, "logps/chosen": -205.0902862548828, "logps/rejected": -578.8478393554688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.9652506709098816, "rewards/margins": 31.689992904663086, "rewards/rejected": -30.724746704101562, "step": 8450 }, { "epoch": 2.88, "learning_rate": 2.3039153972050862e-08, "logits/chosen": -2.8657679557800293, "logits/rejected": -2.771345615386963, "logps/chosen": -261.31298828125, "logps/rejected": -913.3406372070312, "loss": 0.0024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3926279544830322, "rewards/margins": 36.627540588378906, "rewards/rejected": -35.23491287231445, "step": 8460 }, { "epoch": 2.88, "learning_rate": 2.2409668890847285e-08, "logits/chosen": -2.9273786544799805, "logits/rejected": -2.680814266204834, "logps/chosen": -199.24087524414062, "logps/rejected": -703.1931762695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.5375652313232422, "rewards/margins": 33.18684768676758, "rewards/rejected": -31.6492862701416, "step": 8470 }, { "epoch": 2.88, "learning_rate": 2.178018380964371e-08, "logits/chosen": -2.907334327697754, "logits/rejected": -2.7058966159820557, "logps/chosen": -263.55633544921875, "logps/rejected": -759.4267578125, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": 1.2649438381195068, "rewards/margins": 37.39045715332031, "rewards/rejected": -36.125511169433594, "step": 8480 }, { "epoch": 2.89, "learning_rate": 2.1150698728440137e-08, "logits/chosen": -2.909327507019043, "logits/rejected": -2.7049715518951416, "logps/chosen": -244.219970703125, "logps/rejected": -656.9718017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.5359927415847778, "rewards/margins": 34.044219970703125, "rewards/rejected": -32.50822830200195, "step": 8490 }, { "epoch": 2.89, "learning_rate": 2.052121364723656e-08, "logits/chosen": -2.9345736503601074, "logits/rejected": -2.709101676940918, "logps/chosen": -191.2003631591797, "logps/rejected": -661.4557495117188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.649473786354065, "rewards/margins": 31.498300552368164, "rewards/rejected": -29.848827362060547, "step": 8500 }, { "epoch": 2.89, "eval_logits/chosen": -2.949676513671875, "eval_logits/rejected": -2.754098892211914, "eval_logps/chosen": -247.28961181640625, "eval_logps/rejected": -731.5333251953125, "eval_loss": 0.0028794598765671253, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.566688895225525, "eval_rewards/margins": 34.50249099731445, "eval_rewards/rejected": -32.93579864501953, "eval_runtime": 462.299, "eval_samples_per_second": 20.549, "eval_steps_per_second": 0.642, "step": 8500 }, { "epoch": 2.89, "learning_rate": 1.9891728566032983e-08, "logits/chosen": -2.818610429763794, "logits/rejected": -2.763514518737793, "logps/chosen": -259.8336181640625, "logps/rejected": -558.02099609375, "loss": 0.0051, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4684436321258545, "rewards/margins": 33.00536346435547, "rewards/rejected": -31.53692054748535, "step": 8510 }, { "epoch": 2.9, "learning_rate": 1.926224348482941e-08, "logits/chosen": -2.8706917762756348, "logits/rejected": -2.6944868564605713, "logps/chosen": -307.8582763671875, "logps/rejected": -675.1591796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.5979983806610107, "rewards/margins": 35.30298614501953, "rewards/rejected": -33.704994201660156, "step": 8520 }, { "epoch": 2.9, "learning_rate": 1.863275840362583e-08, "logits/chosen": -2.8207685947418213, "logits/rejected": -2.746610641479492, "logps/chosen": -194.3468017578125, "logps/rejected": -881.32763671875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.0232957601547241, "rewards/margins": 37.870323181152344, "rewards/rejected": -36.84703063964844, "step": 8530 }, { "epoch": 2.9, "learning_rate": 1.8003273322422258e-08, "logits/chosen": -2.8564186096191406, "logits/rejected": -2.7289223670959473, "logps/chosen": -206.2211456298828, "logps/rejected": -723.2646484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.7703731060028076, "rewards/margins": 32.80350875854492, "rewards/rejected": -31.03313636779785, "step": 8540 }, { "epoch": 2.91, "learning_rate": 1.737378824121868e-08, "logits/chosen": -2.8191444873809814, "logits/rejected": -2.7419543266296387, "logps/chosen": -199.77188110351562, "logps/rejected": -710.7234497070312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.6947271823883057, "rewards/margins": 32.641624450683594, "rewards/rejected": -31.946903228759766, "step": 8550 }, { "epoch": 2.91, "learning_rate": 1.6744303160015107e-08, "logits/chosen": -2.912092924118042, "logits/rejected": -2.782015085220337, "logps/chosen": -257.63775634765625, "logps/rejected": -679.5921630859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.2363474369049072, "rewards/margins": 31.867115020751953, "rewards/rejected": -30.630767822265625, "step": 8560 }, { "epoch": 2.91, "learning_rate": 1.6114818078811533e-08, "logits/chosen": -2.8573620319366455, "logits/rejected": -2.5644099712371826, "logps/chosen": -271.6121520996094, "logps/rejected": -819.2306518554688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.0041792392730713, "rewards/margins": 32.7983283996582, "rewards/rejected": -31.794147491455078, "step": 8570 }, { "epoch": 2.92, "learning_rate": 1.5485332997607955e-08, "logits/chosen": -2.8703975677490234, "logits/rejected": -2.7564332485198975, "logps/chosen": -184.20236206054688, "logps/rejected": -702.0172119140625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.9957479238510132, "rewards/margins": 34.989768981933594, "rewards/rejected": -32.994022369384766, "step": 8580 }, { "epoch": 2.92, "learning_rate": 1.485584791640438e-08, "logits/chosen": -2.8433501720428467, "logits/rejected": -2.6756844520568848, "logps/chosen": -243.35073852539062, "logps/rejected": -751.3645629882812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.2838568687438965, "rewards/margins": 35.26364517211914, "rewards/rejected": -33.97978591918945, "step": 8590 }, { "epoch": 2.92, "learning_rate": 1.4226362835200804e-08, "logits/chosen": -2.9263997077941895, "logits/rejected": -2.710463762283325, "logps/chosen": -197.16151428222656, "logps/rejected": -753.781005859375, "loss": 0.0029, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.751476526260376, "rewards/margins": 33.50548553466797, "rewards/rejected": -31.75400733947754, "step": 8600 }, { "epoch": 2.92, "eval_logits/chosen": -2.951413869857788, "eval_logits/rejected": -2.7541491985321045, "eval_logps/chosen": -247.97059631347656, "eval_logps/rejected": -735.2822265625, "eval_loss": 0.002888133516535163, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.4985933303833008, "eval_rewards/margins": 34.80928039550781, "eval_rewards/rejected": -33.31068420410156, "eval_runtime": 463.0949, "eval_samples_per_second": 20.514, "eval_steps_per_second": 0.641, "step": 8600 }, { "epoch": 2.93, "learning_rate": 1.3596877753997229e-08, "logits/chosen": -2.8231263160705566, "logits/rejected": -2.782031297683716, "logps/chosen": -324.94549560546875, "logps/rejected": -591.5970458984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.7182931900024414, "rewards/margins": 36.668060302734375, "rewards/rejected": -34.94976806640625, "step": 8610 }, { "epoch": 2.93, "learning_rate": 1.2967392672793655e-08, "logits/chosen": -2.937084674835205, "logits/rejected": -2.6452908515930176, "logps/chosen": -217.12228393554688, "logps/rejected": -971.6239013671875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.1587468385696411, "rewards/margins": 33.779090881347656, "rewards/rejected": -32.620338439941406, "step": 8620 }, { "epoch": 2.93, "learning_rate": 1.233790759159008e-08, "logits/chosen": -2.833441734313965, "logits/rejected": -2.7452266216278076, "logps/chosen": -232.07705688476562, "logps/rejected": -710.0538330078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.1738405227661133, "rewards/margins": 33.40818405151367, "rewards/rejected": -32.23434066772461, "step": 8630 }, { "epoch": 2.94, "learning_rate": 1.1708422510386504e-08, "logits/chosen": -2.8302254676818848, "logits/rejected": -2.7558364868164062, "logps/chosen": -300.77178955078125, "logps/rejected": -756.2189331054688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 2.1941707134246826, "rewards/margins": 41.46385192871094, "rewards/rejected": -39.26968002319336, "step": 8640 }, { "epoch": 2.94, "learning_rate": 1.1078937429182926e-08, "logits/chosen": -2.7935729026794434, "logits/rejected": -2.724950075149536, "logps/chosen": -341.2043762207031, "logps/rejected": -810.2388305664062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.3720297813415527, "rewards/margins": 33.338443756103516, "rewards/rejected": -30.966411590576172, "step": 8650 }, { "epoch": 2.94, "learning_rate": 1.0449452347979353e-08, "logits/chosen": -2.8189926147460938, "logits/rejected": -2.63934326171875, "logps/chosen": -302.576416015625, "logps/rejected": -707.257568359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.1341822147369385, "rewards/margins": 32.629722595214844, "rewards/rejected": -30.49553871154785, "step": 8660 }, { "epoch": 2.95, "learning_rate": 9.819967266775777e-09, "logits/chosen": -2.7826783657073975, "logits/rejected": -2.6357524394989014, "logps/chosen": -197.04359436035156, "logps/rejected": -748.2913208007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.307620644569397, "rewards/margins": 33.984466552734375, "rewards/rejected": -32.67684555053711, "step": 8670 }, { "epoch": 2.95, "learning_rate": 9.190482185572201e-09, "logits/chosen": -2.781609535217285, "logits/rejected": -2.6577868461608887, "logps/chosen": -295.7722473144531, "logps/rejected": -862.9734497070312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.5047187805175781, "rewards/margins": 34.63649368286133, "rewards/rejected": -33.13177490234375, "step": 8680 }, { "epoch": 2.95, "learning_rate": 8.560997104368626e-09, "logits/chosen": -2.7609448432922363, "logits/rejected": -2.6951041221618652, "logps/chosen": -256.26678466796875, "logps/rejected": -583.029052734375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.7293832302093506, "rewards/margins": 31.212879180908203, "rewards/rejected": -29.483495712280273, "step": 8690 }, { "epoch": 2.96, "learning_rate": 7.931512023165052e-09, "logits/chosen": -2.793464422225952, "logits/rejected": -2.578908920288086, "logps/chosen": -260.47509765625, "logps/rejected": -675.4356689453125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.082809329032898, "rewards/margins": 30.84918785095215, "rewards/rejected": -29.76637840270996, "step": 8700 }, { "epoch": 2.96, "eval_logits/chosen": -2.951796293258667, "eval_logits/rejected": -2.7544431686401367, "eval_logps/chosen": -248.01109313964844, "eval_logps/rejected": -735.39306640625, "eval_loss": 0.00290289637632668, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.4945420026779175, "eval_rewards/margins": 34.816314697265625, "eval_rewards/rejected": -33.321773529052734, "eval_runtime": 463.2269, "eval_samples_per_second": 20.508, "eval_steps_per_second": 0.641, "step": 8700 }, { "epoch": 2.96, "learning_rate": 7.3020269419614755e-09, "logits/chosen": -2.8251395225524902, "logits/rejected": -2.661292552947998, "logps/chosen": -199.82347106933594, "logps/rejected": -916.0056762695312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.5197585821151733, "rewards/margins": 36.043338775634766, "rewards/rejected": -34.52357864379883, "step": 8710 }, { "epoch": 2.96, "learning_rate": 6.6725418607579e-09, "logits/chosen": -2.8604416847229004, "logits/rejected": -2.798391819000244, "logps/chosen": -277.4889831542969, "logps/rejected": -846.44091796875, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5089476108551025, "rewards/margins": 40.80601501464844, "rewards/rejected": -39.29706573486328, "step": 8720 }, { "epoch": 2.97, "learning_rate": 6.043056779554324e-09, "logits/chosen": -2.940009355545044, "logits/rejected": -2.6321115493774414, "logps/chosen": -206.8089141845703, "logps/rejected": -921.3098754882812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3747873306274414, "rewards/margins": 35.20962142944336, "rewards/rejected": -33.83483123779297, "step": 8730 }, { "epoch": 2.97, "learning_rate": 5.41357169835075e-09, "logits/chosen": -2.849306583404541, "logits/rejected": -2.661442756652832, "logps/chosen": -277.26129150390625, "logps/rejected": -736.669677734375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.4232017993927002, "rewards/margins": 37.99282455444336, "rewards/rejected": -36.56962203979492, "step": 8740 }, { "epoch": 2.97, "learning_rate": 4.784086617147173e-09, "logits/chosen": -2.798865795135498, "logits/rejected": -2.861072063446045, "logps/chosen": -368.9256591796875, "logps/rejected": -597.7943115234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.714577078819275, "rewards/margins": 31.3504581451416, "rewards/rejected": -29.635883331298828, "step": 8750 }, { "epoch": 2.98, "learning_rate": 4.1546015359435984e-09, "logits/chosen": -2.811037540435791, "logits/rejected": -2.812325954437256, "logps/chosen": -319.78192138671875, "logps/rejected": -737.4420166015625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.1931021213531494, "rewards/margins": 36.184112548828125, "rewards/rejected": -34.99100875854492, "step": 8760 }, { "epoch": 2.98, "learning_rate": 3.5251164547400225e-09, "logits/chosen": -2.788015127182007, "logits/rejected": -2.709311008453369, "logps/chosen": -297.61968994140625, "logps/rejected": -733.8298950195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3519922494888306, "rewards/margins": 33.35474395751953, "rewards/rejected": -32.002750396728516, "step": 8770 }, { "epoch": 2.98, "learning_rate": 2.895631373536447e-09, "logits/chosen": -2.84269118309021, "logits/rejected": -2.7168543338775635, "logps/chosen": -248.77706909179688, "logps/rejected": -561.2557373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.2624130249023438, "rewards/margins": 31.8685302734375, "rewards/rejected": -30.606115341186523, "step": 8780 }, { "epoch": 2.99, "learning_rate": 2.2661462923328713e-09, "logits/chosen": -2.8374154567718506, "logits/rejected": -2.758894205093384, "logps/chosen": -253.4519500732422, "logps/rejected": -774.97021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.312196969985962, "rewards/margins": 36.33062744140625, "rewards/rejected": -35.0184326171875, "step": 8790 }, { "epoch": 2.99, "learning_rate": 1.6366612111292962e-09, "logits/chosen": -2.868804931640625, "logits/rejected": -2.782166004180908, "logps/chosen": -201.5443572998047, "logps/rejected": -694.0833740234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.9795600771903992, "rewards/margins": 41.135257720947266, "rewards/rejected": -40.155696868896484, "step": 8800 }, { "epoch": 2.99, "eval_logits/chosen": -2.952502727508545, "eval_logits/rejected": -2.754737615585327, "eval_logps/chosen": -247.97816467285156, "eval_logps/rejected": -735.1167602539062, "eval_loss": 0.0028906739316880703, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 1.4978344440460205, "eval_rewards/margins": 34.79197692871094, "eval_rewards/rejected": -33.29414749145508, "eval_runtime": 463.8127, "eval_samples_per_second": 20.482, "eval_steps_per_second": 0.64, "step": 8800 }, { "epoch": 2.99, "learning_rate": 1.0071761299257208e-09, "logits/chosen": -2.8924288749694824, "logits/rejected": -2.691993236541748, "logps/chosen": -217.14035034179688, "logps/rejected": -747.6012573242188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.143796682357788, "rewards/margins": 35.540218353271484, "rewards/rejected": -33.39643096923828, "step": 8810 }, { "epoch": 3.0, "learning_rate": 3.7769104872214527e-10, "logits/chosen": -2.864406108856201, "logits/rejected": -2.7862467765808105, "logps/chosen": -207.127685546875, "logps/rejected": -606.5277709960938, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.4771344661712646, "rewards/margins": 33.160789489746094, "rewards/rejected": -31.68366050720215, "step": 8820 }, { "epoch": 3.0, "step": 8826, "total_flos": 0.0, "train_loss": 0.022219736984536855, "train_runtime": 94567.8662, "train_samples_per_second": 5.973, "train_steps_per_second": 0.093 } ], "logging_steps": 10, "max_steps": 8826, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }