{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994767137624281, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010465724751439038, "grad_norm": 7.312430627947873, "learning_rate": 5.208333333333333e-09, "logits/chosen": -2.6544837951660156, "logits/rejected": -2.5759358406066895, "logps/chosen": -101.20021057128906, "logps/rejected": -85.73662567138672, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.010465724751439037, "grad_norm": 6.798309843145578, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.5073227882385254, "logits/rejected": -2.498267650604248, "logps/chosen": -76.74188995361328, "logps/rejected": -68.77124786376953, "loss": 0.6931, "rewards/accuracies": 0.1597222238779068, "rewards/chosen": -0.00010638780076988041, "rewards/margins": -0.00026712569524534047, "rewards/rejected": 0.00016073790902737528, "step": 10 }, { "epoch": 0.020931449502878074, "grad_norm": 6.328409074575995, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.5329933166503906, "logits/rejected": -2.5121402740478516, "logps/chosen": -91.55634307861328, "logps/rejected": -97.98811340332031, "loss": 0.6931, "rewards/accuracies": 0.26875001192092896, "rewards/chosen": -8.86881971382536e-05, "rewards/margins": 0.0001090427249437198, "rewards/rejected": -0.00019773092935793102, "step": 20 }, { "epoch": 0.03139717425431711, "grad_norm": 5.987591394147146, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.6352438926696777, "logits/rejected": -2.624114513397217, "logps/chosen": -74.1455078125, "logps/rejected": -74.52520751953125, "loss": 0.6929, "rewards/accuracies": 0.2562499940395355, "rewards/chosen": -0.00024412055790890008, "rewards/margins": 0.00047922172234393656, "rewards/rejected": -0.0007233422948047519, "step": 30 }, { "epoch": 0.04186289900575615, "grad_norm": 6.813717975556312, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.5477070808410645, "logits/rejected": -2.495793104171753, "logps/chosen": -90.96524810791016, "logps/rejected": -89.30764770507812, "loss": 0.6925, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -3.622327858465724e-05, "rewards/margins": 0.0016909090336412191, "rewards/rejected": -0.0017271323595196009, "step": 40 }, { "epoch": 0.052328623757195186, "grad_norm": 6.382358089839322, "learning_rate": 2.604166666666667e-07, "logits/chosen": -2.5686728954315186, "logits/rejected": -2.5408482551574707, "logps/chosen": -76.85763549804688, "logps/rejected": -78.096923828125, "loss": 0.6914, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": 0.002494217362254858, "rewards/margins": 0.002928710076957941, "rewards/rejected": -0.00043449303484521806, "step": 50 }, { "epoch": 0.06279434850863422, "grad_norm": 6.142397539571157, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.520242214202881, "logits/rejected": -2.5160062313079834, "logps/chosen": -71.42273712158203, "logps/rejected": -71.63546752929688, "loss": 0.6901, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": 0.011287200264632702, "rewards/margins": 0.006170675158500671, "rewards/rejected": 0.0051165251061320305, "step": 60 }, { "epoch": 0.07326007326007326, "grad_norm": 6.834963526035445, "learning_rate": 3.645833333333333e-07, "logits/chosen": -2.462669849395752, "logits/rejected": -2.448659658432007, "logps/chosen": -72.4560775756836, "logps/rejected": -89.7001953125, "loss": 0.6863, "rewards/accuracies": 0.375, "rewards/chosen": 0.019068170338869095, "rewards/margins": 0.014615567401051521, "rewards/rejected": 0.0044526029378175735, "step": 70 }, { "epoch": 0.0837257980115123, "grad_norm": 7.385617041745493, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.4439797401428223, "logits/rejected": -2.4138834476470947, "logps/chosen": -81.91383361816406, "logps/rejected": -80.78437042236328, "loss": 0.6811, "rewards/accuracies": 0.34375, "rewards/chosen": 0.00990500207990408, "rewards/margins": 0.032499730587005615, "rewards/rejected": -0.02259472757577896, "step": 80 }, { "epoch": 0.09419152276295134, "grad_norm": 8.102390740927866, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.3960914611816406, "logits/rejected": -2.3794617652893066, "logps/chosen": -73.29556274414062, "logps/rejected": -76.41487121582031, "loss": 0.6784, "rewards/accuracies": 0.28125, "rewards/chosen": -0.040218498557806015, "rewards/margins": 0.016615843400359154, "rewards/rejected": -0.056834347546100616, "step": 90 }, { "epoch": 0.10465724751439037, "grad_norm": 6.755359569975091, "learning_rate": 4.999732492681437e-07, "logits/chosen": -2.3556602001190186, "logits/rejected": -2.3346455097198486, "logps/chosen": -78.03315734863281, "logps/rejected": -95.56941223144531, "loss": 0.6719, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.005892142653465271, "rewards/margins": 0.07231049239635468, "rewards/rejected": -0.07820263504981995, "step": 100 }, { "epoch": 0.10465724751439037, "eval_logits/chosen": -2.3486523628234863, "eval_logits/rejected": -2.3309624195098877, "eval_logps/chosen": -73.28648376464844, "eval_logps/rejected": -85.4478530883789, "eval_loss": 0.6686670184135437, "eval_rewards/accuracies": 0.3273809552192688, "eval_rewards/chosen": 0.012047496624290943, "eval_rewards/margins": 0.05524253472685814, "eval_rewards/rejected": -0.04319504275918007, "eval_runtime": 113.7223, "eval_samples_per_second": 17.587, "eval_steps_per_second": 0.554, "step": 100 }, { "epoch": 0.1151229722658294, "grad_norm": 13.391566933736375, "learning_rate": 4.996723692767926e-07, "logits/chosen": -2.406934976577759, "logits/rejected": -2.4089908599853516, "logps/chosen": -80.33895111083984, "logps/rejected": -93.52415466308594, "loss": 0.6563, "rewards/accuracies": 0.375, "rewards/chosen": -0.034039516001939774, "rewards/margins": 0.08607066422700882, "rewards/rejected": -0.1201101765036583, "step": 110 }, { "epoch": 0.12558869701726844, "grad_norm": 9.646857443256218, "learning_rate": 4.990375746213598e-07, "logits/chosen": -2.2991995811462402, "logits/rejected": -2.265392303466797, "logps/chosen": -76.64207458496094, "logps/rejected": -94.79474639892578, "loss": 0.6601, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.01742006093263626, "rewards/margins": 0.11232365667819977, "rewards/rejected": -0.12974372506141663, "step": 120 }, { "epoch": 0.1360544217687075, "grad_norm": 16.35113879818621, "learning_rate": 4.980697142834314e-07, "logits/chosen": -2.244642496109009, "logits/rejected": -2.2259411811828613, "logps/chosen": -65.43486022949219, "logps/rejected": -84.06461334228516, "loss": 0.66, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.04491991177201271, "rewards/margins": 0.0634341612458229, "rewards/rejected": -0.10835406929254532, "step": 130 }, { "epoch": 0.14652014652014653, "grad_norm": 17.442951288216843, "learning_rate": 4.967700826904229e-07, "logits/chosen": -2.218097686767578, "logits/rejected": -2.2064428329467773, "logps/chosen": -109.58231353759766, "logps/rejected": -120.42280578613281, "loss": 0.6529, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.18489012122154236, "rewards/margins": 0.14332745969295502, "rewards/rejected": -0.3282175660133362, "step": 140 }, { "epoch": 0.15698587127158556, "grad_norm": 21.84882656667421, "learning_rate": 4.951404179843962e-07, "logits/chosen": -2.3305749893188477, "logits/rejected": -2.3431344032287598, "logps/chosen": -69.88795471191406, "logps/rejected": -94.39766693115234, "loss": 0.6506, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": -0.14505597949028015, "rewards/margins": 0.1382911652326584, "rewards/rejected": -0.28334707021713257, "step": 150 }, { "epoch": 0.1674515960230246, "grad_norm": 13.04813612493708, "learning_rate": 4.931828996974498e-07, "logits/chosen": -2.0861306190490723, "logits/rejected": -2.0900378227233887, "logps/chosen": -87.21963500976562, "logps/rejected": -114.88661193847656, "loss": 0.6528, "rewards/accuracies": 0.3125, "rewards/chosen": -0.13326936960220337, "rewards/margins": 0.1332504153251648, "rewards/rejected": -0.26651981472969055, "step": 160 }, { "epoch": 0.17791732077446362, "grad_norm": 14.778101043613702, "learning_rate": 4.909001458367866e-07, "logits/chosen": -1.9173187017440796, "logits/rejected": -1.8886661529541016, "logps/chosen": -81.811767578125, "logps/rejected": -101.0234375, "loss": 0.634, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.16403724253177643, "rewards/margins": 0.19587047398090363, "rewards/rejected": -0.35990768671035767, "step": 170 }, { "epoch": 0.18838304552590268, "grad_norm": 26.424629291070275, "learning_rate": 4.882952093833627e-07, "logits/chosen": -1.3328689336776733, "logits/rejected": -1.3415606021881104, "logps/chosen": -121.7206039428711, "logps/rejected": -163.23861694335938, "loss": 0.6215, "rewards/accuracies": 0.375, "rewards/chosen": -0.4643549919128418, "rewards/margins": 0.2982317805290222, "rewards/rejected": -0.7625867128372192, "step": 180 }, { "epoch": 0.1988487702773417, "grad_norm": 18.286232555939137, "learning_rate": 4.853715742087946e-07, "logits/chosen": -1.299851655960083, "logits/rejected": -1.2263530492782593, "logps/chosen": -130.7896270751953, "logps/rejected": -153.83375549316406, "loss": 0.6187, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.5220120549201965, "rewards/margins": 0.28233885765075684, "rewards/rejected": -0.8043509721755981, "step": 190 }, { "epoch": 0.20931449502878074, "grad_norm": 22.532611448739736, "learning_rate": 4.821331504159906e-07, "logits/chosen": -1.2297742366790771, "logits/rejected": -1.1969741582870483, "logps/chosen": -117.6633071899414, "logps/rejected": -130.67193603515625, "loss": 0.6488, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.3429523706436157, "rewards/margins": 0.20222480595111847, "rewards/rejected": -0.545177161693573, "step": 200 }, { "epoch": 0.20931449502878074, "eval_logits/chosen": -1.2583706378936768, "eval_logits/rejected": -1.23958420753479, "eval_logps/chosen": -102.57432556152344, "eval_logps/rejected": -130.3724822998047, "eval_loss": 0.634810745716095, "eval_rewards/accuracies": 0.3373015820980072, "eval_rewards/chosen": -0.2808309495449066, "eval_rewards/margins": 0.21161039173603058, "eval_rewards/rejected": -0.492441326379776, "eval_runtime": 113.6607, "eval_samples_per_second": 17.596, "eval_steps_per_second": 0.554, "step": 200 }, { "epoch": 0.21978021978021978, "grad_norm": 43.96668983794973, "learning_rate": 4.785842691097342e-07, "logits/chosen": -1.166576623916626, "logits/rejected": -1.0878881216049194, "logps/chosen": -102.22654724121094, "logps/rejected": -119.87858581542969, "loss": 0.6449, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.30706560611724854, "rewards/margins": 0.2078002393245697, "rewards/rejected": -0.5148658752441406, "step": 210 }, { "epoch": 0.2302459445316588, "grad_norm": 21.60611363424848, "learning_rate": 4.7472967660421603e-07, "logits/chosen": -1.4838167428970337, "logits/rejected": -1.293268084526062, "logps/chosen": -140.9451446533203, "logps/rejected": -159.35256958007812, "loss": 0.6317, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.40442484617233276, "rewards/margins": 0.2767654359340668, "rewards/rejected": -0.6811902523040771, "step": 220 }, { "epoch": 0.24071166928309787, "grad_norm": 23.269156895827596, "learning_rate": 4.705745280752585e-07, "logits/chosen": -1.360938310623169, "logits/rejected": -1.2307772636413574, "logps/chosen": -92.72604370117188, "logps/rejected": -110.38468933105469, "loss": 0.6411, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.3325788080692291, "rewards/margins": 0.19615355134010315, "rewards/rejected": -0.5287323594093323, "step": 230 }, { "epoch": 0.25117739403453687, "grad_norm": 32.70987860826756, "learning_rate": 4.6612438066572555e-07, "logits/chosen": -1.1462054252624512, "logits/rejected": -0.9243119359016418, "logps/chosen": -129.86032104492188, "logps/rejected": -173.3503875732422, "loss": 0.6198, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.4205262064933777, "rewards/margins": 0.42327141761779785, "rewards/rejected": -0.8437975645065308, "step": 240 }, { "epoch": 0.2616431187859759, "grad_norm": 18.56295517177359, "learning_rate": 4.6138518605333664e-07, "logits/chosen": -0.9204837083816528, "logits/rejected": -0.8685296177864075, "logps/chosen": -83.56452178955078, "logps/rejected": -112.06929779052734, "loss": 0.6227, "rewards/accuracies": 0.24375000596046448, "rewards/chosen": -0.30188173055648804, "rewards/margins": 0.20583298802375793, "rewards/rejected": -0.5077147483825684, "step": 250 }, { "epoch": 0.272108843537415, "grad_norm": 24.278208900563758, "learning_rate": 4.5636328249082514e-07, "logits/chosen": -0.5377733111381531, "logits/rejected": -0.28726479411125183, "logps/chosen": -126.16239166259766, "logps/rejected": -144.44386291503906, "loss": 0.6132, "rewards/accuracies": 0.3125, "rewards/chosen": -0.43910473585128784, "rewards/margins": 0.27166199684143066, "rewards/rejected": -0.7107667922973633, "step": 260 }, { "epoch": 0.282574568288854, "grad_norm": 33.830045643602666, "learning_rate": 4.510653863290871e-07, "logits/chosen": -0.43365517258644104, "logits/rejected": -0.23527593910694122, "logps/chosen": -127.06233215332031, "logps/rejected": -150.31356811523438, "loss": 0.6191, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.4495798647403717, "rewards/margins": 0.3023374378681183, "rewards/rejected": -0.7519172430038452, "step": 270 }, { "epoch": 0.29304029304029305, "grad_norm": 24.405765366439173, "learning_rate": 4.4549858303465737e-07, "logits/chosen": -0.9159714579582214, "logits/rejected": -0.717955470085144, "logps/chosen": -109.26994323730469, "logps/rejected": -139.60409545898438, "loss": 0.622, "rewards/accuracies": 0.3125, "rewards/chosen": -0.3430403769016266, "rewards/margins": 0.24182644486427307, "rewards/rejected": -0.5848668217658997, "step": 280 }, { "epoch": 0.3035060177917321, "grad_norm": 18.88355766247808, "learning_rate": 4.396703177135261e-07, "logits/chosen": -1.3694268465042114, "logits/rejected": -1.1670420169830322, "logps/chosen": -97.29615783691406, "logps/rejected": -123.79087829589844, "loss": 0.6219, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.3448092043399811, "rewards/margins": 0.27321183681488037, "rewards/rejected": -0.6180210709571838, "step": 290 }, { "epoch": 0.3139717425431711, "grad_norm": 24.161262620892497, "learning_rate": 4.335883851539693e-07, "logits/chosen": -1.5254216194152832, "logits/rejected": -1.4045015573501587, "logps/chosen": -147.19302368164062, "logps/rejected": -174.0820770263672, "loss": 0.6331, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5355431437492371, "rewards/margins": 0.2548714876174927, "rewards/rejected": -0.7904146313667297, "step": 300 }, { "epoch": 0.3139717425431711, "eval_logits/chosen": -1.1872740983963013, "eval_logits/rejected": -1.0319762229919434, "eval_logps/chosen": -120.13069915771484, "eval_logps/rejected": -157.09765625, "eval_loss": 0.6194990277290344, "eval_rewards/accuracies": 0.3452380895614624, "eval_rewards/chosen": -0.45639467239379883, "eval_rewards/margins": 0.3032984435558319, "eval_rewards/rejected": -0.7596930265426636, "eval_runtime": 113.7203, "eval_samples_per_second": 17.587, "eval_steps_per_second": 0.554, "step": 300 }, { "epoch": 0.32443746729461015, "grad_norm": 27.882854283662915, "learning_rate": 4.272609194017105e-07, "logits/chosen": -0.8721631765365601, "logits/rejected": -0.47974568605422974, "logps/chosen": -142.952392578125, "logps/rejected": -164.13180541992188, "loss": 0.6108, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.44492608308792114, "rewards/margins": 0.36958834528923035, "rewards/rejected": -0.8145144581794739, "step": 310 }, { "epoch": 0.3349031920460492, "grad_norm": 40.79438243043005, "learning_rate": 4.2069638288135547e-07, "logits/chosen": 0.030854111537337303, "logits/rejected": 0.30916082859039307, "logps/chosen": -143.35768127441406, "logps/rejected": -215.4879608154297, "loss": 0.646, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.6657307147979736, "rewards/margins": 0.5068569779396057, "rewards/rejected": -1.1725876331329346, "step": 320 }, { "epoch": 0.3453689167974882, "grad_norm": 28.74932802994849, "learning_rate": 4.139035550786494e-07, "logits/chosen": 0.10449258983135223, "logits/rejected": 0.19263358414173126, "logps/chosen": -125.71956634521484, "logps/rejected": -157.4947509765625, "loss": 0.6184, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.5608196258544922, "rewards/margins": 0.2601728141307831, "rewards/rejected": -0.8209924697875977, "step": 330 }, { "epoch": 0.35583464154892724, "grad_norm": 20.26835641704999, "learning_rate": 4.0689152079869306e-07, "logits/chosen": -0.6976083517074585, "logits/rejected": -0.4943923354148865, "logps/chosen": -127.49371337890625, "logps/rejected": -158.2890625, "loss": 0.6271, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5544435977935791, "rewards/margins": 0.3053310215473175, "rewards/rejected": -0.8597745895385742, "step": 340 }, { "epoch": 0.3663003663003663, "grad_norm": 27.63474707817879, "learning_rate": 3.99669658015821e-07, "logits/chosen": -0.6875920295715332, "logits/rejected": -0.5427245497703552, "logps/chosen": -149.16458129882812, "logps/rejected": -175.6387481689453, "loss": 0.6043, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.6470784544944763, "rewards/margins": 0.2719033360481262, "rewards/rejected": -0.9189817309379578, "step": 350 }, { "epoch": 0.37676609105180536, "grad_norm": 30.4860863672758, "learning_rate": 3.92247625331392e-07, "logits/chosen": 0.019889334216713905, "logits/rejected": 0.449519544839859, "logps/chosen": -158.36898803710938, "logps/rejected": -190.15274047851562, "loss": 0.5979, "rewards/accuracies": 0.40625, "rewards/chosen": -0.6107655763626099, "rewards/margins": 0.40659332275390625, "rewards/rejected": -1.0173588991165161, "step": 360 }, { "epoch": 0.3872318158032444, "grad_norm": 28.105949004143817, "learning_rate": 3.846353490562664e-07, "logits/chosen": 0.23578917980194092, "logits/rejected": 0.578147292137146, "logps/chosen": -144.15545654296875, "logps/rejected": -183.06863403320312, "loss": 0.6068, "rewards/accuracies": 0.4375, "rewards/chosen": -0.577487587928772, "rewards/margins": 0.37577182054519653, "rewards/rejected": -0.9532594680786133, "step": 370 }, { "epoch": 0.3976975405546834, "grad_norm": 19.345534639045226, "learning_rate": 3.768430099352445e-07, "logits/chosen": -0.3674705922603607, "logits/rejected": 0.07584401965141296, "logps/chosen": -130.8544464111328, "logps/rejected": -168.41697692871094, "loss": 0.6092, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.49021464586257935, "rewards/margins": 0.39871546626091003, "rewards/rejected": -0.8889301419258118, "step": 380 }, { "epoch": 0.40816326530612246, "grad_norm": 29.65852362420373, "learning_rate": 3.6888102953122304e-07, "logits/chosen": -0.23846562206745148, "logits/rejected": -0.008897816762328148, "logps/chosen": -138.2523956298828, "logps/rejected": -194.8299102783203, "loss": 0.6241, "rewards/accuracies": 0.40625, "rewards/chosen": -0.5844453573226929, "rewards/margins": 0.4089936316013336, "rewards/rejected": -0.9934390187263489, "step": 390 }, { "epoch": 0.4186289900575615, "grad_norm": 42.485008339554724, "learning_rate": 3.607600562872785e-07, "logits/chosen": 0.0847388356924057, "logits/rejected": 0.3739756643772125, "logps/chosen": -133.99862670898438, "logps/rejected": -157.2317352294922, "loss": 0.6321, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.6070858836174011, "rewards/margins": 0.2605039179325104, "rewards/rejected": -0.8675897717475891, "step": 400 }, { "epoch": 0.4186289900575615, "eval_logits/chosen": 0.03354182466864586, "eval_logits/rejected": 0.37280067801475525, "eval_logps/chosen": -146.9637451171875, "eval_logps/rejected": -190.27566528320312, "eval_loss": 0.6099374294281006, "eval_rewards/accuracies": 0.363095223903656, "eval_rewards/chosen": -0.7247251272201538, "eval_rewards/margins": 0.36674803495407104, "eval_rewards/rejected": -1.0914732217788696, "eval_runtime": 113.6653, "eval_samples_per_second": 17.596, "eval_steps_per_second": 0.554, "step": 400 }, { "epoch": 0.4290947148090005, "grad_norm": 17.771390083818016, "learning_rate": 3.5249095128531856e-07, "logits/chosen": -0.3344365656375885, "logits/rejected": -0.13601410388946533, "logps/chosen": -168.97885131835938, "logps/rejected": -211.15029907226562, "loss": 0.6274, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.8895459175109863, "rewards/margins": 0.2965359091758728, "rewards/rejected": -1.186081886291504, "step": 410 }, { "epoch": 0.43956043956043955, "grad_norm": 29.30170184710936, "learning_rate": 3.4408477372034736e-07, "logits/chosen": -0.6253395080566406, "logits/rejected": -0.4562221169471741, "logps/chosen": -117.9905014038086, "logps/rejected": -139.88853454589844, "loss": 0.6305, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.49884548783302307, "rewards/margins": 0.21792730689048767, "rewards/rejected": -0.7167727947235107, "step": 420 }, { "epoch": 0.4500261643118786, "grad_norm": 19.52934749077977, "learning_rate": 3.3555276610977276e-07, "logits/chosen": 0.060841239988803864, "logits/rejected": 0.28291866183280945, "logps/chosen": -119.32076263427734, "logps/rejected": -161.02288818359375, "loss": 0.6071, "rewards/accuracies": 0.28125, "rewards/chosen": -0.5378307104110718, "rewards/margins": 0.3328271806240082, "rewards/rejected": -0.8706579208374023, "step": 430 }, { "epoch": 0.4604918890633176, "grad_norm": 28.977327104522658, "learning_rate": 3.269063392575352e-07, "logits/chosen": 0.562627911567688, "logits/rejected": 0.706725001335144, "logps/chosen": -145.38662719726562, "logps/rejected": -177.89791870117188, "loss": 0.6155, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.7169743776321411, "rewards/margins": 0.19164128601551056, "rewards/rejected": -0.9086155891418457, "step": 440 }, { "epoch": 0.47095761381475665, "grad_norm": 21.22082345177454, "learning_rate": 3.1815705699316964e-07, "logits/chosen": 0.6601327657699585, "logits/rejected": 0.9368169903755188, "logps/chosen": -159.88311767578125, "logps/rejected": -201.655517578125, "loss": 0.6175, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.6956228017807007, "rewards/margins": 0.4413565993309021, "rewards/rejected": -1.1369794607162476, "step": 450 }, { "epoch": 0.48142333856619574, "grad_norm": 19.35642436173428, "learning_rate": 3.0931662070620794e-07, "logits/chosen": 0.47756925225257874, "logits/rejected": 1.0075833797454834, "logps/chosen": -136.54437255859375, "logps/rejected": -176.9443359375, "loss": 0.6108, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.6124375462532043, "rewards/margins": 0.4467083811759949, "rewards/rejected": -1.0591459274291992, "step": 460 }, { "epoch": 0.49188906331763477, "grad_norm": 35.6743715566195, "learning_rate": 3.003968536966078e-07, "logits/chosen": 0.9841750264167786, "logits/rejected": 1.1566669940948486, "logps/chosen": -134.76565551757812, "logps/rejected": -183.22018432617188, "loss": 0.6033, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.6426252722740173, "rewards/margins": 0.34931105375289917, "rewards/rejected": -0.9919363856315613, "step": 470 }, { "epoch": 0.5023547880690737, "grad_norm": 31.355461154744457, "learning_rate": 2.9140968536213693e-07, "logits/chosen": 1.8672128915786743, "logits/rejected": 2.3499321937561035, "logps/chosen": -142.2679443359375, "logps/rejected": -197.79867553710938, "loss": 0.6029, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.6666483879089355, "rewards/margins": 0.5482696890830994, "rewards/rejected": -1.2149180173873901, "step": 480 }, { "epoch": 0.5128205128205128, "grad_norm": 39.792642390254535, "learning_rate": 2.823671352438608e-07, "logits/chosen": 2.002504825592041, "logits/rejected": 2.7407174110412598, "logps/chosen": -147.71644592285156, "logps/rejected": -184.10256958007812, "loss": 0.6191, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.5607318878173828, "rewards/margins": 0.49417656660079956, "rewards/rejected": -1.0549085140228271, "step": 490 }, { "epoch": 0.5232862375719518, "grad_norm": 19.501719693409513, "learning_rate": 2.73281296951072e-07, "logits/chosen": 2.3184399604797363, "logits/rejected": 2.6984035968780518, "logps/chosen": -179.01693725585938, "logps/rejected": -221.22781372070312, "loss": 0.6318, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.8433685302734375, "rewards/margins": 0.404899924993515, "rewards/rejected": -1.248268485069275, "step": 500 }, { "epoch": 0.5232862375719518, "eval_logits/chosen": 2.6547250747680664, "eval_logits/rejected": 2.9545063972473145, "eval_logps/chosen": -155.49295043945312, "eval_logps/rejected": -204.6371307373047, "eval_loss": 0.6104578375816345, "eval_rewards/accuracies": 0.3551587164402008, "eval_rewards/chosen": -0.8100170493125916, "eval_rewards/margins": 0.4250708818435669, "eval_rewards/rejected": -1.2350879907608032, "eval_runtime": 113.5938, "eval_samples_per_second": 17.607, "eval_steps_per_second": 0.555, "step": 500 }, { "epoch": 0.533751962323391, "grad_norm": 23.676894445603228, "learning_rate": 2.641643219871597e-07, "logits/chosen": 2.713271379470825, "logits/rejected": 2.843205213546753, "logps/chosen": -130.48731994628906, "logps/rejected": -174.28225708007812, "loss": 0.6232, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.6893380284309387, "rewards/margins": 0.37240949273109436, "rewards/rejected": -1.061747431755066, "step": 510 }, { "epoch": 0.54421768707483, "grad_norm": 35.84824398777263, "learning_rate": 2.550284034980507e-07, "logits/chosen": 2.0955018997192383, "logits/rejected": 2.464780330657959, "logps/chosen": -167.99371337890625, "logps/rejected": -198.072021484375, "loss": 0.6346, "rewards/accuracies": 0.26875001192092896, "rewards/chosen": -0.9579731822013855, "rewards/margins": 0.2499997913837433, "rewards/rejected": -1.2079728841781616, "step": 520 }, { "epoch": 0.554683411826269, "grad_norm": 19.326207651996775, "learning_rate": 2.4588575996495794e-07, "logits/chosen": 1.578254222869873, "logits/rejected": 1.7954685688018799, "logps/chosen": -124.0101547241211, "logps/rejected": -157.9552764892578, "loss": 0.6063, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.6386234760284424, "rewards/margins": 0.3164999485015869, "rewards/rejected": -0.9551234245300293, "step": 530 }, { "epoch": 0.565149136577708, "grad_norm": 25.47159513000541, "learning_rate": 2.367486188632446e-07, "logits/chosen": 1.5957086086273193, "logits/rejected": 1.9952272176742554, "logps/chosen": -136.2624053955078, "logps/rejected": -160.97589111328125, "loss": 0.6202, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.6916071772575378, "rewards/margins": 0.2573884129524231, "rewards/rejected": -0.9489954710006714, "step": 540 }, { "epoch": 0.5756148613291471, "grad_norm": 27.189003326527832, "learning_rate": 2.276292003092593e-07, "logits/chosen": 0.5969494581222534, "logits/rejected": 1.1224400997161865, "logps/chosen": -165.4878387451172, "logps/rejected": -201.8157958984375, "loss": 0.6072, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.7528584599494934, "rewards/margins": 0.4292448163032532, "rewards/rejected": -1.1821032762527466, "step": 550 }, { "epoch": 0.5860805860805861, "grad_norm": 19.01493291730421, "learning_rate": 2.185397007170141e-07, "logits/chosen": 0.5469252467155457, "logits/rejected": 0.9992968440055847, "logps/chosen": -109.37480163574219, "logps/rejected": -158.11180114746094, "loss": 0.607, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.5237552523612976, "rewards/margins": 0.4429488778114319, "rewards/rejected": -0.9667040705680847, "step": 560 }, { "epoch": 0.5965463108320251, "grad_norm": 34.53884646430518, "learning_rate": 2.094922764865619e-07, "logits/chosen": 0.47024235129356384, "logits/rejected": 1.3419710397720337, "logps/chosen": -152.4022674560547, "logps/rejected": -181.06829833984375, "loss": 0.6036, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.6253000497817993, "rewards/margins": 0.45260563492774963, "rewards/rejected": -1.0779056549072266, "step": 570 }, { "epoch": 0.6070120355834642, "grad_norm": 23.195476547368756, "learning_rate": 2.0049902774588797e-07, "logits/chosen": 0.7025114297866821, "logits/rejected": 1.4946035146713257, "logps/chosen": -155.9736328125, "logps/rejected": -197.6754150390625, "loss": 0.6174, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.7866873741149902, "rewards/margins": 0.5116966366767883, "rewards/rejected": -1.2983839511871338, "step": 580 }, { "epoch": 0.6174777603349032, "grad_norm": 28.28948495055075, "learning_rate": 1.9157198216806238e-07, "logits/chosen": 1.2547266483306885, "logits/rejected": 1.6798299551010132, "logps/chosen": -143.81948852539062, "logps/rejected": -180.07110595703125, "loss": 0.5974, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.7290927171707153, "rewards/margins": 0.344825804233551, "rewards/rejected": -1.0739185810089111, "step": 590 }, { "epoch": 0.6279434850863422, "grad_norm": 26.475084176869974, "learning_rate": 1.8272307888529274e-07, "logits/chosen": 1.0473191738128662, "logits/rejected": 1.4664316177368164, "logps/chosen": -151.5343017578125, "logps/rejected": -203.01600646972656, "loss": 0.5978, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.8304673433303833, "rewards/margins": 0.4478435516357422, "rewards/rejected": -1.278310775756836, "step": 600 }, { "epoch": 0.6279434850863422, "eval_logits/chosen": 0.9605558514595032, "eval_logits/rejected": 1.442029356956482, "eval_logps/chosen": -147.85601806640625, "eval_logps/rejected": -199.51206970214844, "eval_loss": 0.6014743447303772, "eval_rewards/accuracies": 0.3591269850730896, "eval_rewards/chosen": -0.733647882938385, "eval_rewards/margins": 0.4501895010471344, "eval_rewards/rejected": -1.1838374137878418, "eval_runtime": 113.6375, "eval_samples_per_second": 17.6, "eval_steps_per_second": 0.554, "step": 600 }, { "epoch": 0.6384092098377813, "grad_norm": 24.64809094353209, "learning_rate": 1.7396415252139288e-07, "logits/chosen": 0.9734107255935669, "logits/rejected": 1.4407756328582764, "logps/chosen": -136.4254150390625, "logps/rejected": -157.22183227539062, "loss": 0.6155, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.6510840654373169, "rewards/margins": 0.3478087782859802, "rewards/rejected": -0.9988927841186523, "step": 610 }, { "epoch": 0.6488749345892203, "grad_norm": 24.74151564246123, "learning_rate": 1.6530691736402316e-07, "logits/chosen": 0.4937843680381775, "logits/rejected": 0.9153006672859192, "logps/chosen": -137.00144958496094, "logps/rejected": -177.6410675048828, "loss": 0.6073, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.725879967212677, "rewards/margins": 0.3728798031806946, "rewards/rejected": -1.0987598896026611, "step": 620 }, { "epoch": 0.6593406593406593, "grad_norm": 22.938989151746902, "learning_rate": 1.5676295169786864e-07, "logits/chosen": 0.19430339336395264, "logits/rejected": 0.6654868721961975, "logps/chosen": -176.47686767578125, "logps/rejected": -213.91622924804688, "loss": 0.5789, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.7090758085250854, "rewards/margins": 0.45301565527915955, "rewards/rejected": -1.1620914936065674, "step": 630 }, { "epoch": 0.6698063840920984, "grad_norm": 43.50900642344418, "learning_rate": 1.483436823197092e-07, "logits/chosen": 0.7957710027694702, "logits/rejected": 1.4320136308670044, "logps/chosen": -182.98187255859375, "logps/rejected": -233.4198455810547, "loss": 0.5792, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.872964084148407, "rewards/margins": 0.5010480284690857, "rewards/rejected": -1.3740123510360718, "step": 640 }, { "epoch": 0.6802721088435374, "grad_norm": 24.400296552508813, "learning_rate": 1.4006036925609243e-07, "logits/chosen": 1.008284330368042, "logits/rejected": 1.555418848991394, "logps/chosen": -128.50022888183594, "logps/rejected": -170.05349731445312, "loss": 0.59, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.5598582029342651, "rewards/margins": 0.4066081643104553, "rewards/rejected": -0.9664663076400757, "step": 650 }, { "epoch": 0.6907378335949764, "grad_norm": 31.773454484895552, "learning_rate": 1.319240907040458e-07, "logits/chosen": 0.5741680860519409, "logits/rejected": 1.095399022102356, "logps/chosen": -144.4388427734375, "logps/rejected": -190.87571716308594, "loss": 0.6023, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.5422704815864563, "rewards/margins": 0.48283880949020386, "rewards/rejected": -1.0251094102859497, "step": 660 }, { "epoch": 0.7012035583464155, "grad_norm": 37.13339246311252, "learning_rate": 1.239457282149695e-07, "logits/chosen": 0.8501984477043152, "logits/rejected": 1.152748942375183, "logps/chosen": -102.63143157958984, "logps/rejected": -154.28704833984375, "loss": 0.5949, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.4462759494781494, "rewards/margins": 0.41997307538986206, "rewards/rejected": -0.8662489652633667, "step": 670 }, { "epoch": 0.7116692830978545, "grad_norm": 32.56523005952964, "learning_rate": 1.1613595214152711e-07, "logits/chosen": 1.335402250289917, "logits/rejected": 1.811517357826233, "logps/chosen": -135.314453125, "logps/rejected": -198.50515747070312, "loss": 0.6071, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.587442934513092, "rewards/margins": 0.5670086145401001, "rewards/rejected": -1.154451608657837, "step": 680 }, { "epoch": 0.7221350078492935, "grad_norm": 23.014936124623496, "learning_rate": 1.0850520736699362e-07, "logits/chosen": 0.7806999683380127, "logits/rejected": 1.3070814609527588, "logps/chosen": -168.81216430664062, "logps/rejected": -202.5950469970703, "loss": 0.5659, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.7739877700805664, "rewards/margins": 0.40968823432922363, "rewards/rejected": -1.18367600440979, "step": 690 }, { "epoch": 0.7326007326007326, "grad_norm": 26.764323919821496, "learning_rate": 1.0106369933615042e-07, "logits/chosen": 0.9588180780410767, "logits/rejected": 1.4978833198547363, "logps/chosen": -165.7656707763672, "logps/rejected": -204.122802734375, "loss": 0.6113, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.9200434684753418, "rewards/margins": 0.37314558029174805, "rewards/rejected": -1.2931890487670898, "step": 700 }, { "epoch": 0.7326007326007326, "eval_logits/chosen": 1.1833491325378418, "eval_logits/rejected": 1.7187780141830444, "eval_logps/chosen": -150.68544006347656, "eval_logps/rejected": -204.91946411132812, "eval_loss": 0.5986347794532776, "eval_rewards/accuracies": 0.3650793731212616, "eval_rewards/chosen": -0.7619420289993286, "eval_rewards/margins": 0.4759688675403595, "eval_rewards/rejected": -1.2379108667373657, "eval_runtime": 113.6638, "eval_samples_per_second": 17.596, "eval_steps_per_second": 0.554, "step": 700 }, { "epoch": 0.7430664573521716, "grad_norm": 27.404707816356066, "learning_rate": 9.382138040640714e-08, "logits/chosen": 0.7452703714370728, "logits/rejected": 1.4386818408966064, "logps/chosen": -175.61923217773438, "logps/rejected": -214.57119750976562, "loss": 0.618, "rewards/accuracies": 0.40625, "rewards/chosen": -0.852981686592102, "rewards/margins": 0.44363918900489807, "rewards/rejected": -1.2966209650039673, "step": 710 }, { "epoch": 0.7535321821036107, "grad_norm": 27.03589930657382, "learning_rate": 8.678793653740632e-08, "logits/chosen": 1.3341294527053833, "logits/rejected": 1.880934476852417, "logps/chosen": -154.9960174560547, "logps/rejected": -193.92404174804688, "loss": 0.5852, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.8138143420219421, "rewards/margins": 0.4843239188194275, "rewards/rejected": -1.2981382608413696, "step": 720 }, { "epoch": 0.7639979068550498, "grad_norm": 19.503526282992237, "learning_rate": 7.997277433690983e-08, "logits/chosen": 1.2488057613372803, "logits/rejected": 1.543897271156311, "logps/chosen": -179.2456817626953, "logps/rejected": -218.6804656982422, "loss": 0.6017, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.9957185983657837, "rewards/margins": 0.3561645448207855, "rewards/rejected": -1.3518832921981812, "step": 730 }, { "epoch": 0.7744636316064888, "grad_norm": 24.561492093850955, "learning_rate": 7.338500848029602e-08, "logits/chosen": 0.7433587908744812, "logits/rejected": 1.235414981842041, "logps/chosen": -166.8399200439453, "logps/rejected": -215.26876831054688, "loss": 0.6178, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.9067522883415222, "rewards/margins": 0.41579413414001465, "rewards/rejected": -1.322546362876892, "step": 740 }, { "epoch": 0.7849293563579278, "grad_norm": 28.115986705653192, "learning_rate": 6.70334495204884e-08, "logits/chosen": 0.6296231150627136, "logits/rejected": 1.1273549795150757, "logps/chosen": -182.21527099609375, "logps/rejected": -222.38735961914062, "loss": 0.6104, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.9979060292243958, "rewards/margins": 0.37623411417007446, "rewards/rejected": -1.3741401433944702, "step": 750 }, { "epoch": 0.7953950811093669, "grad_norm": 25.524078897067774, "learning_rate": 6.092659210462231e-08, "logits/chosen": 0.6932498216629028, "logits/rejected": 1.2560994625091553, "logps/chosen": -163.45713806152344, "logps/rejected": -210.75875854492188, "loss": 0.6428, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.8338336944580078, "rewards/margins": 0.41811808943748474, "rewards/rejected": -1.2519516944885254, "step": 760 }, { "epoch": 0.8058608058608059, "grad_norm": 19.037858132036448, "learning_rate": 5.507260361320737e-08, "logits/chosen": 0.7545400261878967, "logits/rejected": 1.186693787574768, "logps/chosen": -139.1665802001953, "logps/rejected": -173.541748046875, "loss": 0.5694, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.6477771401405334, "rewards/margins": 0.3577363193035126, "rewards/rejected": -1.0055135488510132, "step": 770 }, { "epoch": 0.8163265306122449, "grad_norm": 25.539683354526822, "learning_rate": 4.947931323697982e-08, "logits/chosen": 0.8985282778739929, "logits/rejected": 1.1507294178009033, "logps/chosen": -127.5772933959961, "logps/rejected": -150.71971130371094, "loss": 0.6274, "rewards/accuracies": 0.2562499940395355, "rewards/chosen": -0.5861515402793884, "rewards/margins": 0.2556864023208618, "rewards/rejected": -0.8418378829956055, "step": 780 }, { "epoch": 0.826792255363684, "grad_norm": 29.296410897786252, "learning_rate": 4.415420150605398e-08, "logits/chosen": 0.5066564083099365, "logits/rejected": 1.2835947275161743, "logps/chosen": -169.21194458007812, "logps/rejected": -226.9739990234375, "loss": 0.5972, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7719463109970093, "rewards/margins": 0.5745865702629089, "rewards/rejected": -1.346532940864563, "step": 790 }, { "epoch": 0.837257980115123, "grad_norm": 23.071826857167007, "learning_rate": 3.9104390285376374e-08, "logits/chosen": 0.2603650689125061, "logits/rejected": 0.9912735819816589, "logps/chosen": -189.1995086669922, "logps/rejected": -238.74069213867188, "loss": 0.5885, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.8696195483207703, "rewards/margins": 0.551138162612915, "rewards/rejected": -1.4207580089569092, "step": 800 }, { "epoch": 0.837257980115123, "eval_logits/chosen": 0.5612532496452332, "eval_logits/rejected": 1.0127543210983276, "eval_logps/chosen": -141.69253540039062, "eval_logps/rejected": -192.48452758789062, "eval_loss": 0.5973930954933167, "eval_rewards/accuracies": 0.369047611951828, "eval_rewards/chosen": -0.672012984752655, "eval_rewards/margins": 0.4415486454963684, "eval_rewards/rejected": -1.113561749458313, "eval_runtime": 113.6177, "eval_samples_per_second": 17.603, "eval_steps_per_second": 0.554, "step": 800 }, { "epoch": 0.847723704866562, "grad_norm": 17.313265289048484, "learning_rate": 3.433663324986208e-08, "logits/chosen": 0.3017066419124603, "logits/rejected": 0.7334527373313904, "logps/chosen": -151.9001007080078, "logps/rejected": -182.63177490234375, "loss": 0.613, "rewards/accuracies": 0.375, "rewards/chosen": -0.6817248463630676, "rewards/margins": 0.36351272463798523, "rewards/rejected": -1.0452375411987305, "step": 810 }, { "epoch": 0.858189429618001, "grad_norm": 32.324000689082936, "learning_rate": 2.9857306851953897e-08, "logits/chosen": 0.6850260496139526, "logits/rejected": 1.2093479633331299, "logps/chosen": -144.95767211914062, "logps/rejected": -198.352783203125, "loss": 0.6, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.6805930733680725, "rewards/margins": 0.44533902406692505, "rewards/rejected": -1.1259320974349976, "step": 820 }, { "epoch": 0.8686551543694401, "grad_norm": 27.72643919363414, "learning_rate": 2.567240179368185e-08, "logits/chosen": -0.07449465245008469, "logits/rejected": 0.24005027115345, "logps/chosen": -154.6387176513672, "logps/rejected": -227.32882690429688, "loss": 0.6169, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.7048233151435852, "rewards/margins": 0.4948086142539978, "rewards/rejected": -1.199631929397583, "step": 830 }, { "epoch": 0.8791208791208791, "grad_norm": 23.57031908849105, "learning_rate": 2.1787515014630357e-08, "logits/chosen": 0.2434501200914383, "logits/rejected": 0.7666997313499451, "logps/chosen": -151.32212829589844, "logps/rejected": -209.18057250976562, "loss": 0.5855, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.6667734980583191, "rewards/margins": 0.47117409110069275, "rewards/rejected": -1.1379475593566895, "step": 840 }, { "epoch": 0.8895866038723181, "grad_norm": 27.822093386580878, "learning_rate": 1.820784220652766e-08, "logits/chosen": 0.09173062443733215, "logits/rejected": 0.7935197949409485, "logps/chosen": -160.8597869873047, "logps/rejected": -215.7743377685547, "loss": 0.5858, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.6746786236763, "rewards/margins": 0.5673048496246338, "rewards/rejected": -1.2419836521148682, "step": 850 }, { "epoch": 0.9000523286237572, "grad_norm": 24.993719056225927, "learning_rate": 1.4938170864468636e-08, "logits/chosen": -0.022973239421844482, "logits/rejected": 0.728354811668396, "logps/chosen": -177.0865936279297, "logps/rejected": -222.30313110351562, "loss": 0.587, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.7254469990730286, "rewards/margins": 0.5123113989830017, "rewards/rejected": -1.2377583980560303, "step": 860 }, { "epoch": 0.9105180533751962, "grad_norm": 15.152551046729396, "learning_rate": 1.1982873884064465e-08, "logits/chosen": 0.3221861720085144, "logits/rejected": 0.7626439332962036, "logps/chosen": -138.0549774169922, "logps/rejected": -176.89402770996094, "loss": 0.6052, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.6149898767471313, "rewards/margins": 0.41914796829223633, "rewards/rejected": -1.0341379642486572, "step": 870 }, { "epoch": 0.9209837781266352, "grad_norm": 34.8714524353856, "learning_rate": 9.345903713082304e-09, "logits/chosen": 0.6613011360168457, "logits/rejected": 0.8276697397232056, "logps/chosen": -124.2752914428711, "logps/rejected": -161.05874633789062, "loss": 0.6179, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": -0.602181077003479, "rewards/margins": 0.27649611234664917, "rewards/rejected": -0.8786771893501282, "step": 880 }, { "epoch": 0.9314495028780743, "grad_norm": 17.42009363611703, "learning_rate": 7.030787065396865e-09, "logits/chosen": 0.14433155953884125, "logits/rejected": 0.636074423789978, "logps/chosen": -144.4778289794922, "logps/rejected": -189.1996612548828, "loss": 0.6092, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.6060083508491516, "rewards/margins": 0.42767366766929626, "rewards/rejected": -1.033682107925415, "step": 890 }, { "epoch": 0.9419152276295133, "grad_norm": 25.137415279564916, "learning_rate": 5.04062020432286e-09, "logits/chosen": 0.5784530639648438, "logits/rejected": 1.0418832302093506, "logps/chosen": -118.29461669921875, "logps/rejected": -145.42251586914062, "loss": 0.595, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.4733172357082367, "rewards/margins": 0.31973880529403687, "rewards/rejected": -0.793056070804596, "step": 900 }, { "epoch": 0.9419152276295133, "eval_logits/chosen": 0.4325558543205261, "eval_logits/rejected": 0.9106192588806152, "eval_logps/chosen": -136.28819274902344, "eval_logps/rejected": -189.55056762695312, "eval_loss": 0.5958317518234253, "eval_rewards/accuracies": 0.3710317313671112, "eval_rewards/chosen": -0.6179695725440979, "eval_rewards/margins": 0.4662524461746216, "eval_rewards/rejected": -1.0842220783233643, "eval_runtime": 113.6384, "eval_samples_per_second": 17.6, "eval_steps_per_second": 0.554, "step": 900 }, { "epoch": 0.9523809523809523, "grad_norm": 48.333955356090875, "learning_rate": 3.3780648016376866e-09, "logits/chosen": -0.08735128492116928, "logits/rejected": 0.43545690178871155, "logps/chosen": -137.79881286621094, "logps/rejected": -196.9468231201172, "loss": 0.593, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.7070980072021484, "rewards/margins": 0.4561308026313782, "rewards/rejected": -1.1632287502288818, "step": 910 }, { "epoch": 0.9628466771323915, "grad_norm": 28.43671889890296, "learning_rate": 2.0453443778310766e-09, "logits/chosen": -0.048371605575084686, "logits/rejected": 0.41192588210105896, "logps/chosen": -149.82281494140625, "logps/rejected": -189.8140411376953, "loss": 0.6088, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.6467713713645935, "rewards/margins": 0.3985586166381836, "rewards/rejected": -1.0453299283981323, "step": 920 }, { "epoch": 0.9733124018838305, "grad_norm": 37.55350929589785, "learning_rate": 1.0442413283435758e-09, "logits/chosen": -0.05256899446249008, "logits/rejected": 0.6733183860778809, "logps/chosen": -138.57061767578125, "logps/rejected": -181.3246307373047, "loss": 0.5951, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.5108522772789001, "rewards/margins": 0.48942360281944275, "rewards/rejected": -1.0002758502960205, "step": 930 }, { "epoch": 0.9837781266352695, "grad_norm": 23.109646236441566, "learning_rate": 3.760945397705828e-10, "logits/chosen": 0.17422077059745789, "logits/rejected": 0.798999547958374, "logps/chosen": -132.8301239013672, "logps/rejected": -187.94265747070312, "loss": 0.5961, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.599638819694519, "rewards/margins": 0.4609258770942688, "rewards/rejected": -1.060564637184143, "step": 940 }, { "epoch": 0.9942438513867086, "grad_norm": 18.08118855716985, "learning_rate": 4.17975992204056e-11, "logits/chosen": 0.5573434829711914, "logits/rejected": 0.9020156860351562, "logps/chosen": -134.12124633789062, "logps/rejected": -163.89334106445312, "loss": 0.618, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.6402407288551331, "rewards/margins": 0.31644412875175476, "rewards/rejected": -0.9566848874092102, "step": 950 }, { "epoch": 0.9994767137624281, "step": 955, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 0.0165, "train_samples_per_second": 369998.418, "train_steps_per_second": 5810.543 } ], "logging_steps": 10, "max_steps": 96, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }