zephyr-7b-sft-full-SPIN-iter0 / trainer_state.json
ydeng9's picture
Model first version
a36a681
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 1556,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.0706638115631692e-09,
"logits/chosen": -3.0633435249328613,
"logits/rejected": -3.0370049476623535,
"logps/chosen": -237.29315185546875,
"logps/rejected": -251.69747924804688,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 1.070663811563169e-08,
"logits/chosen": -2.990461826324463,
"logits/rejected": -3.0024797916412354,
"logps/chosen": -356.6201171875,
"logps/rejected": -390.87042236328125,
"loss": 0.6911,
"rewards/accuracies": 0.5555555820465088,
"rewards/chosen": -0.004924382548779249,
"rewards/margins": 0.009135871194303036,
"rewards/rejected": -0.014060255140066147,
"step": 10
},
{
"epoch": 0.03,
"learning_rate": 2.141327623126338e-08,
"logits/chosen": -3.002528429031372,
"logits/rejected": -3.0017483234405518,
"logps/chosen": -350.7555847167969,
"logps/rejected": -393.46014404296875,
"loss": 0.6801,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.050124846398830414,
"rewards/margins": 0.028588850051164627,
"rewards/rejected": 0.02153599075973034,
"step": 20
},
{
"epoch": 0.04,
"learning_rate": 3.2119914346895076e-08,
"logits/chosen": -2.975447416305542,
"logits/rejected": -3.0126380920410156,
"logps/chosen": -375.95391845703125,
"logps/rejected": -432.83587646484375,
"loss": 0.6435,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.2032477855682373,
"rewards/margins": 0.1010356992483139,
"rewards/rejected": 0.1022120863199234,
"step": 30
},
{
"epoch": 0.05,
"learning_rate": 4.282655246252676e-08,
"logits/chosen": -3.0026869773864746,
"logits/rejected": -2.9945485591888428,
"logps/chosen": -383.3456115722656,
"logps/rejected": -392.7911376953125,
"loss": 0.5784,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.4273909628391266,
"rewards/margins": 0.30088725686073303,
"rewards/rejected": 0.12650372087955475,
"step": 40
},
{
"epoch": 0.06,
"learning_rate": 5.353319057815846e-08,
"logits/chosen": -2.989891529083252,
"logits/rejected": -2.996675968170166,
"logps/chosen": -339.07513427734375,
"logps/rejected": -373.727783203125,
"loss": 0.5345,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.6149066686630249,
"rewards/margins": 0.39920732378959656,
"rewards/rejected": 0.21569931507110596,
"step": 50
},
{
"epoch": 0.08,
"learning_rate": 6.423982869379015e-08,
"logits/chosen": -3.026094913482666,
"logits/rejected": -2.9982128143310547,
"logps/chosen": -327.8692321777344,
"logps/rejected": -375.9877624511719,
"loss": 0.4485,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.6913961172103882,
"rewards/margins": 0.7190420031547546,
"rewards/rejected": -0.02764584682881832,
"step": 60
},
{
"epoch": 0.09,
"learning_rate": 7.494646680942184e-08,
"logits/chosen": -2.974823474884033,
"logits/rejected": -2.980032444000244,
"logps/chosen": -351.2728576660156,
"logps/rejected": -395.68609619140625,
"loss": 0.3966,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.8828132748603821,
"rewards/margins": 0.9640719294548035,
"rewards/rejected": -0.08125858008861542,
"step": 70
},
{
"epoch": 0.1,
"learning_rate": 8.565310492505352e-08,
"logits/chosen": -2.977529287338257,
"logits/rejected": -2.9725558757781982,
"logps/chosen": -359.2842712402344,
"logps/rejected": -405.7890625,
"loss": 0.3519,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 1.091180682182312,
"rewards/margins": 1.2520115375518799,
"rewards/rejected": -0.16083075106143951,
"step": 80
},
{
"epoch": 0.12,
"learning_rate": 9.635974304068522e-08,
"logits/chosen": -2.979015827178955,
"logits/rejected": -2.9813497066497803,
"logps/chosen": -309.3511047363281,
"logps/rejected": -358.91607666015625,
"loss": 0.3201,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.2030521631240845,
"rewards/margins": 1.6773903369903564,
"rewards/rejected": -0.4743381440639496,
"step": 90
},
{
"epoch": 0.13,
"learning_rate": 1.0706638115631692e-07,
"logits/chosen": -2.941194534301758,
"logits/rejected": -2.9548678398132324,
"logps/chosen": -343.6178894042969,
"logps/rejected": -463.1512145996094,
"loss": 0.2696,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.2106283903121948,
"rewards/margins": 1.9713561534881592,
"rewards/rejected": -0.7607278823852539,
"step": 100
},
{
"epoch": 0.13,
"eval_logits/chosen": -2.977161169052124,
"eval_logits/rejected": -2.957442045211792,
"eval_logps/chosen": -296.8330383300781,
"eval_logps/rejected": -349.66558837890625,
"eval_loss": 0.2511790990829468,
"eval_rewards/accuracies": 0.921875,
"eval_rewards/chosen": 1.1878268718719482,
"eval_rewards/margins": 1.8798556327819824,
"eval_rewards/rejected": -0.6920287609100342,
"eval_runtime": 38.7534,
"eval_samples_per_second": 12.902,
"eval_steps_per_second": 0.413,
"step": 100
},
{
"epoch": 0.14,
"learning_rate": 1.177730192719486e-07,
"logits/chosen": -2.9442899227142334,
"logits/rejected": -2.9481866359710693,
"logps/chosen": -346.63873291015625,
"logps/rejected": -406.31964111328125,
"loss": 0.2493,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.2421057224273682,
"rewards/margins": 2.18147873878479,
"rewards/rejected": -0.9393728971481323,
"step": 110
},
{
"epoch": 0.15,
"learning_rate": 1.284796573875803e-07,
"logits/chosen": -2.94069242477417,
"logits/rejected": -2.9417574405670166,
"logps/chosen": -351.788330078125,
"logps/rejected": -379.61065673828125,
"loss": 0.2406,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.7772680521011353,
"rewards/margins": 1.8036502599716187,
"rewards/rejected": -1.0263820886611938,
"step": 120
},
{
"epoch": 0.17,
"learning_rate": 1.3918629550321198e-07,
"logits/chosen": -2.926699638366699,
"logits/rejected": -2.911668300628662,
"logps/chosen": -327.4112548828125,
"logps/rejected": -408.2745361328125,
"loss": 0.2073,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.5646601915359497,
"rewards/margins": 2.2064461708068848,
"rewards/rejected": -1.6417862176895142,
"step": 130
},
{
"epoch": 0.18,
"learning_rate": 1.4989293361884367e-07,
"logits/chosen": -2.904219150543213,
"logits/rejected": -2.921232223510742,
"logps/chosen": -311.6190185546875,
"logps/rejected": -411.2701110839844,
"loss": 0.1967,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.46902722120285034,
"rewards/margins": 2.7694640159606934,
"rewards/rejected": -2.3004367351531982,
"step": 140
},
{
"epoch": 0.19,
"learning_rate": 1.6059957173447535e-07,
"logits/chosen": -2.901981830596924,
"logits/rejected": -2.9112467765808105,
"logps/chosen": -301.6145324707031,
"logps/rejected": -391.1957092285156,
"loss": 0.1723,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.218230202794075,
"rewards/margins": 3.2492637634277344,
"rewards/rejected": -3.031033992767334,
"step": 150
},
{
"epoch": 0.21,
"learning_rate": 1.7130620985010704e-07,
"logits/chosen": -2.8996052742004395,
"logits/rejected": -2.8838694095611572,
"logps/chosen": -312.6499938964844,
"logps/rejected": -447.8002014160156,
"loss": 0.1554,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.3209637701511383,
"rewards/margins": 4.501524925231934,
"rewards/rejected": -4.180561065673828,
"step": 160
},
{
"epoch": 0.22,
"learning_rate": 1.8201284796573874e-07,
"logits/chosen": -2.8928513526916504,
"logits/rejected": -2.9001543521881104,
"logps/chosen": -329.20953369140625,
"logps/rejected": -423.6446228027344,
"loss": 0.1566,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.1461164504289627,
"rewards/margins": 4.050145626068115,
"rewards/rejected": -3.904029130935669,
"step": 170
},
{
"epoch": 0.23,
"learning_rate": 1.9271948608137044e-07,
"logits/chosen": -2.8557610511779785,
"logits/rejected": -2.855731725692749,
"logps/chosen": -338.60076904296875,
"logps/rejected": -448.8922424316406,
"loss": 0.1421,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.21355919539928436,
"rewards/margins": 4.191808223724365,
"rewards/rejected": -3.9782490730285645,
"step": 180
},
{
"epoch": 0.24,
"learning_rate": 2.0342612419700214e-07,
"logits/chosen": -2.8638434410095215,
"logits/rejected": -2.877293825149536,
"logps/chosen": -347.19573974609375,
"logps/rejected": -469.17755126953125,
"loss": 0.1381,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.14256651699543,
"rewards/margins": 3.96684193611145,
"rewards/rejected": -4.109408378601074,
"step": 190
},
{
"epoch": 0.26,
"learning_rate": 2.1413276231263384e-07,
"logits/chosen": -2.829555034637451,
"logits/rejected": -2.85453462600708,
"logps/chosen": -364.0372009277344,
"logps/rejected": -442.7489318847656,
"loss": 0.1427,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.18970072269439697,
"rewards/margins": 5.029218673706055,
"rewards/rejected": -4.839517593383789,
"step": 200
},
{
"epoch": 0.26,
"eval_logits/chosen": -2.8512933254241943,
"eval_logits/rejected": -2.8302505016326904,
"eval_logps/chosen": -305.8147888183594,
"eval_logps/rejected": -387.1728210449219,
"eval_loss": 0.12157174944877625,
"eval_rewards/accuracies": 0.96875,
"eval_rewards/chosen": 0.28965064883232117,
"eval_rewards/margins": 4.73240327835083,
"eval_rewards/rejected": -4.442752361297607,
"eval_runtime": 38.702,
"eval_samples_per_second": 12.919,
"eval_steps_per_second": 0.413,
"step": 200
},
{
"epoch": 0.27,
"learning_rate": 2.248394004282655e-07,
"logits/chosen": -2.817666530609131,
"logits/rejected": -2.8465371131896973,
"logps/chosen": -325.3854675292969,
"logps/rejected": -439.5003356933594,
"loss": 0.1413,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.21560493111610413,
"rewards/margins": 4.514598369598389,
"rewards/rejected": -4.298993110656738,
"step": 210
},
{
"epoch": 0.28,
"learning_rate": 2.355460385438972e-07,
"logits/chosen": -2.7650692462921143,
"logits/rejected": -2.7801504135131836,
"logps/chosen": -326.321533203125,
"logps/rejected": -456.98663330078125,
"loss": 0.1332,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.027444612234830856,
"rewards/margins": 5.077801704406738,
"rewards/rejected": -5.050357818603516,
"step": 220
},
{
"epoch": 0.3,
"learning_rate": 2.462526766595289e-07,
"logits/chosen": -2.788020610809326,
"logits/rejected": -2.7895946502685547,
"logps/chosen": -324.4822998046875,
"logps/rejected": -439.76397705078125,
"loss": 0.1356,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.29427874088287354,
"rewards/margins": 5.166212558746338,
"rewards/rejected": -4.871933460235596,
"step": 230
},
{
"epoch": 0.31,
"learning_rate": 2.569593147751606e-07,
"logits/chosen": -2.6995949745178223,
"logits/rejected": -2.7345399856567383,
"logps/chosen": -356.4814758300781,
"logps/rejected": -490.60931396484375,
"loss": 0.1074,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.35535210371017456,
"rewards/margins": 6.1955766677856445,
"rewards/rejected": -6.550928592681885,
"step": 240
},
{
"epoch": 0.32,
"learning_rate": 2.676659528907923e-07,
"logits/chosen": -2.6892549991607666,
"logits/rejected": -2.694087505340576,
"logps/chosen": -305.6263122558594,
"logps/rejected": -387.88543701171875,
"loss": 0.0979,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.6652821898460388,
"rewards/margins": 4.929129600524902,
"rewards/rejected": -5.5944108963012695,
"step": 250
},
{
"epoch": 0.33,
"learning_rate": 2.7837259100642395e-07,
"logits/chosen": -2.73167085647583,
"logits/rejected": -2.7620654106140137,
"logps/chosen": -408.2175598144531,
"logps/rejected": -449.8201599121094,
"loss": 0.1298,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.9623678922653198,
"rewards/margins": 5.48039436340332,
"rewards/rejected": -6.4427618980407715,
"step": 260
},
{
"epoch": 0.35,
"learning_rate": 2.890792291220557e-07,
"logits/chosen": -2.7657806873321533,
"logits/rejected": -2.802060604095459,
"logps/chosen": -384.2090148925781,
"logps/rejected": -481.82696533203125,
"loss": 0.1181,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.16663847863674164,
"rewards/margins": 5.502591133117676,
"rewards/rejected": -5.335952281951904,
"step": 270
},
{
"epoch": 0.36,
"learning_rate": 2.9978586723768735e-07,
"logits/chosen": -2.673283815383911,
"logits/rejected": -2.707296848297119,
"logps/chosen": -312.5271911621094,
"logps/rejected": -411.64031982421875,
"loss": 0.0947,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.9499552845954895,
"rewards/margins": 4.852605819702148,
"rewards/rejected": -5.802561283111572,
"step": 280
},
{
"epoch": 0.37,
"learning_rate": 3.1049250535331905e-07,
"logits/chosen": -2.623725175857544,
"logits/rejected": -2.7073614597320557,
"logps/chosen": -391.2462158203125,
"logps/rejected": -474.2684631347656,
"loss": 0.1168,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.0791637897491455,
"rewards/margins": 7.065374851226807,
"rewards/rejected": -8.144537925720215,
"step": 290
},
{
"epoch": 0.39,
"learning_rate": 3.211991434689507e-07,
"logits/chosen": -2.6202073097229004,
"logits/rejected": -2.652608633041382,
"logps/chosen": -341.9140319824219,
"logps/rejected": -462.9012145996094,
"loss": 0.0944,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.275757372379303,
"rewards/margins": 5.93372106552124,
"rewards/rejected": -6.20947790145874,
"step": 300
},
{
"epoch": 0.39,
"eval_logits/chosen": -2.6932637691497803,
"eval_logits/rejected": -2.6872053146362305,
"eval_logps/chosen": -311.619873046875,
"eval_logps/rejected": -409.2980041503906,
"eval_loss": 0.11095032095909119,
"eval_rewards/accuracies": 0.90625,
"eval_rewards/chosen": -0.29085665941238403,
"eval_rewards/margins": 6.364411354064941,
"eval_rewards/rejected": -6.65526819229126,
"eval_runtime": 38.7504,
"eval_samples_per_second": 12.903,
"eval_steps_per_second": 0.413,
"step": 300
},
{
"epoch": 0.4,
"learning_rate": 3.3190578158458244e-07,
"logits/chosen": -2.6386542320251465,
"logits/rejected": -2.7159385681152344,
"logps/chosen": -368.5979919433594,
"logps/rejected": -466.84783935546875,
"loss": 0.131,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.8093490600585938,
"rewards/margins": 7.050684452056885,
"rewards/rejected": -7.8600335121154785,
"step": 310
},
{
"epoch": 0.41,
"learning_rate": 3.426124197002141e-07,
"logits/chosen": -2.571882486343384,
"logits/rejected": -2.6551308631896973,
"logps/chosen": -361.48394775390625,
"logps/rejected": -489.70989990234375,
"loss": 0.0905,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.7510203123092651,
"rewards/margins": 8.015697479248047,
"rewards/rejected": -8.766717910766602,
"step": 320
},
{
"epoch": 0.42,
"learning_rate": 3.533190578158458e-07,
"logits/chosen": -2.5930895805358887,
"logits/rejected": -2.6723227500915527,
"logps/chosen": -384.87664794921875,
"logps/rejected": -509.010986328125,
"loss": 0.1232,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.44466814398765564,
"rewards/margins": 6.863368988037109,
"rewards/rejected": -7.308036804199219,
"step": 330
},
{
"epoch": 0.44,
"learning_rate": 3.640256959314775e-07,
"logits/chosen": -2.5658717155456543,
"logits/rejected": -2.62716007232666,
"logps/chosen": -304.2865295410156,
"logps/rejected": -435.2959899902344,
"loss": 0.0874,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.5686666369438171,
"rewards/margins": 6.669247627258301,
"rewards/rejected": -7.237914085388184,
"step": 340
},
{
"epoch": 0.45,
"learning_rate": 3.747323340471092e-07,
"logits/chosen": -2.584165096282959,
"logits/rejected": -2.70393967628479,
"logps/chosen": -364.13262939453125,
"logps/rejected": -477.5604553222656,
"loss": 0.1015,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.5823951363563538,
"rewards/margins": 7.484101295471191,
"rewards/rejected": -8.066494941711426,
"step": 350
},
{
"epoch": 0.46,
"learning_rate": 3.854389721627409e-07,
"logits/chosen": -2.5895907878875732,
"logits/rejected": -2.646876573562622,
"logps/chosen": -355.0018005371094,
"logps/rejected": -442.65948486328125,
"loss": 0.0896,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.8058759570121765,
"rewards/margins": 8.065896987915039,
"rewards/rejected": -8.871771812438965,
"step": 360
},
{
"epoch": 0.48,
"learning_rate": 3.961456102783726e-07,
"logits/chosen": -2.615499973297119,
"logits/rejected": -2.6612184047698975,
"logps/chosen": -308.342041015625,
"logps/rejected": -432.08319091796875,
"loss": 0.0821,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.5296161770820618,
"rewards/margins": 7.243483066558838,
"rewards/rejected": -7.773098945617676,
"step": 370
},
{
"epoch": 0.49,
"learning_rate": 4.068522483940043e-07,
"logits/chosen": -2.6956448554992676,
"logits/rejected": -2.7061805725097656,
"logps/chosen": -346.4541931152344,
"logps/rejected": -481.19989013671875,
"loss": 0.1104,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.6448992490768433,
"rewards/margins": 7.711002349853516,
"rewards/rejected": -8.355902671813965,
"step": 380
},
{
"epoch": 0.5,
"learning_rate": 4.175588865096359e-07,
"logits/chosen": -2.6077234745025635,
"logits/rejected": -2.6278557777404785,
"logps/chosen": -353.8262634277344,
"logps/rejected": -447.3440856933594,
"loss": 0.0958,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.5978514552116394,
"rewards/margins": 7.370479583740234,
"rewards/rejected": -7.968331336975098,
"step": 390
},
{
"epoch": 0.51,
"learning_rate": 4.282655246252677e-07,
"logits/chosen": -2.603065252304077,
"logits/rejected": -2.675497531890869,
"logps/chosen": -355.2611999511719,
"logps/rejected": -411.75732421875,
"loss": 0.1039,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.2750840187072754,
"rewards/margins": 7.0222907066345215,
"rewards/rejected": -7.2973737716674805,
"step": 400
},
{
"epoch": 0.51,
"eval_logits/chosen": -2.6301259994506836,
"eval_logits/rejected": -2.6286230087280273,
"eval_logps/chosen": -315.64288330078125,
"eval_logps/rejected": -421.1318359375,
"eval_loss": 0.07803654670715332,
"eval_rewards/accuracies": 0.984375,
"eval_rewards/chosen": -0.6931607723236084,
"eval_rewards/margins": 7.145491600036621,
"eval_rewards/rejected": -7.83865213394165,
"eval_runtime": 38.7861,
"eval_samples_per_second": 12.891,
"eval_steps_per_second": 0.413,
"step": 400
},
{
"epoch": 0.53,
"learning_rate": 4.389721627408993e-07,
"logits/chosen": -2.5576305389404297,
"logits/rejected": -2.602813243865967,
"logps/chosen": -361.10797119140625,
"logps/rejected": -468.213134765625,
"loss": 0.1042,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.9836179614067078,
"rewards/margins": 6.6080522537231445,
"rewards/rejected": -7.591670989990234,
"step": 410
},
{
"epoch": 0.54,
"learning_rate": 4.49678800856531e-07,
"logits/chosen": -2.521080732345581,
"logits/rejected": -2.5644307136535645,
"logps/chosen": -325.7511901855469,
"logps/rejected": -407.7994384765625,
"loss": 0.1057,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.45771685242652893,
"rewards/margins": 7.0977678298950195,
"rewards/rejected": -7.555483818054199,
"step": 420
},
{
"epoch": 0.55,
"learning_rate": 4.603854389721627e-07,
"logits/chosen": -2.5245959758758545,
"logits/rejected": -2.559770107269287,
"logps/chosen": -340.15087890625,
"logps/rejected": -485.052490234375,
"loss": 0.084,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.994246780872345,
"rewards/margins": 7.357940673828125,
"rewards/rejected": -8.35218620300293,
"step": 430
},
{
"epoch": 0.57,
"learning_rate": 4.710920770877944e-07,
"logits/chosen": -2.401303768157959,
"logits/rejected": -2.548125743865967,
"logps/chosen": -358.9648742675781,
"logps/rejected": -462.87890625,
"loss": 0.1172,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.293526530265808,
"rewards/margins": 7.095101833343506,
"rewards/rejected": -8.388628005981445,
"step": 440
},
{
"epoch": 0.58,
"learning_rate": 4.817987152034261e-07,
"logits/chosen": -2.4654183387756348,
"logits/rejected": -2.560048818588257,
"logps/chosen": -291.2701721191406,
"logps/rejected": -362.7830505371094,
"loss": 0.0959,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.4040035009384155,
"rewards/margins": 5.726696968078613,
"rewards/rejected": -7.130700588226318,
"step": 450
},
{
"epoch": 0.59,
"learning_rate": 4.925053533190578e-07,
"logits/chosen": -2.489262104034424,
"logits/rejected": -2.5457305908203125,
"logps/chosen": -356.9480285644531,
"logps/rejected": -435.594970703125,
"loss": 0.1132,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.3584586381912231,
"rewards/margins": 6.3141889572143555,
"rewards/rejected": -7.672647953033447,
"step": 460
},
{
"epoch": 0.6,
"learning_rate": 4.996429421566293e-07,
"logits/chosen": -2.5229034423828125,
"logits/rejected": -2.565725326538086,
"logps/chosen": -326.0317077636719,
"logps/rejected": -448.7723083496094,
"loss": 0.1051,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.5931789875030518,
"rewards/margins": 7.049294471740723,
"rewards/rejected": -8.642473220825195,
"step": 470
},
{
"epoch": 0.62,
"learning_rate": 4.98452749345394e-07,
"logits/chosen": -2.5022709369659424,
"logits/rejected": -2.555453062057495,
"logps/chosen": -361.46563720703125,
"logps/rejected": -498.7660217285156,
"loss": 0.1386,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.8986074924468994,
"rewards/margins": 6.340726375579834,
"rewards/rejected": -8.239333152770996,
"step": 480
},
{
"epoch": 0.63,
"learning_rate": 4.972625565341585e-07,
"logits/chosen": -2.4549243450164795,
"logits/rejected": -2.5045337677001953,
"logps/chosen": -320.4005432128906,
"logps/rejected": -437.33612060546875,
"loss": 0.0958,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.8746875524520874,
"rewards/margins": 6.6805620193481445,
"rewards/rejected": -8.555249214172363,
"step": 490
},
{
"epoch": 0.64,
"learning_rate": 4.960723637229232e-07,
"logits/chosen": -2.448908567428589,
"logits/rejected": -2.458101272583008,
"logps/chosen": -355.0153503417969,
"logps/rejected": -504.32330322265625,
"loss": 0.0762,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.4554470777511597,
"rewards/margins": 8.177068710327148,
"rewards/rejected": -9.632516860961914,
"step": 500
},
{
"epoch": 0.64,
"eval_logits/chosen": -2.504735231399536,
"eval_logits/rejected": -2.5092720985412598,
"eval_logps/chosen": -323.16851806640625,
"eval_logps/rejected": -433.9158020019531,
"eval_loss": 0.08059512078762054,
"eval_rewards/accuracies": 0.953125,
"eval_rewards/chosen": -1.4457205533981323,
"eval_rewards/margins": 7.671328544616699,
"eval_rewards/rejected": -9.117048263549805,
"eval_runtime": 38.7512,
"eval_samples_per_second": 12.903,
"eval_steps_per_second": 0.413,
"step": 500
},
{
"epoch": 0.66,
"learning_rate": 4.948821709116876e-07,
"logits/chosen": -2.376183032989502,
"logits/rejected": -2.455298900604248,
"logps/chosen": -485.12603759765625,
"logps/rejected": -551.7554931640625,
"loss": 0.1056,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.5901005268096924,
"rewards/margins": 8.771623611450195,
"rewards/rejected": -10.361722946166992,
"step": 510
},
{
"epoch": 0.67,
"learning_rate": 4.936919781004522e-07,
"logits/chosen": -2.470151424407959,
"logits/rejected": -2.5587172508239746,
"logps/chosen": -377.3062438964844,
"logps/rejected": -507.6141052246094,
"loss": 0.0955,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.233569860458374,
"rewards/margins": 8.123286247253418,
"rewards/rejected": -9.356857299804688,
"step": 520
},
{
"epoch": 0.68,
"learning_rate": 4.925017852892168e-07,
"logits/chosen": -2.5230183601379395,
"logits/rejected": -2.603940725326538,
"logps/chosen": -362.92333984375,
"logps/rejected": -481.7613220214844,
"loss": 0.0683,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -1.9068357944488525,
"rewards/margins": 7.921334743499756,
"rewards/rejected": -9.828168869018555,
"step": 530
},
{
"epoch": 0.69,
"learning_rate": 4.913115924779814e-07,
"logits/chosen": -2.438596248626709,
"logits/rejected": -2.562830924987793,
"logps/chosen": -386.5306701660156,
"logps/rejected": -499.86444091796875,
"loss": 0.0677,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2040196657180786,
"rewards/margins": 8.880427360534668,
"rewards/rejected": -10.084446907043457,
"step": 540
},
{
"epoch": 0.71,
"learning_rate": 4.90121399666746e-07,
"logits/chosen": -2.4589312076568604,
"logits/rejected": -2.524345874786377,
"logps/chosen": -332.1251220703125,
"logps/rejected": -433.63787841796875,
"loss": 0.1309,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.1124681234359741,
"rewards/margins": 7.2715253829956055,
"rewards/rejected": -8.383993148803711,
"step": 550
},
{
"epoch": 0.72,
"learning_rate": 4.889312068555106e-07,
"logits/chosen": -2.58622407913208,
"logits/rejected": -2.60271954536438,
"logps/chosen": -271.59014892578125,
"logps/rejected": -417.29833984375,
"loss": 0.1275,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.1244533061981201,
"rewards/margins": 6.384497165679932,
"rewards/rejected": -7.508950710296631,
"step": 560
},
{
"epoch": 0.73,
"learning_rate": 4.877410140442752e-07,
"logits/chosen": -2.4364261627197266,
"logits/rejected": -2.4858317375183105,
"logps/chosen": -350.3711853027344,
"logps/rejected": -449.4051818847656,
"loss": 0.0982,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.0685746669769287,
"rewards/margins": 7.424908638000488,
"rewards/rejected": -8.49348258972168,
"step": 570
},
{
"epoch": 0.75,
"learning_rate": 4.865508212330398e-07,
"logits/chosen": -2.441240072250366,
"logits/rejected": -2.527020215988159,
"logps/chosen": -366.98150634765625,
"logps/rejected": -525.4156494140625,
"loss": 0.0867,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9126319885253906,
"rewards/margins": 9.27831745147705,
"rewards/rejected": -10.190949440002441,
"step": 580
},
{
"epoch": 0.76,
"learning_rate": 4.853606284218044e-07,
"logits/chosen": -2.3090662956237793,
"logits/rejected": -2.3255538940429688,
"logps/chosen": -371.3923034667969,
"logps/rejected": -526.1776123046875,
"loss": 0.1095,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.190389633178711,
"rewards/margins": 9.64104175567627,
"rewards/rejected": -11.831432342529297,
"step": 590
},
{
"epoch": 0.77,
"learning_rate": 4.841704356105689e-07,
"logits/chosen": -2.334197521209717,
"logits/rejected": -2.423285484313965,
"logps/chosen": -369.0033264160156,
"logps/rejected": -506.4518127441406,
"loss": 0.0959,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.7217298746109009,
"rewards/margins": 8.07056999206543,
"rewards/rejected": -9.7923002243042,
"step": 600
},
{
"epoch": 0.77,
"eval_logits/chosen": -2.467820405960083,
"eval_logits/rejected": -2.440288782119751,
"eval_logps/chosen": -318.6737060546875,
"eval_logps/rejected": -428.9326171875,
"eval_loss": 0.07413332909345627,
"eval_rewards/accuracies": 0.984375,
"eval_rewards/chosen": -0.9962404370307922,
"eval_rewards/margins": 7.622487545013428,
"eval_rewards/rejected": -8.618727684020996,
"eval_runtime": 38.7439,
"eval_samples_per_second": 12.905,
"eval_steps_per_second": 0.413,
"step": 600
},
{
"epoch": 0.78,
"learning_rate": 4.829802427993334e-07,
"logits/chosen": -2.3268227577209473,
"logits/rejected": -2.3746628761291504,
"logps/chosen": -404.0111083984375,
"logps/rejected": -492.5167541503906,
"loss": 0.0859,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.8345616459846497,
"rewards/margins": 8.07560920715332,
"rewards/rejected": -8.910171508789062,
"step": 610
},
{
"epoch": 0.8,
"learning_rate": 4.81790049988098e-07,
"logits/chosen": -2.415301561355591,
"logits/rejected": -2.4919333457946777,
"logps/chosen": -388.5622253417969,
"logps/rejected": -531.6051025390625,
"loss": 0.0631,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.3940558433532715,
"rewards/margins": 7.842892646789551,
"rewards/rejected": -9.23694896697998,
"step": 620
},
{
"epoch": 0.81,
"learning_rate": 4.805998571768626e-07,
"logits/chosen": -2.310925245285034,
"logits/rejected": -2.42446231842041,
"logps/chosen": -342.0956115722656,
"logps/rejected": -516.9351196289062,
"loss": 0.1142,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.3800750970840454,
"rewards/margins": 8.762998580932617,
"rewards/rejected": -10.143075942993164,
"step": 630
},
{
"epoch": 0.82,
"learning_rate": 4.794096643656272e-07,
"logits/chosen": -2.280027151107788,
"logits/rejected": -2.31703782081604,
"logps/chosen": -409.70379638671875,
"logps/rejected": -529.5406494140625,
"loss": 0.0723,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.3363559246063232,
"rewards/margins": 10.320574760437012,
"rewards/rejected": -11.656930923461914,
"step": 640
},
{
"epoch": 0.84,
"learning_rate": 4.782194715543918e-07,
"logits/chosen": -2.276779890060425,
"logits/rejected": -2.343441963195801,
"logps/chosen": -348.50531005859375,
"logps/rejected": -521.2000122070312,
"loss": 0.0902,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -1.5391457080841064,
"rewards/margins": 9.673690795898438,
"rewards/rejected": -11.212836265563965,
"step": 650
},
{
"epoch": 0.85,
"learning_rate": 4.770292787431564e-07,
"logits/chosen": -2.3436553478240967,
"logits/rejected": -2.3175175189971924,
"logps/chosen": -386.4251403808594,
"logps/rejected": -530.1958618164062,
"loss": 0.0787,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -2.0843443870544434,
"rewards/margins": 9.766562461853027,
"rewards/rejected": -11.850906372070312,
"step": 660
},
{
"epoch": 0.86,
"learning_rate": 4.7583908593192097e-07,
"logits/chosen": -2.2515616416931152,
"logits/rejected": -2.2762718200683594,
"logps/chosen": -396.88751220703125,
"logps/rejected": -541.3609619140625,
"loss": 0.0841,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -2.3059911727905273,
"rewards/margins": 9.442736625671387,
"rewards/rejected": -11.748727798461914,
"step": 670
},
{
"epoch": 0.87,
"learning_rate": 4.746488931206855e-07,
"logits/chosen": -2.304055690765381,
"logits/rejected": -2.3429813385009766,
"logps/chosen": -353.8645935058594,
"logps/rejected": -520.8157348632812,
"loss": 0.0793,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -2.6302597522735596,
"rewards/margins": 10.208868980407715,
"rewards/rejected": -12.839129447937012,
"step": 680
},
{
"epoch": 0.89,
"learning_rate": 4.734587003094501e-07,
"logits/chosen": -2.326953887939453,
"logits/rejected": -2.4166040420532227,
"logps/chosen": -377.34356689453125,
"logps/rejected": -494.58782958984375,
"loss": 0.1041,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.1931746006011963,
"rewards/margins": 9.344148635864258,
"rewards/rejected": -11.537323951721191,
"step": 690
},
{
"epoch": 0.9,
"learning_rate": 4.722685074982147e-07,
"logits/chosen": -2.3279807567596436,
"logits/rejected": -2.38569974899292,
"logps/chosen": -320.0870056152344,
"logps/rejected": -498.17706298828125,
"loss": 0.0814,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5494163036346436,
"rewards/margins": 10.105340957641602,
"rewards/rejected": -11.654756546020508,
"step": 700
},
{
"epoch": 0.9,
"eval_logits/chosen": -2.498293399810791,
"eval_logits/rejected": -2.4712274074554443,
"eval_logps/chosen": -323.183837890625,
"eval_logps/rejected": -441.4797058105469,
"eval_loss": 0.055789634585380554,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -1.4472523927688599,
"eval_rewards/margins": 8.426188468933105,
"eval_rewards/rejected": -9.87343978881836,
"eval_runtime": 38.7758,
"eval_samples_per_second": 12.895,
"eval_steps_per_second": 0.413,
"step": 700
},
{
"epoch": 0.91,
"learning_rate": 4.710783146869793e-07,
"logits/chosen": -2.3991808891296387,
"logits/rejected": -2.4218363761901855,
"logps/chosen": -314.1746520996094,
"logps/rejected": -519.7462768554688,
"loss": 0.0819,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.8268877267837524,
"rewards/margins": 10.331625938415527,
"rewards/rejected": -12.158514022827148,
"step": 710
},
{
"epoch": 0.93,
"learning_rate": 4.698881218757438e-07,
"logits/chosen": -2.363438606262207,
"logits/rejected": -2.3997836112976074,
"logps/chosen": -305.2399597167969,
"logps/rejected": -481.65582275390625,
"loss": 0.0786,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.5438249111175537,
"rewards/margins": 8.62690544128418,
"rewards/rejected": -10.17072868347168,
"step": 720
},
{
"epoch": 0.94,
"learning_rate": 4.6869792906450845e-07,
"logits/chosen": -2.3670878410339355,
"logits/rejected": -2.4363322257995605,
"logps/chosen": -342.06622314453125,
"logps/rejected": -468.9805603027344,
"loss": 0.0719,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.5114291906356812,
"rewards/margins": 8.608851432800293,
"rewards/rejected": -10.120282173156738,
"step": 730
},
{
"epoch": 0.95,
"learning_rate": 4.67507736253273e-07,
"logits/chosen": -2.2785589694976807,
"logits/rejected": -2.3089492321014404,
"logps/chosen": -407.75048828125,
"logps/rejected": -557.4127197265625,
"loss": 0.0903,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.8981235027313232,
"rewards/margins": 10.704629898071289,
"rewards/rejected": -12.602753639221191,
"step": 740
},
{
"epoch": 0.96,
"learning_rate": 4.6631754344203763e-07,
"logits/chosen": -2.3073747158050537,
"logits/rejected": -2.383291244506836,
"logps/chosen": -357.61492919921875,
"logps/rejected": -522.1990356445312,
"loss": 0.1043,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.5501503944396973,
"rewards/margins": 8.703204154968262,
"rewards/rejected": -11.253355026245117,
"step": 750
},
{
"epoch": 0.98,
"learning_rate": 4.6512735063080217e-07,
"logits/chosen": -2.492027521133423,
"logits/rejected": -2.534536361694336,
"logps/chosen": -430.7220764160156,
"logps/rejected": -559.482666015625,
"loss": 0.0971,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.385508418083191,
"rewards/margins": 9.584807395935059,
"rewards/rejected": -10.970315933227539,
"step": 760
},
{
"epoch": 0.99,
"learning_rate": 4.6393715781956676e-07,
"logits/chosen": -2.3780312538146973,
"logits/rejected": -2.37473201751709,
"logps/chosen": -326.2506103515625,
"logps/rejected": -496.7969665527344,
"loss": 0.0865,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.0338951349258423,
"rewards/margins": 9.33600902557373,
"rewards/rejected": -10.369903564453125,
"step": 770
},
{
"epoch": 1.0,
"learning_rate": 4.6274696500833135e-07,
"logits/chosen": -2.4264612197875977,
"logits/rejected": -2.45288348197937,
"logps/chosen": -368.6007385253906,
"logps/rejected": -534.6527709960938,
"loss": 0.0645,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.9812146425247192,
"rewards/margins": 9.224861145019531,
"rewards/rejected": -10.206075668334961,
"step": 780
},
{
"epoch": 1.02,
"learning_rate": 4.6155677219709594e-07,
"logits/chosen": -2.383737087249756,
"logits/rejected": -2.4557416439056396,
"logps/chosen": -401.9710388183594,
"logps/rejected": -555.4797973632812,
"loss": 0.0216,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5994548797607422,
"rewards/margins": 12.170892715454102,
"rewards/rejected": -13.770347595214844,
"step": 790
},
{
"epoch": 1.03,
"learning_rate": 4.603665793858605e-07,
"logits/chosen": -2.4060428142547607,
"logits/rejected": -2.4426844120025635,
"logps/chosen": -366.8950500488281,
"logps/rejected": -558.5940551757812,
"loss": 0.0164,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.232177972793579,
"rewards/margins": 12.297248840332031,
"rewards/rejected": -13.529426574707031,
"step": 800
},
{
"epoch": 1.03,
"eval_logits/chosen": -2.492385149002075,
"eval_logits/rejected": -2.468630313873291,
"eval_logps/chosen": -324.3902587890625,
"eval_logps/rejected": -453.6976623535156,
"eval_loss": 0.06341304630041122,
"eval_rewards/accuracies": 0.984375,
"eval_rewards/chosen": -1.5678963661193848,
"eval_rewards/margins": 9.527338981628418,
"eval_rewards/rejected": -11.095235824584961,
"eval_runtime": 38.5408,
"eval_samples_per_second": 12.973,
"eval_steps_per_second": 0.415,
"step": 800
},
{
"epoch": 1.04,
"learning_rate": 4.5917638657462507e-07,
"logits/chosen": -2.33616042137146,
"logits/rejected": -2.3640098571777344,
"logps/chosen": -373.46905517578125,
"logps/rejected": -514.2394409179688,
"loss": 0.0209,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4009530544281006,
"rewards/margins": 10.871899604797363,
"rewards/rejected": -12.272851943969727,
"step": 810
},
{
"epoch": 1.05,
"learning_rate": 4.5798619376338966e-07,
"logits/chosen": -2.4044508934020996,
"logits/rejected": -2.420480966567993,
"logps/chosen": -347.3623962402344,
"logps/rejected": -556.5758056640625,
"loss": 0.0227,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.055593490600586,
"rewards/margins": 11.897196769714355,
"rewards/rejected": -12.952789306640625,
"step": 820
},
{
"epoch": 1.07,
"learning_rate": 4.567960009521542e-07,
"logits/chosen": -2.359771490097046,
"logits/rejected": -2.4249939918518066,
"logps/chosen": -370.0980529785156,
"logps/rejected": -567.7897338867188,
"loss": 0.0131,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.0004942417144775,
"rewards/margins": 12.14315414428711,
"rewards/rejected": -14.143648147583008,
"step": 830
},
{
"epoch": 1.08,
"learning_rate": 4.5560580814091884e-07,
"logits/chosen": -2.3424394130706787,
"logits/rejected": -2.342963457107544,
"logps/chosen": -385.192626953125,
"logps/rejected": -510.11749267578125,
"loss": 0.0098,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.1284375190734863,
"rewards/margins": 11.841325759887695,
"rewards/rejected": -13.969762802124023,
"step": 840
},
{
"epoch": 1.09,
"learning_rate": 4.5441561532968337e-07,
"logits/chosen": -2.3772830963134766,
"logits/rejected": -2.414663791656494,
"logps/chosen": -375.8727722167969,
"logps/rejected": -580.7897338867188,
"loss": 0.0093,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.0805163383483887,
"rewards/margins": 12.892430305480957,
"rewards/rejected": -15.972944259643555,
"step": 850
},
{
"epoch": 1.11,
"learning_rate": 4.5322542251844796e-07,
"logits/chosen": -2.3776564598083496,
"logits/rejected": -2.409484386444092,
"logps/chosen": -331.92431640625,
"logps/rejected": -500.89739990234375,
"loss": 0.0143,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.3345754146575928,
"rewards/margins": 11.422739028930664,
"rewards/rejected": -13.757314682006836,
"step": 860
},
{
"epoch": 1.12,
"learning_rate": 4.5203522970721255e-07,
"logits/chosen": -2.3700737953186035,
"logits/rejected": -2.397162914276123,
"logps/chosen": -340.53094482421875,
"logps/rejected": -506.8477478027344,
"loss": 0.0146,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.4118890166282654,
"rewards/margins": 12.948440551757812,
"rewards/rejected": -13.360328674316406,
"step": 870
},
{
"epoch": 1.13,
"learning_rate": 4.5084503689597714e-07,
"logits/chosen": -2.41035795211792,
"logits/rejected": -2.4271979331970215,
"logps/chosen": -329.87933349609375,
"logps/rejected": -537.0123291015625,
"loss": 0.0135,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4996788501739502,
"rewards/margins": 11.888396263122559,
"rewards/rejected": -13.388073921203613,
"step": 880
},
{
"epoch": 1.14,
"learning_rate": 4.496548440847417e-07,
"logits/chosen": -2.401721477508545,
"logits/rejected": -2.447669506072998,
"logps/chosen": -366.2709045410156,
"logps/rejected": -519.80224609375,
"loss": 0.0139,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3627954721450806,
"rewards/margins": 12.356982231140137,
"rewards/rejected": -13.71977710723877,
"step": 890
},
{
"epoch": 1.16,
"learning_rate": 4.484646512735063e-07,
"logits/chosen": -2.4436986446380615,
"logits/rejected": -2.5449397563934326,
"logps/chosen": -384.5765686035156,
"logps/rejected": -555.2340087890625,
"loss": 0.0172,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.7870714068412781,
"rewards/margins": 11.903576850891113,
"rewards/rejected": -12.690648078918457,
"step": 900
},
{
"epoch": 1.16,
"eval_logits/chosen": -2.5417840480804443,
"eval_logits/rejected": -2.5121681690216064,
"eval_logps/chosen": -326.2882080078125,
"eval_logps/rejected": -464.37054443359375,
"eval_loss": 0.06124735251069069,
"eval_rewards/accuracies": 0.984375,
"eval_rewards/chosen": -1.7576879262924194,
"eval_rewards/margins": 10.404834747314453,
"eval_rewards/rejected": -12.162521362304688,
"eval_runtime": 38.6563,
"eval_samples_per_second": 12.934,
"eval_steps_per_second": 0.414,
"step": 900
},
{
"epoch": 1.17,
"learning_rate": 4.4727445846227086e-07,
"logits/chosen": -2.438345432281494,
"logits/rejected": -2.4737024307250977,
"logps/chosen": -369.38397216796875,
"logps/rejected": -519.6220703125,
"loss": 0.011,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.9280792474746704,
"rewards/margins": 12.675816535949707,
"rewards/rejected": -14.60389518737793,
"step": 910
},
{
"epoch": 1.18,
"learning_rate": 4.4608426565103545e-07,
"logits/chosen": -2.450275182723999,
"logits/rejected": -2.462500810623169,
"logps/chosen": -343.4928283691406,
"logps/rejected": -515.9462280273438,
"loss": 0.0221,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.3710033893585205,
"rewards/margins": 13.644805908203125,
"rewards/rejected": -16.015810012817383,
"step": 920
},
{
"epoch": 1.2,
"learning_rate": 4.4489407283980004e-07,
"logits/chosen": -2.423760414123535,
"logits/rejected": -2.385545253753662,
"logps/chosen": -370.15985107421875,
"logps/rejected": -515.8549194335938,
"loss": 0.0097,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.4730286598205566,
"rewards/margins": 12.967801094055176,
"rewards/rejected": -15.440831184387207,
"step": 930
},
{
"epoch": 1.21,
"learning_rate": 4.437038800285646e-07,
"logits/chosen": -2.399423360824585,
"logits/rejected": -2.418363094329834,
"logps/chosen": -384.27984619140625,
"logps/rejected": -549.5245971679688,
"loss": 0.0156,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -3.72330904006958,
"rewards/margins": 12.818862915039062,
"rewards/rejected": -16.542171478271484,
"step": 940
},
{
"epoch": 1.22,
"learning_rate": 4.4251368721732916e-07,
"logits/chosen": -2.5278353691101074,
"logits/rejected": -2.5364837646484375,
"logps/chosen": -329.5386657714844,
"logps/rejected": -519.6696166992188,
"loss": 0.0223,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4351348876953125,
"rewards/margins": 11.446606636047363,
"rewards/rejected": -12.881741523742676,
"step": 950
},
{
"epoch": 1.23,
"learning_rate": 4.413234944060938e-07,
"logits/chosen": -2.527299165725708,
"logits/rejected": -2.5759024620056152,
"logps/chosen": -403.71063232421875,
"logps/rejected": -589.4862670898438,
"loss": 0.0147,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.19414202868938446,
"rewards/margins": 12.035063743591309,
"rewards/rejected": -12.229207038879395,
"step": 960
},
{
"epoch": 1.25,
"learning_rate": 4.4013330159485834e-07,
"logits/chosen": -2.4672398567199707,
"logits/rejected": -2.4999210834503174,
"logps/chosen": -334.6300048828125,
"logps/rejected": -534.4932250976562,
"loss": 0.0255,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.7743580341339111,
"rewards/margins": 12.416712760925293,
"rewards/rejected": -14.191072463989258,
"step": 970
},
{
"epoch": 1.26,
"learning_rate": 4.3894310878362293e-07,
"logits/chosen": -2.447817087173462,
"logits/rejected": -2.5005249977111816,
"logps/chosen": -338.5157470703125,
"logps/rejected": -544.09423828125,
"loss": 0.0229,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.8672630786895752,
"rewards/margins": 12.040175437927246,
"rewards/rejected": -13.907438278198242,
"step": 980
},
{
"epoch": 1.27,
"learning_rate": 4.377529159723875e-07,
"logits/chosen": -2.4685416221618652,
"logits/rejected": -2.49491548538208,
"logps/chosen": -366.1611022949219,
"logps/rejected": -518.9093627929688,
"loss": 0.0079,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9218127727508545,
"rewards/margins": 11.573265075683594,
"rewards/rejected": -13.495076179504395,
"step": 990
},
{
"epoch": 1.29,
"learning_rate": 4.365627231611521e-07,
"logits/chosen": -2.470853805541992,
"logits/rejected": -2.497331380844116,
"logps/chosen": -405.1899719238281,
"logps/rejected": -591.7445068359375,
"loss": 0.0057,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9404414892196655,
"rewards/margins": 13.470489501953125,
"rewards/rejected": -15.410931587219238,
"step": 1000
},
{
"epoch": 1.29,
"eval_logits/chosen": -2.5345709323883057,
"eval_logits/rejected": -2.507004737854004,
"eval_logps/chosen": -336.10919189453125,
"eval_logps/rejected": -476.1966552734375,
"eval_loss": 0.0556936077773571,
"eval_rewards/accuracies": 0.984375,
"eval_rewards/chosen": -2.7397918701171875,
"eval_rewards/margins": 10.605344772338867,
"eval_rewards/rejected": -13.345136642456055,
"eval_runtime": 38.7118,
"eval_samples_per_second": 12.916,
"eval_steps_per_second": 0.413,
"step": 1000
},
{
"epoch": 1.3,
"learning_rate": 4.3537253034991665e-07,
"logits/chosen": -2.441990852355957,
"logits/rejected": -2.4507715702056885,
"logps/chosen": -329.62542724609375,
"logps/rejected": -574.9547729492188,
"loss": 0.0214,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -3.120880603790283,
"rewards/margins": 13.88032054901123,
"rewards/rejected": -17.001201629638672,
"step": 1010
},
{
"epoch": 1.31,
"learning_rate": 4.3418233753868124e-07,
"logits/chosen": -2.3679394721984863,
"logits/rejected": -2.410681962966919,
"logps/chosen": -341.8808898925781,
"logps/rejected": -532.3084106445312,
"loss": 0.0303,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.787487030029297,
"rewards/margins": 11.951956748962402,
"rewards/rejected": -14.739442825317383,
"step": 1020
},
{
"epoch": 1.32,
"learning_rate": 4.3299214472744583e-07,
"logits/chosen": -2.4356143474578857,
"logits/rejected": -2.484920024871826,
"logps/chosen": -378.17376708984375,
"logps/rejected": -561.7147216796875,
"loss": 0.0212,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.4539060592651367,
"rewards/margins": 12.572771072387695,
"rewards/rejected": -15.026677131652832,
"step": 1030
},
{
"epoch": 1.34,
"learning_rate": 4.3180195191621036e-07,
"logits/chosen": -2.4165291786193848,
"logits/rejected": -2.3931941986083984,
"logps/chosen": -377.8540344238281,
"logps/rejected": -555.7592163085938,
"loss": 0.0254,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2512832880020142,
"rewards/margins": 12.33320426940918,
"rewards/rejected": -13.58448600769043,
"step": 1040
},
{
"epoch": 1.35,
"learning_rate": 4.30611759104975e-07,
"logits/chosen": -2.3533992767333984,
"logits/rejected": -2.3296687602996826,
"logps/chosen": -418.5027770996094,
"logps/rejected": -600.8396606445312,
"loss": 0.0201,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.496593952178955,
"rewards/margins": 13.320207595825195,
"rewards/rejected": -15.816801071166992,
"step": 1050
},
{
"epoch": 1.36,
"learning_rate": 4.2942156629373954e-07,
"logits/chosen": -2.246854782104492,
"logits/rejected": -2.3130173683166504,
"logps/chosen": -396.1013488769531,
"logps/rejected": -553.8746337890625,
"loss": 0.0209,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.86772084236145,
"rewards/margins": 13.22656536102295,
"rewards/rejected": -16.094287872314453,
"step": 1060
},
{
"epoch": 1.38,
"learning_rate": 4.2823137348250413e-07,
"logits/chosen": -2.1099252700805664,
"logits/rejected": -2.1625306606292725,
"logps/chosen": -439.188232421875,
"logps/rejected": -567.4981689453125,
"loss": 0.0195,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -3.4562058448791504,
"rewards/margins": 11.824674606323242,
"rewards/rejected": -15.280881881713867,
"step": 1070
},
{
"epoch": 1.39,
"learning_rate": 4.270411806712687e-07,
"logits/chosen": -2.182868480682373,
"logits/rejected": -2.140045642852783,
"logps/chosen": -414.1625061035156,
"logps/rejected": -590.7791748046875,
"loss": 0.0203,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.0633182525634766,
"rewards/margins": 13.505340576171875,
"rewards/rejected": -15.568659782409668,
"step": 1080
},
{
"epoch": 1.4,
"learning_rate": 4.258509878600333e-07,
"logits/chosen": -2.301701068878174,
"logits/rejected": -2.3724331855773926,
"logps/chosen": -318.6136779785156,
"logps/rejected": -549.11572265625,
"loss": 0.0162,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.7360296249389648,
"rewards/margins": 12.463074684143066,
"rewards/rejected": -14.199106216430664,
"step": 1090
},
{
"epoch": 1.41,
"learning_rate": 4.2466079504879785e-07,
"logits/chosen": -2.3375637531280518,
"logits/rejected": -2.371568202972412,
"logps/chosen": -355.43218994140625,
"logps/rejected": -497.6923828125,
"loss": 0.0296,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.8382488489151,
"rewards/margins": 11.133204460144043,
"rewards/rejected": -12.971455574035645,
"step": 1100
},
{
"epoch": 1.41,
"eval_logits/chosen": -2.422253131866455,
"eval_logits/rejected": -2.3856472969055176,
"eval_logps/chosen": -327.49688720703125,
"eval_logps/rejected": -458.99761962890625,
"eval_loss": 0.0712868794798851,
"eval_rewards/accuracies": 0.953125,
"eval_rewards/chosen": -1.8785579204559326,
"eval_rewards/margins": 9.746674537658691,
"eval_rewards/rejected": -11.625232696533203,
"eval_runtime": 38.5688,
"eval_samples_per_second": 12.964,
"eval_steps_per_second": 0.415,
"step": 1100
},
{
"epoch": 1.43,
"learning_rate": 4.234706022375625e-07,
"logits/chosen": -2.335549831390381,
"logits/rejected": -2.3633885383605957,
"logps/chosen": -334.0445251464844,
"logps/rejected": -532.0367431640625,
"loss": 0.0173,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.1568909883499146,
"rewards/margins": 12.151830673217773,
"rewards/rejected": -13.308721542358398,
"step": 1110
},
{
"epoch": 1.44,
"learning_rate": 4.2228040942632703e-07,
"logits/chosen": -2.2730376720428467,
"logits/rejected": -2.279794931411743,
"logps/chosen": -372.47711181640625,
"logps/rejected": -565.377197265625,
"loss": 0.0135,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.948188066482544,
"rewards/margins": 12.926470756530762,
"rewards/rejected": -14.874661445617676,
"step": 1120
},
{
"epoch": 1.45,
"learning_rate": 4.210902166150916e-07,
"logits/chosen": -2.1850171089172363,
"logits/rejected": -2.2554237842559814,
"logps/chosen": -330.89398193359375,
"logps/rejected": -572.4408569335938,
"loss": 0.0152,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.2409050464630127,
"rewards/margins": 15.152783393859863,
"rewards/rejected": -17.393688201904297,
"step": 1130
},
{
"epoch": 1.47,
"learning_rate": 4.199000238038562e-07,
"logits/chosen": -2.2348155975341797,
"logits/rejected": -2.276552200317383,
"logps/chosen": -391.0440673828125,
"logps/rejected": -562.8758544921875,
"loss": 0.0083,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.518620014190674,
"rewards/margins": 13.422780990600586,
"rewards/rejected": -15.941401481628418,
"step": 1140
},
{
"epoch": 1.48,
"learning_rate": 4.187098309926208e-07,
"logits/chosen": -2.234314441680908,
"logits/rejected": -2.273665428161621,
"logps/chosen": -379.77752685546875,
"logps/rejected": -609.7650146484375,
"loss": 0.0167,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.2126364707946777,
"rewards/margins": 14.33509635925293,
"rewards/rejected": -16.547733306884766,
"step": 1150
},
{
"epoch": 1.49,
"learning_rate": 4.1751963818138534e-07,
"logits/chosen": -2.2460713386535645,
"logits/rejected": -2.28529953956604,
"logps/chosen": -391.7981872558594,
"logps/rejected": -584.82373046875,
"loss": 0.0106,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.583667278289795,
"rewards/margins": 13.928072929382324,
"rewards/rejected": -16.511741638183594,
"step": 1160
},
{
"epoch": 1.5,
"learning_rate": 4.1632944537015e-07,
"logits/chosen": -2.312187671661377,
"logits/rejected": -2.313152313232422,
"logps/chosen": -332.22418212890625,
"logps/rejected": -550.9510498046875,
"loss": 0.0151,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.8060202598571777,
"rewards/margins": 13.428415298461914,
"rewards/rejected": -16.23443603515625,
"step": 1170
},
{
"epoch": 1.52,
"learning_rate": 4.151392525589145e-07,
"logits/chosen": -2.269207715988159,
"logits/rejected": -2.2718236446380615,
"logps/chosen": -332.3182067871094,
"logps/rejected": -509.44085693359375,
"loss": 0.0267,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -3.1180636882781982,
"rewards/margins": 12.261663436889648,
"rewards/rejected": -15.379727363586426,
"step": 1180
},
{
"epoch": 1.53,
"learning_rate": 4.139490597476791e-07,
"logits/chosen": -2.2478084564208984,
"logits/rejected": -2.3000128269195557,
"logps/chosen": -337.1382141113281,
"logps/rejected": -537.2418212890625,
"loss": 0.0108,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.009748935699463,
"rewards/margins": 12.527368545532227,
"rewards/rejected": -14.537118911743164,
"step": 1190
},
{
"epoch": 1.54,
"learning_rate": 4.127588669364437e-07,
"logits/chosen": -2.313680648803711,
"logits/rejected": -2.327012538909912,
"logps/chosen": -291.6064758300781,
"logps/rejected": -546.3372802734375,
"loss": 0.0148,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -3.4880402088165283,
"rewards/margins": 14.52784252166748,
"rewards/rejected": -18.01588249206543,
"step": 1200
},
{
"epoch": 1.54,
"eval_logits/chosen": -2.3877577781677246,
"eval_logits/rejected": -2.35882568359375,
"eval_logps/chosen": -347.202880859375,
"eval_logps/rejected": -496.7171325683594,
"eval_loss": 0.07778895646333694,
"eval_rewards/accuracies": 0.953125,
"eval_rewards/chosen": -3.8491578102111816,
"eval_rewards/margins": 11.548023223876953,
"eval_rewards/rejected": -15.397181510925293,
"eval_runtime": 38.6215,
"eval_samples_per_second": 12.946,
"eval_steps_per_second": 0.414,
"step": 1200
},
{
"epoch": 1.56,
"learning_rate": 4.115686741252083e-07,
"logits/chosen": -2.292132616043091,
"logits/rejected": -2.347907781600952,
"logps/chosen": -362.74481201171875,
"logps/rejected": -558.0933837890625,
"loss": 0.0163,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.724060297012329,
"rewards/margins": 13.57036304473877,
"rewards/rejected": -16.294422149658203,
"step": 1210
},
{
"epoch": 1.57,
"learning_rate": 4.103784813139728e-07,
"logits/chosen": -2.3167264461517334,
"logits/rejected": -2.3449079990386963,
"logps/chosen": -369.4256591796875,
"logps/rejected": -566.0360107421875,
"loss": 0.0155,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -3.4625415802001953,
"rewards/margins": 13.401751518249512,
"rewards/rejected": -16.86429214477539,
"step": 1220
},
{
"epoch": 1.58,
"learning_rate": 4.091882885027374e-07,
"logits/chosen": -2.3674325942993164,
"logits/rejected": -2.455508232116699,
"logps/chosen": -381.26068115234375,
"logps/rejected": -550.90625,
"loss": 0.0244,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -3.13775897026062,
"rewards/margins": 13.795980453491211,
"rewards/rejected": -16.933740615844727,
"step": 1230
},
{
"epoch": 1.59,
"learning_rate": 4.07998095691502e-07,
"logits/chosen": -2.3083391189575195,
"logits/rejected": -2.330939769744873,
"logps/chosen": -362.44171142578125,
"logps/rejected": -523.51171875,
"loss": 0.0124,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.1269755363464355,
"rewards/margins": 12.574740409851074,
"rewards/rejected": -15.701716423034668,
"step": 1240
},
{
"epoch": 1.61,
"learning_rate": 4.0680790288026654e-07,
"logits/chosen": -2.3918001651763916,
"logits/rejected": -2.426542282104492,
"logps/chosen": -420.2566833496094,
"logps/rejected": -605.3551025390625,
"loss": 0.0202,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.9122402667999268,
"rewards/margins": 13.67309856414795,
"rewards/rejected": -16.585338592529297,
"step": 1250
},
{
"epoch": 1.62,
"learning_rate": 4.056177100690312e-07,
"logits/chosen": -2.2674708366394043,
"logits/rejected": -2.2906508445739746,
"logps/chosen": -390.3266296386719,
"logps/rejected": -587.2613525390625,
"loss": 0.011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.047337532043457,
"rewards/margins": 13.966493606567383,
"rewards/rejected": -18.013832092285156,
"step": 1260
},
{
"epoch": 1.63,
"learning_rate": 4.044275172577957e-07,
"logits/chosen": -2.256685733795166,
"logits/rejected": -2.283980131149292,
"logps/chosen": -307.6758728027344,
"logps/rejected": -536.929931640625,
"loss": 0.0251,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.8640975952148438,
"rewards/margins": 14.663250923156738,
"rewards/rejected": -17.527347564697266,
"step": 1270
},
{
"epoch": 1.65,
"learning_rate": 4.0323732444656036e-07,
"logits/chosen": -2.2302117347717285,
"logits/rejected": -2.319187641143799,
"logps/chosen": -377.6014099121094,
"logps/rejected": -592.4954223632812,
"loss": 0.0208,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.1300606727600098,
"rewards/margins": 14.494562149047852,
"rewards/rejected": -16.624622344970703,
"step": 1280
},
{
"epoch": 1.66,
"learning_rate": 4.020471316353249e-07,
"logits/chosen": -2.3077661991119385,
"logits/rejected": -2.34450364112854,
"logps/chosen": -384.89007568359375,
"logps/rejected": -577.9298095703125,
"loss": 0.0126,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.4994373321533203,
"rewards/margins": 12.733844757080078,
"rewards/rejected": -14.233282089233398,
"step": 1290
},
{
"epoch": 1.67,
"learning_rate": 4.008569388240895e-07,
"logits/chosen": -2.230447292327881,
"logits/rejected": -2.283294677734375,
"logps/chosen": -346.1694641113281,
"logps/rejected": -534.3992919921875,
"loss": 0.019,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3882415294647217,
"rewards/margins": 14.5983247756958,
"rewards/rejected": -15.986566543579102,
"step": 1300
},
{
"epoch": 1.67,
"eval_logits/chosen": -2.403440475463867,
"eval_logits/rejected": -2.378675699234009,
"eval_logps/chosen": -332.9962463378906,
"eval_logps/rejected": -477.9118957519531,
"eval_loss": 0.07047431915998459,
"eval_rewards/accuracies": 0.9375,
"eval_rewards/chosen": -2.4284939765930176,
"eval_rewards/margins": 11.088165283203125,
"eval_rewards/rejected": -13.516657829284668,
"eval_runtime": 38.6695,
"eval_samples_per_second": 12.93,
"eval_steps_per_second": 0.414,
"step": 1300
},
{
"epoch": 1.68,
"learning_rate": 3.996667460128541e-07,
"logits/chosen": -2.31799578666687,
"logits/rejected": -2.3302206993103027,
"logps/chosen": -333.87261962890625,
"logps/rejected": -506.0113220214844,
"loss": 0.0166,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.8233000040054321,
"rewards/margins": 13.524618148803711,
"rewards/rejected": -15.347920417785645,
"step": 1310
},
{
"epoch": 1.7,
"learning_rate": 3.9847655320161867e-07,
"logits/chosen": -2.3380274772644043,
"logits/rejected": -2.3655543327331543,
"logps/chosen": -330.939453125,
"logps/rejected": -566.5387573242188,
"loss": 0.0211,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.293247938156128,
"rewards/margins": 13.109285354614258,
"rewards/rejected": -16.402530670166016,
"step": 1320
},
{
"epoch": 1.71,
"learning_rate": 3.972863603903832e-07,
"logits/chosen": -2.4296791553497314,
"logits/rejected": -2.395019054412842,
"logps/chosen": -368.58843994140625,
"logps/rejected": -550.57177734375,
"loss": 0.0147,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.1659107208251953,
"rewards/margins": 14.171772956848145,
"rewards/rejected": -16.337684631347656,
"step": 1330
},
{
"epoch": 1.72,
"learning_rate": 3.9609616757914784e-07,
"logits/chosen": -2.386429786682129,
"logits/rejected": -2.401638984680176,
"logps/chosen": -347.26214599609375,
"logps/rejected": -538.3074951171875,
"loss": 0.0162,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.806589126586914,
"rewards/margins": 12.520380973815918,
"rewards/rejected": -15.326970100402832,
"step": 1340
},
{
"epoch": 1.74,
"learning_rate": 3.949059747679124e-07,
"logits/chosen": -2.3784899711608887,
"logits/rejected": -2.42669939994812,
"logps/chosen": -364.69512939453125,
"logps/rejected": -592.1053466796875,
"loss": 0.0159,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -3.1103992462158203,
"rewards/margins": 15.538830757141113,
"rewards/rejected": -18.649229049682617,
"step": 1350
},
{
"epoch": 1.75,
"learning_rate": 3.9371578195667697e-07,
"logits/chosen": -2.4179718494415283,
"logits/rejected": -2.4337425231933594,
"logps/chosen": -338.0289001464844,
"logps/rejected": -522.47412109375,
"loss": 0.0343,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -2.291858196258545,
"rewards/margins": 14.658024787902832,
"rewards/rejected": -16.949880599975586,
"step": 1360
},
{
"epoch": 1.76,
"learning_rate": 3.9252558914544156e-07,
"logits/chosen": -2.37274169921875,
"logits/rejected": -2.376906633377075,
"logps/chosen": -371.0089111328125,
"logps/rejected": -562.0587158203125,
"loss": 0.0236,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -2.699599504470825,
"rewards/margins": 14.066309928894043,
"rewards/rejected": -16.76590919494629,
"step": 1370
},
{
"epoch": 1.77,
"learning_rate": 3.9133539633420615e-07,
"logits/chosen": -2.3570303916931152,
"logits/rejected": -2.4414098262786865,
"logps/chosen": -347.50531005859375,
"logps/rejected": -606.2113647460938,
"loss": 0.015,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.3379924297332764,
"rewards/margins": 14.867982864379883,
"rewards/rejected": -17.205974578857422,
"step": 1380
},
{
"epoch": 1.79,
"learning_rate": 3.901452035229707e-07,
"logits/chosen": -2.373347043991089,
"logits/rejected": -2.4218459129333496,
"logps/chosen": -421.48187255859375,
"logps/rejected": -606.8762817382812,
"loss": 0.0132,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.6556594371795654,
"rewards/margins": 14.492483139038086,
"rewards/rejected": -17.148143768310547,
"step": 1390
},
{
"epoch": 1.8,
"learning_rate": 3.8895501071173533e-07,
"logits/chosen": -2.3142504692077637,
"logits/rejected": -2.3538806438446045,
"logps/chosen": -325.9708557128906,
"logps/rejected": -511.67449951171875,
"loss": 0.0214,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -2.7798726558685303,
"rewards/margins": 13.109631538391113,
"rewards/rejected": -15.889503479003906,
"step": 1400
},
{
"epoch": 1.8,
"eval_logits/chosen": -2.3960964679718018,
"eval_logits/rejected": -2.3517098426818848,
"eval_logps/chosen": -346.35821533203125,
"eval_logps/rejected": -495.85186767578125,
"eval_loss": 0.07910314947366714,
"eval_rewards/accuracies": 0.96875,
"eval_rewards/chosen": -3.7646918296813965,
"eval_rewards/margins": 11.545960426330566,
"eval_rewards/rejected": -15.310651779174805,
"eval_runtime": 38.7173,
"eval_samples_per_second": 12.914,
"eval_steps_per_second": 0.413,
"step": 1400
},
{
"epoch": 1.81,
"learning_rate": 3.8776481790049987e-07,
"logits/chosen": -2.3062312602996826,
"logits/rejected": -2.3327383995056152,
"logps/chosen": -354.59381103515625,
"logps/rejected": -503.6541442871094,
"loss": 0.0196,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -3.5312271118164062,
"rewards/margins": 13.276026725769043,
"rewards/rejected": -16.807254791259766,
"step": 1410
},
{
"epoch": 1.83,
"learning_rate": 3.865746250892644e-07,
"logits/chosen": -2.396146774291992,
"logits/rejected": -2.3744444847106934,
"logps/chosen": -397.74609375,
"logps/rejected": -583.1174926757812,
"loss": 0.0162,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.2743606567382812,
"rewards/margins": 15.37347412109375,
"rewards/rejected": -18.647836685180664,
"step": 1420
},
{
"epoch": 1.84,
"learning_rate": 3.8538443227802905e-07,
"logits/chosen": -2.3621578216552734,
"logits/rejected": -2.3470935821533203,
"logps/chosen": -374.19757080078125,
"logps/rejected": -564.0121459960938,
"loss": 0.022,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -4.340083122253418,
"rewards/margins": 13.78313159942627,
"rewards/rejected": -18.123212814331055,
"step": 1430
},
{
"epoch": 1.85,
"learning_rate": 3.841942394667936e-07,
"logits/chosen": -2.304884672164917,
"logits/rejected": -2.4029793739318848,
"logps/chosen": -369.39898681640625,
"logps/rejected": -578.387451171875,
"loss": 0.0146,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.7867379188537598,
"rewards/margins": 14.443509101867676,
"rewards/rejected": -17.23024559020996,
"step": 1440
},
{
"epoch": 1.86,
"learning_rate": 3.8300404665555817e-07,
"logits/chosen": -2.2816107273101807,
"logits/rejected": -2.2829480171203613,
"logps/chosen": -374.7585144042969,
"logps/rejected": -540.5015869140625,
"loss": 0.0164,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.205556869506836,
"rewards/margins": 14.464788436889648,
"rewards/rejected": -16.670345306396484,
"step": 1450
},
{
"epoch": 1.88,
"learning_rate": 3.8181385384432276e-07,
"logits/chosen": -2.282743453979492,
"logits/rejected": -2.2942354679107666,
"logps/chosen": -394.46502685546875,
"logps/rejected": -594.6571044921875,
"loss": 0.0112,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.892620325088501,
"rewards/margins": 14.386013984680176,
"rewards/rejected": -17.27863311767578,
"step": 1460
},
{
"epoch": 1.89,
"learning_rate": 3.8062366103308735e-07,
"logits/chosen": -2.2720725536346436,
"logits/rejected": -2.245262622833252,
"logps/chosen": -342.9836730957031,
"logps/rejected": -546.7418212890625,
"loss": 0.0365,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.7557284832000732,
"rewards/margins": 14.667689323425293,
"rewards/rejected": -17.423416137695312,
"step": 1470
},
{
"epoch": 1.9,
"learning_rate": 3.794334682218519e-07,
"logits/chosen": -2.295213222503662,
"logits/rejected": -2.3375067710876465,
"logps/chosen": -370.61798095703125,
"logps/rejected": -474.4059143066406,
"loss": 0.0237,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -3.196665048599243,
"rewards/margins": 12.084269523620605,
"rewards/rejected": -15.28093433380127,
"step": 1480
},
{
"epoch": 1.92,
"learning_rate": 3.7824327541061653e-07,
"logits/chosen": -2.4100170135498047,
"logits/rejected": -2.4586007595062256,
"logps/chosen": -358.7035217285156,
"logps/rejected": -547.9478149414062,
"loss": 0.0184,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.731393337249756,
"rewards/margins": 12.000238418579102,
"rewards/rejected": -15.73162841796875,
"step": 1490
},
{
"epoch": 1.93,
"learning_rate": 3.7705308259938107e-07,
"logits/chosen": -2.432584047317505,
"logits/rejected": -2.430572032928467,
"logps/chosen": -400.4476318359375,
"logps/rejected": -589.388427734375,
"loss": 0.0124,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.0731418132781982,
"rewards/margins": 13.324457168579102,
"rewards/rejected": -16.397600173950195,
"step": 1500
},
{
"epoch": 1.93,
"eval_logits/chosen": -2.4233508110046387,
"eval_logits/rejected": -2.3732004165649414,
"eval_logps/chosen": -345.49517822265625,
"eval_logps/rejected": -491.72662353515625,
"eval_loss": 0.08803335577249527,
"eval_rewards/accuracies": 0.9375,
"eval_rewards/chosen": -3.678384304046631,
"eval_rewards/margins": 11.219746589660645,
"eval_rewards/rejected": -14.89813232421875,
"eval_runtime": 38.608,
"eval_samples_per_second": 12.951,
"eval_steps_per_second": 0.414,
"step": 1500
},
{
"epoch": 1.94,
"learning_rate": 3.7586288978814566e-07,
"logits/chosen": -2.335282564163208,
"logits/rejected": -2.330732583999634,
"logps/chosen": -388.20806884765625,
"logps/rejected": -580.2225341796875,
"loss": 0.0118,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.1078379154205322,
"rewards/margins": 13.242405891418457,
"rewards/rejected": -16.350242614746094,
"step": 1510
},
{
"epoch": 1.95,
"learning_rate": 3.7467269697691025e-07,
"logits/chosen": -2.3464579582214355,
"logits/rejected": -2.3436694145202637,
"logps/chosen": -335.885986328125,
"logps/rejected": -532.0635986328125,
"loss": 0.0328,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.1488466262817383,
"rewards/margins": 13.591397285461426,
"rewards/rejected": -16.740243911743164,
"step": 1520
},
{
"epoch": 1.97,
"learning_rate": 3.7348250416567484e-07,
"logits/chosen": -2.2621750831604004,
"logits/rejected": -2.2600533962249756,
"logps/chosen": -415.00982666015625,
"logps/rejected": -549.5345458984375,
"loss": 0.0264,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.42472767829895,
"rewards/margins": 13.469167709350586,
"rewards/rejected": -15.893896102905273,
"step": 1530
},
{
"epoch": 1.98,
"learning_rate": 3.722923113544394e-07,
"logits/chosen": -2.361262559890747,
"logits/rejected": -2.315338611602783,
"logps/chosen": -394.708740234375,
"logps/rejected": -578.1019287109375,
"loss": 0.0251,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.082348585128784,
"rewards/margins": 15.23118782043457,
"rewards/rejected": -17.31353759765625,
"step": 1540
},
{
"epoch": 1.99,
"learning_rate": 3.71102118543204e-07,
"logits/chosen": -2.315455913543701,
"logits/rejected": -2.284585952758789,
"logps/chosen": -367.0815734863281,
"logps/rejected": -577.2198486328125,
"loss": 0.0113,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.139265537261963,
"rewards/margins": 14.051069259643555,
"rewards/rejected": -16.19033432006836,
"step": 1550
}
],
"logging_steps": 10,
"max_steps": 4668,
"num_train_epochs": 6,
"save_steps": 500,
"total_flos": 0.0,
"trial_name": null,
"trial_params": null
}