zephyr-7b-align-scan / trainer_state.json
taicheng's picture
Model save
09dcdb0 verified
raw
history blame
57.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994767137624281,
"eval_steps": 100,
"global_step": 955,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010465724751439038,
"grad_norm": 7.312430627947873,
"learning_rate": 5.208333333333333e-09,
"logits/chosen": -2.6544837951660156,
"logits/rejected": -2.5759358406066895,
"logps/chosen": -101.20021057128906,
"logps/rejected": -85.73662567138672,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.010465724751439037,
"grad_norm": 6.798309843145578,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -2.5073227882385254,
"logits/rejected": -2.498267650604248,
"logps/chosen": -76.74188995361328,
"logps/rejected": -68.77124786376953,
"loss": 0.6931,
"rewards/accuracies": 0.1597222238779068,
"rewards/chosen": -0.00010638780076988041,
"rewards/margins": -0.00026712569524534047,
"rewards/rejected": 0.00016073790902737528,
"step": 10
},
{
"epoch": 0.020931449502878074,
"grad_norm": 6.328409074575995,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.5329933166503906,
"logits/rejected": -2.5121402740478516,
"logps/chosen": -91.55634307861328,
"logps/rejected": -97.98811340332031,
"loss": 0.6931,
"rewards/accuracies": 0.26875001192092896,
"rewards/chosen": -8.86881971382536e-05,
"rewards/margins": 0.0001090427249437198,
"rewards/rejected": -0.00019773092935793102,
"step": 20
},
{
"epoch": 0.03139717425431711,
"grad_norm": 5.987591394147146,
"learning_rate": 1.5624999999999999e-07,
"logits/chosen": -2.6352438926696777,
"logits/rejected": -2.624114513397217,
"logps/chosen": -74.1455078125,
"logps/rejected": -74.52520751953125,
"loss": 0.6929,
"rewards/accuracies": 0.2562499940395355,
"rewards/chosen": -0.00024412055790890008,
"rewards/margins": 0.00047922172234393656,
"rewards/rejected": -0.0007233422948047519,
"step": 30
},
{
"epoch": 0.04186289900575615,
"grad_norm": 6.813717975556312,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.5477070808410645,
"logits/rejected": -2.495793104171753,
"logps/chosen": -90.96524810791016,
"logps/rejected": -89.30764770507812,
"loss": 0.6925,
"rewards/accuracies": 0.3375000059604645,
"rewards/chosen": -3.622327858465724e-05,
"rewards/margins": 0.0016909090336412191,
"rewards/rejected": -0.0017271323595196009,
"step": 40
},
{
"epoch": 0.052328623757195186,
"grad_norm": 6.382358089839322,
"learning_rate": 2.604166666666667e-07,
"logits/chosen": -2.5686728954315186,
"logits/rejected": -2.5408482551574707,
"logps/chosen": -76.85763549804688,
"logps/rejected": -78.096923828125,
"loss": 0.6914,
"rewards/accuracies": 0.3375000059604645,
"rewards/chosen": 0.002494217362254858,
"rewards/margins": 0.002928710076957941,
"rewards/rejected": -0.00043449303484521806,
"step": 50
},
{
"epoch": 0.06279434850863422,
"grad_norm": 6.142397539571157,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.520242214202881,
"logits/rejected": -2.5160062313079834,
"logps/chosen": -71.42273712158203,
"logps/rejected": -71.63546752929688,
"loss": 0.6901,
"rewards/accuracies": 0.2874999940395355,
"rewards/chosen": 0.011287200264632702,
"rewards/margins": 0.006170675158500671,
"rewards/rejected": 0.0051165251061320305,
"step": 60
},
{
"epoch": 0.07326007326007326,
"grad_norm": 6.834963526035445,
"learning_rate": 3.645833333333333e-07,
"logits/chosen": -2.462669849395752,
"logits/rejected": -2.448659658432007,
"logps/chosen": -72.4560775756836,
"logps/rejected": -89.7001953125,
"loss": 0.6863,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.019068170338869095,
"rewards/margins": 0.014615567401051521,
"rewards/rejected": 0.0044526029378175735,
"step": 70
},
{
"epoch": 0.0837257980115123,
"grad_norm": 7.385617041745493,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.4439797401428223,
"logits/rejected": -2.4138834476470947,
"logps/chosen": -81.91383361816406,
"logps/rejected": -80.78437042236328,
"loss": 0.6811,
"rewards/accuracies": 0.34375,
"rewards/chosen": 0.00990500207990408,
"rewards/margins": 0.032499730587005615,
"rewards/rejected": -0.02259472757577896,
"step": 80
},
{
"epoch": 0.09419152276295134,
"grad_norm": 8.102390740927866,
"learning_rate": 4.6874999999999996e-07,
"logits/chosen": -2.3960914611816406,
"logits/rejected": -2.3794617652893066,
"logps/chosen": -73.29556274414062,
"logps/rejected": -76.41487121582031,
"loss": 0.6784,
"rewards/accuracies": 0.28125,
"rewards/chosen": -0.040218498557806015,
"rewards/margins": 0.016615843400359154,
"rewards/rejected": -0.056834347546100616,
"step": 90
},
{
"epoch": 0.10465724751439037,
"grad_norm": 6.755359569975091,
"learning_rate": 4.999732492681437e-07,
"logits/chosen": -2.3556602001190186,
"logits/rejected": -2.3346455097198486,
"logps/chosen": -78.03315734863281,
"logps/rejected": -95.56941223144531,
"loss": 0.6719,
"rewards/accuracies": 0.32499998807907104,
"rewards/chosen": -0.005892142653465271,
"rewards/margins": 0.07231049239635468,
"rewards/rejected": -0.07820263504981995,
"step": 100
},
{
"epoch": 0.10465724751439037,
"eval_logits/chosen": -2.3486523628234863,
"eval_logits/rejected": -2.3309624195098877,
"eval_logps/chosen": -73.28648376464844,
"eval_logps/rejected": -85.4478530883789,
"eval_loss": 0.6686670184135437,
"eval_rewards/accuracies": 0.3273809552192688,
"eval_rewards/chosen": 0.012047496624290943,
"eval_rewards/margins": 0.05524253472685814,
"eval_rewards/rejected": -0.04319504275918007,
"eval_runtime": 113.7223,
"eval_samples_per_second": 17.587,
"eval_steps_per_second": 0.554,
"step": 100
},
{
"epoch": 0.1151229722658294,
"grad_norm": 13.391566933736375,
"learning_rate": 4.996723692767926e-07,
"logits/chosen": -2.406934976577759,
"logits/rejected": -2.4089908599853516,
"logps/chosen": -80.33895111083984,
"logps/rejected": -93.52415466308594,
"loss": 0.6563,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.034039516001939774,
"rewards/margins": 0.08607066422700882,
"rewards/rejected": -0.1201101765036583,
"step": 110
},
{
"epoch": 0.12558869701726844,
"grad_norm": 9.646857443256218,
"learning_rate": 4.990375746213598e-07,
"logits/chosen": -2.2991995811462402,
"logits/rejected": -2.265392303466797,
"logps/chosen": -76.64207458496094,
"logps/rejected": -94.79474639892578,
"loss": 0.6601,
"rewards/accuracies": 0.36250001192092896,
"rewards/chosen": -0.01742006093263626,
"rewards/margins": 0.11232365667819977,
"rewards/rejected": -0.12974372506141663,
"step": 120
},
{
"epoch": 0.1360544217687075,
"grad_norm": 16.35113879818621,
"learning_rate": 4.980697142834314e-07,
"logits/chosen": -2.244642496109009,
"logits/rejected": -2.2259411811828613,
"logps/chosen": -65.43486022949219,
"logps/rejected": -84.06461334228516,
"loss": 0.66,
"rewards/accuracies": 0.29374998807907104,
"rewards/chosen": -0.04491991177201271,
"rewards/margins": 0.0634341612458229,
"rewards/rejected": -0.10835406929254532,
"step": 130
},
{
"epoch": 0.14652014652014653,
"grad_norm": 17.442951288216843,
"learning_rate": 4.967700826904229e-07,
"logits/chosen": -2.218097686767578,
"logits/rejected": -2.2064428329467773,
"logps/chosen": -109.58231353759766,
"logps/rejected": -120.42280578613281,
"loss": 0.6529,
"rewards/accuracies": 0.39375001192092896,
"rewards/chosen": -0.18489012122154236,
"rewards/margins": 0.14332745969295502,
"rewards/rejected": -0.3282175660133362,
"step": 140
},
{
"epoch": 0.15698587127158556,
"grad_norm": 21.84882656667421,
"learning_rate": 4.951404179843962e-07,
"logits/chosen": -2.3305749893188477,
"logits/rejected": -2.3431344032287598,
"logps/chosen": -69.88795471191406,
"logps/rejected": -94.39766693115234,
"loss": 0.6506,
"rewards/accuracies": 0.2874999940395355,
"rewards/chosen": -0.14505597949028015,
"rewards/margins": 0.1382911652326584,
"rewards/rejected": -0.28334707021713257,
"step": 150
},
{
"epoch": 0.1674515960230246,
"grad_norm": 13.04813612493708,
"learning_rate": 4.931828996974498e-07,
"logits/chosen": -2.0861306190490723,
"logits/rejected": -2.0900378227233887,
"logps/chosen": -87.21963500976562,
"logps/rejected": -114.88661193847656,
"loss": 0.6528,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.13326936960220337,
"rewards/margins": 0.1332504153251648,
"rewards/rejected": -0.26651981472969055,
"step": 160
},
{
"epoch": 0.17791732077446362,
"grad_norm": 14.778101043613702,
"learning_rate": 4.909001458367866e-07,
"logits/chosen": -1.9173187017440796,
"logits/rejected": -1.8886661529541016,
"logps/chosen": -81.811767578125,
"logps/rejected": -101.0234375,
"loss": 0.634,
"rewards/accuracies": 0.32499998807907104,
"rewards/chosen": -0.16403724253177643,
"rewards/margins": 0.19587047398090363,
"rewards/rejected": -0.35990768671035767,
"step": 170
},
{
"epoch": 0.18838304552590268,
"grad_norm": 26.424629291070275,
"learning_rate": 4.882952093833627e-07,
"logits/chosen": -1.3328689336776733,
"logits/rejected": -1.3415606021881104,
"logps/chosen": -121.7206039428711,
"logps/rejected": -163.23861694335938,
"loss": 0.6215,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.4643549919128418,
"rewards/margins": 0.2982317805290222,
"rewards/rejected": -0.7625867128372192,
"step": 180
},
{
"epoch": 0.1988487702773417,
"grad_norm": 18.286232555939137,
"learning_rate": 4.853715742087946e-07,
"logits/chosen": -1.299851655960083,
"logits/rejected": -1.2263530492782593,
"logps/chosen": -130.7896270751953,
"logps/rejected": -153.83375549316406,
"loss": 0.6187,
"rewards/accuracies": 0.3812499940395355,
"rewards/chosen": -0.5220120549201965,
"rewards/margins": 0.28233885765075684,
"rewards/rejected": -0.8043509721755981,
"step": 190
},
{
"epoch": 0.20931449502878074,
"grad_norm": 22.532611448739736,
"learning_rate": 4.821331504159906e-07,
"logits/chosen": -1.2297742366790771,
"logits/rejected": -1.1969741582870483,
"logps/chosen": -117.6633071899414,
"logps/rejected": -130.67193603515625,
"loss": 0.6488,
"rewards/accuracies": 0.32499998807907104,
"rewards/chosen": -0.3429523706436157,
"rewards/margins": 0.20222480595111847,
"rewards/rejected": -0.545177161693573,
"step": 200
},
{
"epoch": 0.20931449502878074,
"eval_logits/chosen": -1.2583706378936768,
"eval_logits/rejected": -1.23958420753479,
"eval_logps/chosen": -102.57432556152344,
"eval_logps/rejected": -130.3724822998047,
"eval_loss": 0.634810745716095,
"eval_rewards/accuracies": 0.3373015820980072,
"eval_rewards/chosen": -0.2808309495449066,
"eval_rewards/margins": 0.21161039173603058,
"eval_rewards/rejected": -0.492441326379776,
"eval_runtime": 113.6607,
"eval_samples_per_second": 17.596,
"eval_steps_per_second": 0.554,
"step": 200
},
{
"epoch": 0.21978021978021978,
"grad_norm": 43.96668983794973,
"learning_rate": 4.785842691097342e-07,
"logits/chosen": -1.166576623916626,
"logits/rejected": -1.0878881216049194,
"logps/chosen": -102.22654724121094,
"logps/rejected": -119.87858581542969,
"loss": 0.6449,
"rewards/accuracies": 0.30000001192092896,
"rewards/chosen": -0.30706560611724854,
"rewards/margins": 0.2078002393245697,
"rewards/rejected": -0.5148658752441406,
"step": 210
},
{
"epoch": 0.2302459445316588,
"grad_norm": 21.60611363424848,
"learning_rate": 4.7472967660421603e-07,
"logits/chosen": -1.4838167428970337,
"logits/rejected": -1.293268084526062,
"logps/chosen": -140.9451446533203,
"logps/rejected": -159.35256958007812,
"loss": 0.6317,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.40442484617233276,
"rewards/margins": 0.2767654359340668,
"rewards/rejected": -0.6811902523040771,
"step": 220
},
{
"epoch": 0.24071166928309787,
"grad_norm": 23.269156895827596,
"learning_rate": 4.705745280752585e-07,
"logits/chosen": -1.360938310623169,
"logits/rejected": -1.2307772636413574,
"logps/chosen": -92.72604370117188,
"logps/rejected": -110.38468933105469,
"loss": 0.6411,
"rewards/accuracies": 0.30000001192092896,
"rewards/chosen": -0.3325788080692291,
"rewards/margins": 0.19615355134010315,
"rewards/rejected": -0.5287323594093323,
"step": 230
},
{
"epoch": 0.25117739403453687,
"grad_norm": 32.70987860826756,
"learning_rate": 4.6612438066572555e-07,
"logits/chosen": -1.1462054252624512,
"logits/rejected": -0.9243119359016418,
"logps/chosen": -129.86032104492188,
"logps/rejected": -173.3503875732422,
"loss": 0.6198,
"rewards/accuracies": 0.41874998807907104,
"rewards/chosen": -0.4205262064933777,
"rewards/margins": 0.42327141761779785,
"rewards/rejected": -0.8437975645065308,
"step": 240
},
{
"epoch": 0.2616431187859759,
"grad_norm": 18.56295517177359,
"learning_rate": 4.6138518605333664e-07,
"logits/chosen": -0.9204837083816528,
"logits/rejected": -0.8685296177864075,
"logps/chosen": -83.56452178955078,
"logps/rejected": -112.06929779052734,
"loss": 0.6227,
"rewards/accuracies": 0.24375000596046448,
"rewards/chosen": -0.30188173055648804,
"rewards/margins": 0.20583298802375793,
"rewards/rejected": -0.5077147483825684,
"step": 250
},
{
"epoch": 0.272108843537415,
"grad_norm": 24.278208900563758,
"learning_rate": 4.5636328249082514e-07,
"logits/chosen": -0.5377733111381531,
"logits/rejected": -0.28726479411125183,
"logps/chosen": -126.16239166259766,
"logps/rejected": -144.44386291503906,
"loss": 0.6132,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.43910473585128784,
"rewards/margins": 0.27166199684143066,
"rewards/rejected": -0.7107667922973633,
"step": 260
},
{
"epoch": 0.282574568288854,
"grad_norm": 33.830045643602666,
"learning_rate": 4.510653863290871e-07,
"logits/chosen": -0.43365517258644104,
"logits/rejected": -0.23527593910694122,
"logps/chosen": -127.06233215332031,
"logps/rejected": -150.31356811523438,
"loss": 0.6191,
"rewards/accuracies": 0.36250001192092896,
"rewards/chosen": -0.4495798647403717,
"rewards/margins": 0.3023374378681183,
"rewards/rejected": -0.7519172430038452,
"step": 270
},
{
"epoch": 0.29304029304029305,
"grad_norm": 24.405765366439173,
"learning_rate": 4.4549858303465737e-07,
"logits/chosen": -0.9159714579582214,
"logits/rejected": -0.717955470085144,
"logps/chosen": -109.26994323730469,
"logps/rejected": -139.60409545898438,
"loss": 0.622,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.3430403769016266,
"rewards/margins": 0.24182644486427307,
"rewards/rejected": -0.5848668217658997,
"step": 280
},
{
"epoch": 0.3035060177917321,
"grad_norm": 18.88355766247808,
"learning_rate": 4.396703177135261e-07,
"logits/chosen": -1.3694268465042114,
"logits/rejected": -1.1670420169830322,
"logps/chosen": -97.29615783691406,
"logps/rejected": -123.79087829589844,
"loss": 0.6219,
"rewards/accuracies": 0.3375000059604645,
"rewards/chosen": -0.3448092043399811,
"rewards/margins": 0.27321183681488037,
"rewards/rejected": -0.6180210709571838,
"step": 290
},
{
"epoch": 0.3139717425431711,
"grad_norm": 24.161262620892497,
"learning_rate": 4.335883851539693e-07,
"logits/chosen": -1.5254216194152832,
"logits/rejected": -1.4045015573501587,
"logps/chosen": -147.19302368164062,
"logps/rejected": -174.0820770263672,
"loss": 0.6331,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.5355431437492371,
"rewards/margins": 0.2548714876174927,
"rewards/rejected": -0.7904146313667297,
"step": 300
},
{
"epoch": 0.3139717425431711,
"eval_logits/chosen": -1.1872740983963013,
"eval_logits/rejected": -1.0319762229919434,
"eval_logps/chosen": -120.13069915771484,
"eval_logps/rejected": -157.09765625,
"eval_loss": 0.6194990277290344,
"eval_rewards/accuracies": 0.3452380895614624,
"eval_rewards/chosen": -0.45639467239379883,
"eval_rewards/margins": 0.3032984435558319,
"eval_rewards/rejected": -0.7596930265426636,
"eval_runtime": 113.7203,
"eval_samples_per_second": 17.587,
"eval_steps_per_second": 0.554,
"step": 300
},
{
"epoch": 0.32443746729461015,
"grad_norm": 27.882854283662915,
"learning_rate": 4.272609194017105e-07,
"logits/chosen": -0.8721631765365601,
"logits/rejected": -0.47974568605422974,
"logps/chosen": -142.952392578125,
"logps/rejected": -164.13180541992188,
"loss": 0.6108,
"rewards/accuracies": 0.35624998807907104,
"rewards/chosen": -0.44492608308792114,
"rewards/margins": 0.36958834528923035,
"rewards/rejected": -0.8145144581794739,
"step": 310
},
{
"epoch": 0.3349031920460492,
"grad_norm": 40.79438243043005,
"learning_rate": 4.2069638288135547e-07,
"logits/chosen": 0.030854111537337303,
"logits/rejected": 0.30916082859039307,
"logps/chosen": -143.35768127441406,
"logps/rejected": -215.4879608154297,
"loss": 0.646,
"rewards/accuracies": 0.35624998807907104,
"rewards/chosen": -0.6657307147979736,
"rewards/margins": 0.5068569779396057,
"rewards/rejected": -1.1725876331329346,
"step": 320
},
{
"epoch": 0.3453689167974882,
"grad_norm": 28.74932802994849,
"learning_rate": 4.139035550786494e-07,
"logits/chosen": 0.10449258983135223,
"logits/rejected": 0.19263358414173126,
"logps/chosen": -125.71956634521484,
"logps/rejected": -157.4947509765625,
"loss": 0.6184,
"rewards/accuracies": 0.3375000059604645,
"rewards/chosen": -0.5608196258544922,
"rewards/margins": 0.2601728141307831,
"rewards/rejected": -0.8209924697875977,
"step": 330
},
{
"epoch": 0.35583464154892724,
"grad_norm": 20.26835641704999,
"learning_rate": 4.0689152079869306e-07,
"logits/chosen": -0.6976083517074585,
"logits/rejected": -0.4943923354148865,
"logps/chosen": -127.49371337890625,
"logps/rejected": -158.2890625,
"loss": 0.6271,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.5544435977935791,
"rewards/margins": 0.3053310215473175,
"rewards/rejected": -0.8597745895385742,
"step": 340
},
{
"epoch": 0.3663003663003663,
"grad_norm": 27.63474707817879,
"learning_rate": 3.99669658015821e-07,
"logits/chosen": -0.6875920295715332,
"logits/rejected": -0.5427245497703552,
"logps/chosen": -149.16458129882812,
"logps/rejected": -175.6387481689453,
"loss": 0.6043,
"rewards/accuracies": 0.3499999940395355,
"rewards/chosen": -0.6470784544944763,
"rewards/margins": 0.2719033360481262,
"rewards/rejected": -0.9189817309379578,
"step": 350
},
{
"epoch": 0.37676609105180536,
"grad_norm": 30.4860863672758,
"learning_rate": 3.92247625331392e-07,
"logits/chosen": 0.019889334216713905,
"logits/rejected": 0.449519544839859,
"logps/chosen": -158.36898803710938,
"logps/rejected": -190.15274047851562,
"loss": 0.5979,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.6107655763626099,
"rewards/margins": 0.40659332275390625,
"rewards/rejected": -1.0173588991165161,
"step": 360
},
{
"epoch": 0.3872318158032444,
"grad_norm": 28.105949004143817,
"learning_rate": 3.846353490562664e-07,
"logits/chosen": 0.23578917980194092,
"logits/rejected": 0.578147292137146,
"logps/chosen": -144.15545654296875,
"logps/rejected": -183.06863403320312,
"loss": 0.6068,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.577487587928772,
"rewards/margins": 0.37577182054519653,
"rewards/rejected": -0.9532594680786133,
"step": 370
},
{
"epoch": 0.3976975405546834,
"grad_norm": 19.345534639045226,
"learning_rate": 3.768430099352445e-07,
"logits/chosen": -0.3674705922603607,
"logits/rejected": 0.07584401965141296,
"logps/chosen": -130.8544464111328,
"logps/rejected": -168.41697692871094,
"loss": 0.6092,
"rewards/accuracies": 0.39375001192092896,
"rewards/chosen": -0.49021464586257935,
"rewards/margins": 0.39871546626091003,
"rewards/rejected": -0.8889301419258118,
"step": 380
},
{
"epoch": 0.40816326530612246,
"grad_norm": 29.65852362420373,
"learning_rate": 3.6888102953122304e-07,
"logits/chosen": -0.23846562206745148,
"logits/rejected": -0.008897816762328148,
"logps/chosen": -138.2523956298828,
"logps/rejected": -194.8299102783203,
"loss": 0.6241,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.5844453573226929,
"rewards/margins": 0.4089936316013336,
"rewards/rejected": -0.9934390187263489,
"step": 390
},
{
"epoch": 0.4186289900575615,
"grad_norm": 42.485008339554724,
"learning_rate": 3.607600562872785e-07,
"logits/chosen": 0.0847388356924057,
"logits/rejected": 0.3739756643772125,
"logps/chosen": -133.99862670898438,
"logps/rejected": -157.2317352294922,
"loss": 0.6321,
"rewards/accuracies": 0.30000001192092896,
"rewards/chosen": -0.6070858836174011,
"rewards/margins": 0.2605039179325104,
"rewards/rejected": -0.8675897717475891,
"step": 400
},
{
"epoch": 0.4186289900575615,
"eval_logits/chosen": 0.03354182466864586,
"eval_logits/rejected": 0.37280067801475525,
"eval_logps/chosen": -146.9637451171875,
"eval_logps/rejected": -190.27566528320312,
"eval_loss": 0.6099374294281006,
"eval_rewards/accuracies": 0.363095223903656,
"eval_rewards/chosen": -0.7247251272201538,
"eval_rewards/margins": 0.36674803495407104,
"eval_rewards/rejected": -1.0914732217788696,
"eval_runtime": 113.6653,
"eval_samples_per_second": 17.596,
"eval_steps_per_second": 0.554,
"step": 400
},
{
"epoch": 0.4290947148090005,
"grad_norm": 17.771390083818016,
"learning_rate": 3.5249095128531856e-07,
"logits/chosen": -0.3344365656375885,
"logits/rejected": -0.13601410388946533,
"logps/chosen": -168.97885131835938,
"logps/rejected": -211.15029907226562,
"loss": 0.6274,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.8895459175109863,
"rewards/margins": 0.2965359091758728,
"rewards/rejected": -1.186081886291504,
"step": 410
},
{
"epoch": 0.43956043956043955,
"grad_norm": 29.30170184710936,
"learning_rate": 3.4408477372034736e-07,
"logits/chosen": -0.6253395080566406,
"logits/rejected": -0.4562221169471741,
"logps/chosen": -117.9905014038086,
"logps/rejected": -139.88853454589844,
"loss": 0.6305,
"rewards/accuracies": 0.33125001192092896,
"rewards/chosen": -0.49884548783302307,
"rewards/margins": 0.21792730689048767,
"rewards/rejected": -0.7167727947235107,
"step": 420
},
{
"epoch": 0.4500261643118786,
"grad_norm": 19.52934749077977,
"learning_rate": 3.3555276610977276e-07,
"logits/chosen": 0.060841239988803864,
"logits/rejected": 0.28291866183280945,
"logps/chosen": -119.32076263427734,
"logps/rejected": -161.02288818359375,
"loss": 0.6071,
"rewards/accuracies": 0.28125,
"rewards/chosen": -0.5378307104110718,
"rewards/margins": 0.3328271806240082,
"rewards/rejected": -0.8706579208374023,
"step": 430
},
{
"epoch": 0.4604918890633176,
"grad_norm": 28.977327104522658,
"learning_rate": 3.269063392575352e-07,
"logits/chosen": 0.562627911567688,
"logits/rejected": 0.706725001335144,
"logps/chosen": -145.38662719726562,
"logps/rejected": -177.89791870117188,
"loss": 0.6155,
"rewards/accuracies": 0.3375000059604645,
"rewards/chosen": -0.7169743776321411,
"rewards/margins": 0.19164128601551056,
"rewards/rejected": -0.9086155891418457,
"step": 440
},
{
"epoch": 0.47095761381475665,
"grad_norm": 21.22082345177454,
"learning_rate": 3.1815705699316964e-07,
"logits/chosen": 0.6601327657699585,
"logits/rejected": 0.9368169903755188,
"logps/chosen": -159.88311767578125,
"logps/rejected": -201.655517578125,
"loss": 0.6175,
"rewards/accuracies": 0.3499999940395355,
"rewards/chosen": -0.6956228017807007,
"rewards/margins": 0.4413565993309021,
"rewards/rejected": -1.1369794607162476,
"step": 450
},
{
"epoch": 0.48142333856619574,
"grad_norm": 19.35642436173428,
"learning_rate": 3.0931662070620794e-07,
"logits/chosen": 0.47756925225257874,
"logits/rejected": 1.0075833797454834,
"logps/chosen": -136.54437255859375,
"logps/rejected": -176.9443359375,
"loss": 0.6108,
"rewards/accuracies": 0.3812499940395355,
"rewards/chosen": -0.6124375462532043,
"rewards/margins": 0.4467083811759949,
"rewards/rejected": -1.0591459274291992,
"step": 460
},
{
"epoch": 0.49188906331763477,
"grad_norm": 35.6743715566195,
"learning_rate": 3.003968536966078e-07,
"logits/chosen": 0.9841750264167786,
"logits/rejected": 1.1566669940948486,
"logps/chosen": -134.76565551757812,
"logps/rejected": -183.22018432617188,
"loss": 0.6033,
"rewards/accuracies": 0.3375000059604645,
"rewards/chosen": -0.6426252722740173,
"rewards/margins": 0.34931105375289917,
"rewards/rejected": -0.9919363856315613,
"step": 470
},
{
"epoch": 0.5023547880690737,
"grad_norm": 31.355461154744457,
"learning_rate": 2.9140968536213693e-07,
"logits/chosen": 1.8672128915786743,
"logits/rejected": 2.3499321937561035,
"logps/chosen": -142.2679443359375,
"logps/rejected": -197.79867553710938,
"loss": 0.6029,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.6666483879089355,
"rewards/margins": 0.5482696890830994,
"rewards/rejected": -1.2149180173873901,
"step": 480
},
{
"epoch": 0.5128205128205128,
"grad_norm": 39.792642390254535,
"learning_rate": 2.823671352438608e-07,
"logits/chosen": 2.002504825592041,
"logits/rejected": 2.7407174110412598,
"logps/chosen": -147.71644592285156,
"logps/rejected": -184.10256958007812,
"loss": 0.6191,
"rewards/accuracies": 0.38749998807907104,
"rewards/chosen": -0.5607318878173828,
"rewards/margins": 0.49417656660079956,
"rewards/rejected": -1.0549085140228271,
"step": 490
},
{
"epoch": 0.5232862375719518,
"grad_norm": 19.501719693409513,
"learning_rate": 2.73281296951072e-07,
"logits/chosen": 2.3184399604797363,
"logits/rejected": 2.6984035968780518,
"logps/chosen": -179.01693725585938,
"logps/rejected": -221.22781372070312,
"loss": 0.6318,
"rewards/accuracies": 0.41874998807907104,
"rewards/chosen": -0.8433685302734375,
"rewards/margins": 0.404899924993515,
"rewards/rejected": -1.248268485069275,
"step": 500
},
{
"epoch": 0.5232862375719518,
"eval_logits/chosen": 2.6547250747680664,
"eval_logits/rejected": 2.9545063972473145,
"eval_logps/chosen": -155.49295043945312,
"eval_logps/rejected": -204.6371307373047,
"eval_loss": 0.6104578375816345,
"eval_rewards/accuracies": 0.3551587164402008,
"eval_rewards/chosen": -0.8100170493125916,
"eval_rewards/margins": 0.4250708818435669,
"eval_rewards/rejected": -1.2350879907608032,
"eval_runtime": 113.5938,
"eval_samples_per_second": 17.607,
"eval_steps_per_second": 0.555,
"step": 500
},
{
"epoch": 0.533751962323391,
"grad_norm": 23.676894445603228,
"learning_rate": 2.641643219871597e-07,
"logits/chosen": 2.713271379470825,
"logits/rejected": 2.843205213546753,
"logps/chosen": -130.48731994628906,
"logps/rejected": -174.28225708007812,
"loss": 0.6232,
"rewards/accuracies": 0.29374998807907104,
"rewards/chosen": -0.6893380284309387,
"rewards/margins": 0.37240949273109436,
"rewards/rejected": -1.061747431755066,
"step": 510
},
{
"epoch": 0.54421768707483,
"grad_norm": 35.84824398777263,
"learning_rate": 2.550284034980507e-07,
"logits/chosen": 2.0955018997192383,
"logits/rejected": 2.464780330657959,
"logps/chosen": -167.99371337890625,
"logps/rejected": -198.072021484375,
"loss": 0.6346,
"rewards/accuracies": 0.26875001192092896,
"rewards/chosen": -0.9579731822013855,
"rewards/margins": 0.2499997913837433,
"rewards/rejected": -1.2079728841781616,
"step": 520
},
{
"epoch": 0.554683411826269,
"grad_norm": 19.326207651996775,
"learning_rate": 2.4588575996495794e-07,
"logits/chosen": 1.578254222869873,
"logits/rejected": 1.7954685688018799,
"logps/chosen": -124.0101547241211,
"logps/rejected": -157.9552764892578,
"loss": 0.6063,
"rewards/accuracies": 0.29374998807907104,
"rewards/chosen": -0.6386234760284424,
"rewards/margins": 0.3164999485015869,
"rewards/rejected": -0.9551234245300293,
"step": 530
},
{
"epoch": 0.565149136577708,
"grad_norm": 25.47159513000541,
"learning_rate": 2.367486188632446e-07,
"logits/chosen": 1.5957086086273193,
"logits/rejected": 1.9952272176742554,
"logps/chosen": -136.2624053955078,
"logps/rejected": -160.97589111328125,
"loss": 0.6202,
"rewards/accuracies": 0.29374998807907104,
"rewards/chosen": -0.6916071772575378,
"rewards/margins": 0.2573884129524231,
"rewards/rejected": -0.9489954710006714,
"step": 540
},
{
"epoch": 0.5756148613291471,
"grad_norm": 27.189003326527832,
"learning_rate": 2.276292003092593e-07,
"logits/chosen": 0.5969494581222534,
"logits/rejected": 1.1224400997161865,
"logps/chosen": -165.4878387451172,
"logps/rejected": -201.8157958984375,
"loss": 0.6072,
"rewards/accuracies": 0.38749998807907104,
"rewards/chosen": -0.7528584599494934,
"rewards/margins": 0.4292448163032532,
"rewards/rejected": -1.1821032762527466,
"step": 550
},
{
"epoch": 0.5860805860805861,
"grad_norm": 19.01493291730421,
"learning_rate": 2.185397007170141e-07,
"logits/chosen": 0.5469252467155457,
"logits/rejected": 0.9992968440055847,
"logps/chosen": -109.37480163574219,
"logps/rejected": -158.11180114746094,
"loss": 0.607,
"rewards/accuracies": 0.3375000059604645,
"rewards/chosen": -0.5237552523612976,
"rewards/margins": 0.4429488778114319,
"rewards/rejected": -0.9667040705680847,
"step": 560
},
{
"epoch": 0.5965463108320251,
"grad_norm": 34.53884646430518,
"learning_rate": 2.094922764865619e-07,
"logits/chosen": 0.47024235129356384,
"logits/rejected": 1.3419710397720337,
"logps/chosen": -152.4022674560547,
"logps/rejected": -181.06829833984375,
"loss": 0.6036,
"rewards/accuracies": 0.35624998807907104,
"rewards/chosen": -0.6253000497817993,
"rewards/margins": 0.45260563492774963,
"rewards/rejected": -1.0779056549072266,
"step": 570
},
{
"epoch": 0.6070120355834642,
"grad_norm": 23.195476547368756,
"learning_rate": 2.0049902774588797e-07,
"logits/chosen": 0.7025114297866821,
"logits/rejected": 1.4946035146713257,
"logps/chosen": -155.9736328125,
"logps/rejected": -197.6754150390625,
"loss": 0.6174,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.7866873741149902,
"rewards/margins": 0.5116966366767883,
"rewards/rejected": -1.2983839511871338,
"step": 580
},
{
"epoch": 0.6174777603349032,
"grad_norm": 28.28948495055075,
"learning_rate": 1.9157198216806238e-07,
"logits/chosen": 1.2547266483306885,
"logits/rejected": 1.6798299551010132,
"logps/chosen": -143.81948852539062,
"logps/rejected": -180.07110595703125,
"loss": 0.5974,
"rewards/accuracies": 0.32499998807907104,
"rewards/chosen": -0.7290927171707153,
"rewards/margins": 0.344825804233551,
"rewards/rejected": -1.0739185810089111,
"step": 590
},
{
"epoch": 0.6279434850863422,
"grad_norm": 26.475084176869974,
"learning_rate": 1.8272307888529274e-07,
"logits/chosen": 1.0473191738128662,
"logits/rejected": 1.4664316177368164,
"logps/chosen": -151.5343017578125,
"logps/rejected": -203.01600646972656,
"loss": 0.5978,
"rewards/accuracies": 0.38749998807907104,
"rewards/chosen": -0.8304673433303833,
"rewards/margins": 0.4478435516357422,
"rewards/rejected": -1.278310775756836,
"step": 600
},
{
"epoch": 0.6279434850863422,
"eval_logits/chosen": 0.9605558514595032,
"eval_logits/rejected": 1.442029356956482,
"eval_logps/chosen": -147.85601806640625,
"eval_logps/rejected": -199.51206970214844,
"eval_loss": 0.6014743447303772,
"eval_rewards/accuracies": 0.3591269850730896,
"eval_rewards/chosen": -0.733647882938385,
"eval_rewards/margins": 0.4501895010471344,
"eval_rewards/rejected": -1.1838374137878418,
"eval_runtime": 113.6375,
"eval_samples_per_second": 17.6,
"eval_steps_per_second": 0.554,
"step": 600
},
{
"epoch": 0.6384092098377813,
"grad_norm": 24.64809094353209,
"learning_rate": 1.7396415252139288e-07,
"logits/chosen": 0.9734107255935669,
"logits/rejected": 1.4407756328582764,
"logps/chosen": -136.4254150390625,
"logps/rejected": -157.22183227539062,
"loss": 0.6155,
"rewards/accuracies": 0.3375000059604645,
"rewards/chosen": -0.6510840654373169,
"rewards/margins": 0.3478087782859802,
"rewards/rejected": -0.9988927841186523,
"step": 610
},
{
"epoch": 0.6488749345892203,
"grad_norm": 24.74151564246123,
"learning_rate": 1.6530691736402316e-07,
"logits/chosen": 0.4937843680381775,
"logits/rejected": 0.9153006672859192,
"logps/chosen": -137.00144958496094,
"logps/rejected": -177.6410675048828,
"loss": 0.6073,
"rewards/accuracies": 0.3499999940395355,
"rewards/chosen": -0.725879967212677,
"rewards/margins": 0.3728798031806946,
"rewards/rejected": -1.0987598896026611,
"step": 620
},
{
"epoch": 0.6593406593406593,
"grad_norm": 22.938989151746902,
"learning_rate": 1.5676295169786864e-07,
"logits/chosen": 0.19430339336395264,
"logits/rejected": 0.6654868721961975,
"logps/chosen": -176.47686767578125,
"logps/rejected": -213.91622924804688,
"loss": 0.5789,
"rewards/accuracies": 0.41874998807907104,
"rewards/chosen": -0.7090758085250854,
"rewards/margins": 0.45301565527915955,
"rewards/rejected": -1.1620914936065674,
"step": 630
},
{
"epoch": 0.6698063840920984,
"grad_norm": 43.50900642344418,
"learning_rate": 1.483436823197092e-07,
"logits/chosen": 0.7957710027694702,
"logits/rejected": 1.4320136308670044,
"logps/chosen": -182.98187255859375,
"logps/rejected": -233.4198455810547,
"loss": 0.5792,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.872964084148407,
"rewards/margins": 0.5010480284690857,
"rewards/rejected": -1.3740123510360718,
"step": 640
},
{
"epoch": 0.6802721088435374,
"grad_norm": 24.400296552508813,
"learning_rate": 1.4006036925609243e-07,
"logits/chosen": 1.008284330368042,
"logits/rejected": 1.555418848991394,
"logps/chosen": -128.50022888183594,
"logps/rejected": -170.05349731445312,
"loss": 0.59,
"rewards/accuracies": 0.3499999940395355,
"rewards/chosen": -0.5598582029342651,
"rewards/margins": 0.4066081643104553,
"rewards/rejected": -0.9664663076400757,
"step": 650
},
{
"epoch": 0.6907378335949764,
"grad_norm": 31.773454484895552,
"learning_rate": 1.319240907040458e-07,
"logits/chosen": 0.5741680860519409,
"logits/rejected": 1.095399022102356,
"logps/chosen": -144.4388427734375,
"logps/rejected": -190.87571716308594,
"loss": 0.6023,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.5422704815864563,
"rewards/margins": 0.48283880949020386,
"rewards/rejected": -1.0251094102859497,
"step": 660
},
{
"epoch": 0.7012035583464155,
"grad_norm": 37.13339246311252,
"learning_rate": 1.239457282149695e-07,
"logits/chosen": 0.8501984477043152,
"logits/rejected": 1.152748942375183,
"logps/chosen": -102.63143157958984,
"logps/rejected": -154.28704833984375,
"loss": 0.5949,
"rewards/accuracies": 0.32499998807907104,
"rewards/chosen": -0.4462759494781494,
"rewards/margins": 0.41997307538986206,
"rewards/rejected": -0.8662489652633667,
"step": 670
},
{
"epoch": 0.7116692830978545,
"grad_norm": 32.56523005952964,
"learning_rate": 1.1613595214152711e-07,
"logits/chosen": 1.335402250289917,
"logits/rejected": 1.811517357826233,
"logps/chosen": -135.314453125,
"logps/rejected": -198.50515747070312,
"loss": 0.6071,
"rewards/accuracies": 0.3499999940395355,
"rewards/chosen": -0.587442934513092,
"rewards/margins": 0.5670086145401001,
"rewards/rejected": -1.154451608657837,
"step": 680
},
{
"epoch": 0.7221350078492935,
"grad_norm": 23.014936124623496,
"learning_rate": 1.0850520736699362e-07,
"logits/chosen": 0.7806999683380127,
"logits/rejected": 1.3070814609527588,
"logps/chosen": -168.81216430664062,
"logps/rejected": -202.5950469970703,
"loss": 0.5659,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.7739877700805664,
"rewards/margins": 0.40968823432922363,
"rewards/rejected": -1.18367600440979,
"step": 690
},
{
"epoch": 0.7326007326007326,
"grad_norm": 26.764323919821496,
"learning_rate": 1.0106369933615042e-07,
"logits/chosen": 0.9588180780410767,
"logits/rejected": 1.4978833198547363,
"logps/chosen": -165.7656707763672,
"logps/rejected": -204.122802734375,
"loss": 0.6113,
"rewards/accuracies": 0.3687500059604645,
"rewards/chosen": -0.9200434684753418,
"rewards/margins": 0.37314558029174805,
"rewards/rejected": -1.2931890487670898,
"step": 700
},
{
"epoch": 0.7326007326007326,
"eval_logits/chosen": 1.1833491325378418,
"eval_logits/rejected": 1.7187780141830444,
"eval_logps/chosen": -150.68544006347656,
"eval_logps/rejected": -204.91946411132812,
"eval_loss": 0.5986347794532776,
"eval_rewards/accuracies": 0.3650793731212616,
"eval_rewards/chosen": -0.7619420289993286,
"eval_rewards/margins": 0.4759688675403595,
"eval_rewards/rejected": -1.2379108667373657,
"eval_runtime": 113.6638,
"eval_samples_per_second": 17.596,
"eval_steps_per_second": 0.554,
"step": 700
},
{
"epoch": 0.7430664573521716,
"grad_norm": 27.404707816356066,
"learning_rate": 9.382138040640714e-08,
"logits/chosen": 0.7452703714370728,
"logits/rejected": 1.4386818408966064,
"logps/chosen": -175.61923217773438,
"logps/rejected": -214.57119750976562,
"loss": 0.618,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.852981686592102,
"rewards/margins": 0.44363918900489807,
"rewards/rejected": -1.2966209650039673,
"step": 710
},
{
"epoch": 0.7535321821036107,
"grad_norm": 27.03589930657382,
"learning_rate": 8.678793653740632e-08,
"logits/chosen": 1.3341294527053833,
"logits/rejected": 1.880934476852417,
"logps/chosen": -154.9960174560547,
"logps/rejected": -193.92404174804688,
"loss": 0.5852,
"rewards/accuracies": 0.38749998807907104,
"rewards/chosen": -0.8138143420219421,
"rewards/margins": 0.4843239188194275,
"rewards/rejected": -1.2981382608413696,
"step": 720
},
{
"epoch": 0.7639979068550498,
"grad_norm": 19.503526282992237,
"learning_rate": 7.997277433690983e-08,
"logits/chosen": 1.2488057613372803,
"logits/rejected": 1.543897271156311,
"logps/chosen": -179.2456817626953,
"logps/rejected": -218.6804656982422,
"loss": 0.6017,
"rewards/accuracies": 0.38749998807907104,
"rewards/chosen": -0.9957185983657837,
"rewards/margins": 0.3561645448207855,
"rewards/rejected": -1.3518832921981812,
"step": 730
},
{
"epoch": 0.7744636316064888,
"grad_norm": 24.561492093850955,
"learning_rate": 7.338500848029602e-08,
"logits/chosen": 0.7433587908744812,
"logits/rejected": 1.235414981842041,
"logps/chosen": -166.8399200439453,
"logps/rejected": -215.26876831054688,
"loss": 0.6178,
"rewards/accuracies": 0.35624998807907104,
"rewards/chosen": -0.9067522883415222,
"rewards/margins": 0.41579413414001465,
"rewards/rejected": -1.322546362876892,
"step": 740
},
{
"epoch": 0.7849293563579278,
"grad_norm": 28.115986705653192,
"learning_rate": 6.70334495204884e-08,
"logits/chosen": 0.6296231150627136,
"logits/rejected": 1.1273549795150757,
"logps/chosen": -182.21527099609375,
"logps/rejected": -222.38735961914062,
"loss": 0.6104,
"rewards/accuracies": 0.33125001192092896,
"rewards/chosen": -0.9979060292243958,
"rewards/margins": 0.37623411417007446,
"rewards/rejected": -1.3741401433944702,
"step": 750
},
{
"epoch": 0.7953950811093669,
"grad_norm": 25.524078897067774,
"learning_rate": 6.092659210462231e-08,
"logits/chosen": 0.6932498216629028,
"logits/rejected": 1.2560994625091553,
"logps/chosen": -163.45713806152344,
"logps/rejected": -210.75875854492188,
"loss": 0.6428,
"rewards/accuracies": 0.3812499940395355,
"rewards/chosen": -0.8338336944580078,
"rewards/margins": 0.41811808943748474,
"rewards/rejected": -1.2519516944885254,
"step": 760
},
{
"epoch": 0.8058608058608059,
"grad_norm": 19.037858132036448,
"learning_rate": 5.507260361320737e-08,
"logits/chosen": 0.7545400261878967,
"logits/rejected": 1.186693787574768,
"logps/chosen": -139.1665802001953,
"logps/rejected": -173.541748046875,
"loss": 0.5694,
"rewards/accuracies": 0.3187499940395355,
"rewards/chosen": -0.6477771401405334,
"rewards/margins": 0.3577363193035126,
"rewards/rejected": -1.0055135488510132,
"step": 770
},
{
"epoch": 0.8163265306122449,
"grad_norm": 25.539683354526822,
"learning_rate": 4.947931323697982e-08,
"logits/chosen": 0.8985282778739929,
"logits/rejected": 1.1507294178009033,
"logps/chosen": -127.5772933959961,
"logps/rejected": -150.71971130371094,
"loss": 0.6274,
"rewards/accuracies": 0.2562499940395355,
"rewards/chosen": -0.5861515402793884,
"rewards/margins": 0.2556864023208618,
"rewards/rejected": -0.8418378829956055,
"step": 780
},
{
"epoch": 0.826792255363684,
"grad_norm": 29.296410897786252,
"learning_rate": 4.415420150605398e-08,
"logits/chosen": 0.5066564083099365,
"logits/rejected": 1.2835947275161743,
"logps/chosen": -169.21194458007812,
"logps/rejected": -226.9739990234375,
"loss": 0.5972,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7719463109970093,
"rewards/margins": 0.5745865702629089,
"rewards/rejected": -1.346532940864563,
"step": 790
},
{
"epoch": 0.837257980115123,
"grad_norm": 23.071826857167007,
"learning_rate": 3.9104390285376374e-08,
"logits/chosen": 0.2603650689125061,
"logits/rejected": 0.9912735819816589,
"logps/chosen": -189.1995086669922,
"logps/rejected": -238.74069213867188,
"loss": 0.5885,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.8696195483207703,
"rewards/margins": 0.551138162612915,
"rewards/rejected": -1.4207580089569092,
"step": 800
},
{
"epoch": 0.837257980115123,
"eval_logits/chosen": 0.5612532496452332,
"eval_logits/rejected": 1.0127543210983276,
"eval_logps/chosen": -141.69253540039062,
"eval_logps/rejected": -192.48452758789062,
"eval_loss": 0.5973930954933167,
"eval_rewards/accuracies": 0.369047611951828,
"eval_rewards/chosen": -0.672012984752655,
"eval_rewards/margins": 0.4415486454963684,
"eval_rewards/rejected": -1.113561749458313,
"eval_runtime": 113.6177,
"eval_samples_per_second": 17.603,
"eval_steps_per_second": 0.554,
"step": 800
},
{
"epoch": 0.847723704866562,
"grad_norm": 17.313265289048484,
"learning_rate": 3.433663324986208e-08,
"logits/chosen": 0.3017066419124603,
"logits/rejected": 0.7334527373313904,
"logps/chosen": -151.9001007080078,
"logps/rejected": -182.63177490234375,
"loss": 0.613,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.6817248463630676,
"rewards/margins": 0.36351272463798523,
"rewards/rejected": -1.0452375411987305,
"step": 810
},
{
"epoch": 0.858189429618001,
"grad_norm": 32.324000689082936,
"learning_rate": 2.9857306851953897e-08,
"logits/chosen": 0.6850260496139526,
"logits/rejected": 1.2093479633331299,
"logps/chosen": -144.95767211914062,
"logps/rejected": -198.352783203125,
"loss": 0.6,
"rewards/accuracies": 0.3499999940395355,
"rewards/chosen": -0.6805930733680725,
"rewards/margins": 0.44533902406692505,
"rewards/rejected": -1.1259320974349976,
"step": 820
},
{
"epoch": 0.8686551543694401,
"grad_norm": 27.72643919363414,
"learning_rate": 2.567240179368185e-08,
"logits/chosen": -0.07449465245008469,
"logits/rejected": 0.24005027115345,
"logps/chosen": -154.6387176513672,
"logps/rejected": -227.32882690429688,
"loss": 0.6169,
"rewards/accuracies": 0.36250001192092896,
"rewards/chosen": -0.7048233151435852,
"rewards/margins": 0.4948086142539978,
"rewards/rejected": -1.199631929397583,
"step": 830
},
{
"epoch": 0.8791208791208791,
"grad_norm": 23.57031908849105,
"learning_rate": 2.1787515014630357e-08,
"logits/chosen": 0.2434501200914383,
"logits/rejected": 0.7666997313499451,
"logps/chosen": -151.32212829589844,
"logps/rejected": -209.18057250976562,
"loss": 0.5855,
"rewards/accuracies": 0.38749998807907104,
"rewards/chosen": -0.6667734980583191,
"rewards/margins": 0.47117409110069275,
"rewards/rejected": -1.1379475593566895,
"step": 840
},
{
"epoch": 0.8895866038723181,
"grad_norm": 27.822093386580878,
"learning_rate": 1.820784220652766e-08,
"logits/chosen": 0.09173062443733215,
"logits/rejected": 0.7935197949409485,
"logps/chosen": -160.8597869873047,
"logps/rejected": -215.7743377685547,
"loss": 0.5858,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.6746786236763,
"rewards/margins": 0.5673048496246338,
"rewards/rejected": -1.2419836521148682,
"step": 850
},
{
"epoch": 0.9000523286237572,
"grad_norm": 24.993719056225927,
"learning_rate": 1.4938170864468636e-08,
"logits/chosen": -0.022973239421844482,
"logits/rejected": 0.728354811668396,
"logps/chosen": -177.0865936279297,
"logps/rejected": -222.30313110351562,
"loss": 0.587,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.7254469990730286,
"rewards/margins": 0.5123113989830017,
"rewards/rejected": -1.2377583980560303,
"step": 860
},
{
"epoch": 0.9105180533751962,
"grad_norm": 15.152551046729396,
"learning_rate": 1.1982873884064465e-08,
"logits/chosen": 0.3221861720085144,
"logits/rejected": 0.7626439332962036,
"logps/chosen": -138.0549774169922,
"logps/rejected": -176.89402770996094,
"loss": 0.6052,
"rewards/accuracies": 0.39375001192092896,
"rewards/chosen": -0.6149898767471313,
"rewards/margins": 0.41914796829223633,
"rewards/rejected": -1.0341379642486572,
"step": 870
},
{
"epoch": 0.9209837781266352,
"grad_norm": 34.8714524353856,
"learning_rate": 9.345903713082304e-09,
"logits/chosen": 0.6613011360168457,
"logits/rejected": 0.8276697397232056,
"logps/chosen": -124.2752914428711,
"logps/rejected": -161.05874633789062,
"loss": 0.6179,
"rewards/accuracies": 0.3062500059604645,
"rewards/chosen": -0.602181077003479,
"rewards/margins": 0.27649611234664917,
"rewards/rejected": -0.8786771893501282,
"step": 880
},
{
"epoch": 0.9314495028780743,
"grad_norm": 17.42009363611703,
"learning_rate": 7.030787065396865e-09,
"logits/chosen": 0.14433155953884125,
"logits/rejected": 0.636074423789978,
"logps/chosen": -144.4778289794922,
"logps/rejected": -189.1996612548828,
"loss": 0.6092,
"rewards/accuracies": 0.3499999940395355,
"rewards/chosen": -0.6060083508491516,
"rewards/margins": 0.42767366766929626,
"rewards/rejected": -1.033682107925415,
"step": 890
},
{
"epoch": 0.9419152276295133,
"grad_norm": 25.137415279564916,
"learning_rate": 5.04062020432286e-09,
"logits/chosen": 0.5784530639648438,
"logits/rejected": 1.0418832302093506,
"logps/chosen": -118.29461669921875,
"logps/rejected": -145.42251586914062,
"loss": 0.595,
"rewards/accuracies": 0.30000001192092896,
"rewards/chosen": -0.4733172357082367,
"rewards/margins": 0.31973880529403687,
"rewards/rejected": -0.793056070804596,
"step": 900
},
{
"epoch": 0.9419152276295133,
"eval_logits/chosen": 0.4325558543205261,
"eval_logits/rejected": 0.9106192588806152,
"eval_logps/chosen": -136.28819274902344,
"eval_logps/rejected": -189.55056762695312,
"eval_loss": 0.5958317518234253,
"eval_rewards/accuracies": 0.3710317313671112,
"eval_rewards/chosen": -0.6179695725440979,
"eval_rewards/margins": 0.4662524461746216,
"eval_rewards/rejected": -1.0842220783233643,
"eval_runtime": 113.6384,
"eval_samples_per_second": 17.6,
"eval_steps_per_second": 0.554,
"step": 900
},
{
"epoch": 0.9523809523809523,
"grad_norm": 48.333955356090875,
"learning_rate": 3.3780648016376866e-09,
"logits/chosen": -0.08735128492116928,
"logits/rejected": 0.43545690178871155,
"logps/chosen": -137.79881286621094,
"logps/rejected": -196.9468231201172,
"loss": 0.593,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.7070980072021484,
"rewards/margins": 0.4561308026313782,
"rewards/rejected": -1.1632287502288818,
"step": 910
},
{
"epoch": 0.9628466771323915,
"grad_norm": 28.43671889890296,
"learning_rate": 2.0453443778310766e-09,
"logits/chosen": -0.048371605575084686,
"logits/rejected": 0.41192588210105896,
"logps/chosen": -149.82281494140625,
"logps/rejected": -189.8140411376953,
"loss": 0.6088,
"rewards/accuracies": 0.3687500059604645,
"rewards/chosen": -0.6467713713645935,
"rewards/margins": 0.3985586166381836,
"rewards/rejected": -1.0453299283981323,
"step": 920
},
{
"epoch": 0.9733124018838305,
"grad_norm": 37.55350929589785,
"learning_rate": 1.0442413283435758e-09,
"logits/chosen": -0.05256899446249008,
"logits/rejected": 0.6733183860778809,
"logps/chosen": -138.57061767578125,
"logps/rejected": -181.3246307373047,
"loss": 0.5951,
"rewards/accuracies": 0.3687500059604645,
"rewards/chosen": -0.5108522772789001,
"rewards/margins": 0.48942360281944275,
"rewards/rejected": -1.0002758502960205,
"step": 930
},
{
"epoch": 0.9837781266352695,
"grad_norm": 23.109646236441566,
"learning_rate": 3.760945397705828e-10,
"logits/chosen": 0.17422077059745789,
"logits/rejected": 0.798999547958374,
"logps/chosen": -132.8301239013672,
"logps/rejected": -187.94265747070312,
"loss": 0.5961,
"rewards/accuracies": 0.36250001192092896,
"rewards/chosen": -0.599638819694519,
"rewards/margins": 0.4609258770942688,
"rewards/rejected": -1.060564637184143,
"step": 940
},
{
"epoch": 0.9942438513867086,
"grad_norm": 18.08118855716985,
"learning_rate": 4.17975992204056e-11,
"logits/chosen": 0.5573434829711914,
"logits/rejected": 0.9020156860351562,
"logps/chosen": -134.12124633789062,
"logps/rejected": -163.89334106445312,
"loss": 0.618,
"rewards/accuracies": 0.30000001192092896,
"rewards/chosen": -0.6402407288551331,
"rewards/margins": 0.31644412875175476,
"rewards/rejected": -0.9566848874092102,
"step": 950
},
{
"epoch": 0.9994767137624281,
"step": 955,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 0.0268,
"train_samples_per_second": 685440.279,
"train_steps_per_second": 10726.356
}
],
"logging_steps": 10,
"max_steps": 287,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}