{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022935779816513763, "grad_norm": 8.114700597292016, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.630662441253662, "logits/rejected": -2.588312864303589, "logps/chosen": -251.37826538085938, "logps/rejected": -245.56118774414062, "loss": 0.6932, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": 0.0003000783617608249, "rewards/margins": 3.4782169677782804e-05, "rewards/rejected": 0.00026529619935899973, "step": 10 }, { "epoch": 0.045871559633027525, "grad_norm": 9.2292553026982, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.6437935829162598, "logits/rejected": -2.5945396423339844, "logps/chosen": -305.3244323730469, "logps/rejected": -288.41082763671875, "loss": 0.692, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0019361503655090928, "rewards/margins": 0.0025582597590982914, "rewards/rejected": -0.0006221095682121813, "step": 20 }, { "epoch": 0.06880733944954129, "grad_norm": 8.362484362577634, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.6338467597961426, "logits/rejected": -2.5890305042266846, "logps/chosen": -290.099365234375, "logps/rejected": -311.11444091796875, "loss": 0.6875, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.012083572335541248, "rewards/margins": 0.01316638570278883, "rewards/rejected": -0.0010828140657395124, "step": 30 }, { "epoch": 0.09174311926605505, "grad_norm": 8.520101675940221, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.6183691024780273, "logits/rejected": -2.586637258529663, "logps/chosen": -266.4432067871094, "logps/rejected": -251.4285125732422, "loss": 0.6773, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.022834882140159607, "rewards/margins": 0.023432452231645584, "rewards/rejected": -0.0005975713720545173, "step": 40 }, { "epoch": 0.11467889908256881, "grad_norm": 9.078671253466315, "learning_rate": 4.997110275491701e-07, "logits/chosen": -2.6266233921051025, "logits/rejected": -2.5520858764648438, "logps/chosen": -304.1348571777344, "logps/rejected": -262.32061767578125, "loss": 0.6617, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.002302716951817274, "rewards/margins": 0.0754440575838089, "rewards/rejected": -0.07314133644104004, "step": 50 }, { "epoch": 0.11467889908256881, "eval_logits/chosen": -2.561208486557007, "eval_logits/rejected": -2.4839041233062744, "eval_logps/chosen": -288.2777099609375, "eval_logps/rejected": -261.73126220703125, "eval_loss": 0.6429266333580017, "eval_rewards/accuracies": 0.6982758641242981, "eval_rewards/chosen": -0.031876176595687866, "eval_rewards/margins": 0.12021197378635406, "eval_rewards/rejected": -0.15208815038204193, "eval_runtime": 95.5992, "eval_samples_per_second": 19.017, "eval_steps_per_second": 0.303, "step": 50 }, { "epoch": 0.13761467889908258, "grad_norm": 11.596073380531864, "learning_rate": 4.979475034558115e-07, "logits/chosen": -2.547043561935425, "logits/rejected": -2.4718384742736816, "logps/chosen": -296.81402587890625, "logps/rejected": -309.17828369140625, "loss": 0.6408, "rewards/accuracies": 0.625, "rewards/chosen": -0.10427121073007584, "rewards/margins": 0.10880544036626816, "rewards/rejected": -0.2130766659975052, "step": 60 }, { "epoch": 0.16055045871559634, "grad_norm": 14.345685973653804, "learning_rate": 4.945923025551788e-07, "logits/chosen": -2.57369065284729, "logits/rejected": -2.5374178886413574, "logps/chosen": -286.39215087890625, "logps/rejected": -286.45330810546875, "loss": 0.6181, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.11706429719924927, "rewards/margins": 0.20492573082447052, "rewards/rejected": -0.321990042924881, "step": 70 }, { "epoch": 0.1834862385321101, "grad_norm": 15.011490045096238, "learning_rate": 4.896669632591651e-07, "logits/chosen": -2.5018064975738525, "logits/rejected": -2.4190878868103027, "logps/chosen": -328.02960205078125, "logps/rejected": -310.5951843261719, "loss": 0.6089, "rewards/accuracies": 0.71875, "rewards/chosen": -0.22072939574718475, "rewards/margins": 0.22173993289470673, "rewards/rejected": -0.44246941804885864, "step": 80 }, { "epoch": 0.20642201834862386, "grad_norm": 12.064286648165384, "learning_rate": 4.832031033425662e-07, "logits/chosen": -2.316053628921509, "logits/rejected": -2.2170588970184326, "logps/chosen": -309.4535217285156, "logps/rejected": -293.6477966308594, "loss": 0.6033, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.34011608362197876, "rewards/margins": 0.24611127376556396, "rewards/rejected": -0.5862273573875427, "step": 90 }, { "epoch": 0.22935779816513763, "grad_norm": 18.008124319426113, "learning_rate": 4.752422169756047e-07, "logits/chosen": -2.297001838684082, "logits/rejected": -2.1643242835998535, "logps/chosen": -329.64617919921875, "logps/rejected": -325.0845947265625, "loss": 0.5671, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3444996774196625, "rewards/margins": 0.4821701645851135, "rewards/rejected": -0.8266698718070984, "step": 100 }, { "epoch": 0.22935779816513763, "eval_logits/chosen": -1.9024137258529663, "eval_logits/rejected": -1.719163417816162, "eval_logps/chosen": -347.71966552734375, "eval_logps/rejected": -351.6959533691406, "eval_loss": 0.5741031765937805, "eval_rewards/accuracies": 0.6896551847457886, "eval_rewards/chosen": -0.6262954473495483, "eval_rewards/margins": 0.4254392087459564, "eval_rewards/rejected": -1.0517346858978271, "eval_runtime": 94.9312, "eval_samples_per_second": 19.151, "eval_steps_per_second": 0.305, "step": 100 }, { "epoch": 0.25229357798165136, "grad_norm": 18.209810501109782, "learning_rate": 4.658354083558188e-07, "logits/chosen": -1.37660813331604, "logits/rejected": -1.3109476566314697, "logps/chosen": -306.15673828125, "logps/rejected": -364.8440856933594, "loss": 0.5638, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5968034267425537, "rewards/margins": 0.4451308250427246, "rewards/rejected": -1.0419342517852783, "step": 110 }, { "epoch": 0.27522935779816515, "grad_norm": 30.619035412465948, "learning_rate": 4.550430636492389e-07, "logits/chosen": -0.5667544007301331, "logits/rejected": -0.11496512591838837, "logps/chosen": -352.6363830566406, "logps/rejected": -321.98138427734375, "loss": 0.5429, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5322494506835938, "rewards/margins": 0.4666845202445984, "rewards/rejected": -0.9989339709281921, "step": 120 }, { "epoch": 0.2981651376146789, "grad_norm": 25.16692077643166, "learning_rate": 4.429344633468004e-07, "logits/chosen": 0.31375107169151306, "logits/rejected": 0.7534220814704895, "logps/chosen": -333.3154602050781, "logps/rejected": -383.91571044921875, "loss": 0.5276, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.756668746471405, "rewards/margins": 0.7230855226516724, "rewards/rejected": -1.479754090309143, "step": 130 }, { "epoch": 0.3211009174311927, "grad_norm": 21.8628335306802, "learning_rate": 4.2958733752443187e-07, "logits/chosen": 0.12007780373096466, "logits/rejected": 0.32281678915023804, "logps/chosen": -341.0147705078125, "logps/rejected": -368.9652404785156, "loss": 0.5337, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6602333188056946, "rewards/margins": 0.6314884424209595, "rewards/rejected": -1.2917217016220093, "step": 140 }, { "epoch": 0.3440366972477064, "grad_norm": 26.496344548145927, "learning_rate": 4.150873668617898e-07, "logits/chosen": 0.6530567407608032, "logits/rejected": 1.3690438270568848, "logps/chosen": -375.08856201171875, "logps/rejected": -408.30181884765625, "loss": 0.5328, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8673305511474609, "rewards/margins": 0.7217624187469482, "rewards/rejected": -1.5890929698944092, "step": 150 }, { "epoch": 0.3440366972477064, "eval_logits/chosen": -0.10693416744470596, "eval_logits/rejected": 0.6796554327011108, "eval_logps/chosen": -340.38214111328125, "eval_logps/rejected": -367.83489990234375, "eval_loss": 0.5237244367599487, "eval_rewards/accuracies": 0.7155172228813171, "eval_rewards/chosen": -0.5529204607009888, "eval_rewards/margins": 0.6602039933204651, "eval_rewards/rejected": -1.2131245136260986, "eval_runtime": 95.5215, "eval_samples_per_second": 19.032, "eval_steps_per_second": 0.304, "step": 150 }, { "epoch": 0.3669724770642202, "grad_norm": 22.090085860189564, "learning_rate": 3.9952763262280397e-07, "logits/chosen": -0.6242966651916504, "logits/rejected": 0.1017322987318039, "logps/chosen": -337.90997314453125, "logps/rejected": -389.50592041015625, "loss": 0.5376, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4958871901035309, "rewards/margins": 0.6162235140800476, "rewards/rejected": -1.1121107339859009, "step": 160 }, { "epoch": 0.38990825688073394, "grad_norm": 29.49090849875655, "learning_rate": 3.8300801912883414e-07, "logits/chosen": 0.3189530670642853, "logits/rejected": 0.7786465287208557, "logps/chosen": -349.08013916015625, "logps/rejected": -422.4642639160156, "loss": 0.5223, "rewards/accuracies": 0.6875, "rewards/chosen": -0.786384642124176, "rewards/margins": 0.660692036151886, "rewards/rejected": -1.4470767974853516, "step": 170 }, { "epoch": 0.41284403669724773, "grad_norm": 21.619610594454734, "learning_rate": 3.6563457256020884e-07, "logits/chosen": 0.5451809167861938, "logits/rejected": 1.3867518901824951, "logps/chosen": -417.93450927734375, "logps/rejected": -453.64581298828125, "loss": 0.5262, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0858782529830933, "rewards/margins": 0.799088180065155, "rewards/rejected": -1.8849666118621826, "step": 180 }, { "epoch": 0.43577981651376146, "grad_norm": 22.879012626203956, "learning_rate": 3.475188202022617e-07, "logits/chosen": 0.8325613141059875, "logits/rejected": 1.527114748954773, "logps/chosen": -372.27227783203125, "logps/rejected": -433.6874084472656, "loss": 0.5396, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9668887853622437, "rewards/margins": 0.787070095539093, "rewards/rejected": -1.7539589405059814, "step": 190 }, { "epoch": 0.45871559633027525, "grad_norm": 19.71033831580365, "learning_rate": 3.287770545059052e-07, "logits/chosen": 0.25724777579307556, "logits/rejected": 1.0539257526397705, "logps/chosen": -339.7835388183594, "logps/rejected": -395.4685363769531, "loss": 0.5339, "rewards/accuracies": 0.75, "rewards/chosen": -0.7703949809074402, "rewards/margins": 0.7419241070747375, "rewards/rejected": -1.5123189687728882, "step": 200 }, { "epoch": 0.45871559633027525, "eval_logits/chosen": 0.1333327293395996, "eval_logits/rejected": 1.1592669486999512, "eval_logps/chosen": -372.03033447265625, "eval_logps/rejected": -410.098388671875, "eval_loss": 0.5135313272476196, "eval_rewards/accuracies": 0.7284482717514038, "eval_rewards/chosen": -0.8694021701812744, "eval_rewards/margins": 0.7663572430610657, "eval_rewards/rejected": -1.6357594728469849, "eval_runtime": 94.9597, "eval_samples_per_second": 19.145, "eval_steps_per_second": 0.305, "step": 200 }, { "epoch": 0.481651376146789, "grad_norm": 24.166340722703165, "learning_rate": 3.0952958655864954e-07, "logits/chosen": -0.02151186764240265, "logits/rejected": 0.7990398406982422, "logps/chosen": -386.0631408691406, "logps/rejected": -439.3416442871094, "loss": 0.5165, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.859545111656189, "rewards/margins": 0.791865348815918, "rewards/rejected": -1.651410460472107, "step": 210 }, { "epoch": 0.5045871559633027, "grad_norm": 29.831111320846208, "learning_rate": 2.898999737583448e-07, "logits/chosen": 0.28069013357162476, "logits/rejected": 0.9275471568107605, "logps/chosen": -352.3778076171875, "logps/rejected": -410.13238525390625, "loss": 0.5119, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7737405300140381, "rewards/margins": 0.7621678113937378, "rewards/rejected": -1.5359084606170654, "step": 220 }, { "epoch": 0.5275229357798165, "grad_norm": 27.12612279862232, "learning_rate": 2.7001422664752333e-07, "logits/chosen": 0.31911998987197876, "logits/rejected": 1.4982731342315674, "logps/chosen": -380.7137145996094, "logps/rejected": -428.25030517578125, "loss": 0.5326, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8266562223434448, "rewards/margins": 0.8594551086425781, "rewards/rejected": -1.6861114501953125, "step": 230 }, { "epoch": 0.5504587155963303, "grad_norm": 30.270434326846676, "learning_rate": 2.5e-07, "logits/chosen": 0.08341093361377716, "logits/rejected": 0.8582345843315125, "logps/chosen": -345.74591064453125, "logps/rejected": -403.52093505859375, "loss": 0.5195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8471673727035522, "rewards/margins": 0.6442986726760864, "rewards/rejected": -1.4914662837982178, "step": 240 }, { "epoch": 0.573394495412844, "grad_norm": 24.62506692170475, "learning_rate": 2.2998577335247667e-07, "logits/chosen": 0.6792846918106079, "logits/rejected": 1.6594598293304443, "logps/chosen": -357.1456298828125, "logps/rejected": -407.39794921875, "loss": 0.5206, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9902165532112122, "rewards/margins": 0.7551018595695496, "rewards/rejected": -1.7453181743621826, "step": 250 }, { "epoch": 0.573394495412844, "eval_logits/chosen": 1.0596588850021362, "eval_logits/rejected": 2.125216007232666, "eval_logps/chosen": -399.8708190917969, "eval_logps/rejected": -448.6092529296875, "eval_loss": 0.5051079988479614, "eval_rewards/accuracies": 0.7456896305084229, "eval_rewards/chosen": -1.1478071212768555, "eval_rewards/margins": 0.8730602860450745, "eval_rewards/rejected": -2.020867347717285, "eval_runtime": 95.375, "eval_samples_per_second": 19.062, "eval_steps_per_second": 0.304, "step": 250 }, { "epoch": 0.5963302752293578, "grad_norm": 25.512119180845332, "learning_rate": 2.1010002624165524e-07, "logits/chosen": 1.0334103107452393, "logits/rejected": 1.9036356210708618, "logps/chosen": -393.27923583984375, "logps/rejected": -442.31634521484375, "loss": 0.5225, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1329892873764038, "rewards/margins": 0.6801810264587402, "rewards/rejected": -1.8131701946258545, "step": 260 }, { "epoch": 0.6192660550458715, "grad_norm": 22.529726863175615, "learning_rate": 1.9047041344135043e-07, "logits/chosen": 0.06802092492580414, "logits/rejected": 0.6059505343437195, "logps/chosen": -379.96771240234375, "logps/rejected": -420.07244873046875, "loss": 0.5157, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.902859091758728, "rewards/margins": 0.6471339464187622, "rewards/rejected": -1.5499929189682007, "step": 270 }, { "epoch": 0.6422018348623854, "grad_norm": 29.61705869960123, "learning_rate": 1.7122294549409482e-07, "logits/chosen": 0.36484724283218384, "logits/rejected": 1.1159727573394775, "logps/chosen": -378.2865905761719, "logps/rejected": -446.976318359375, "loss": 0.5034, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9187177419662476, "rewards/margins": 0.7557224035263062, "rewards/rejected": -1.6744401454925537, "step": 280 }, { "epoch": 0.6651376146788991, "grad_norm": 32.55159666302868, "learning_rate": 1.524811797977383e-07, "logits/chosen": 0.4420185089111328, "logits/rejected": 1.4995023012161255, "logps/chosen": -382.71185302734375, "logps/rejected": -434.3760681152344, "loss": 0.5136, "rewards/accuracies": 0.75, "rewards/chosen": -0.9455305933952332, "rewards/margins": 0.9165040254592896, "rewards/rejected": -1.862034559249878, "step": 290 }, { "epoch": 0.6880733944954128, "grad_norm": 23.09866796839295, "learning_rate": 1.3436542743979125e-07, "logits/chosen": 0.40113481879234314, "logits/rejected": 1.5598547458648682, "logps/chosen": -423.24615478515625, "logps/rejected": -433.6683044433594, "loss": 0.5161, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1671854257583618, "rewards/margins": 0.6783391237258911, "rewards/rejected": -1.845524549484253, "step": 300 }, { "epoch": 0.6880733944954128, "eval_logits/chosen": 0.468977689743042, "eval_logits/rejected": 1.6077337265014648, "eval_logps/chosen": -392.01226806640625, "eval_logps/rejected": -439.55352783203125, "eval_loss": 0.4995412528514862, "eval_rewards/accuracies": 0.7413793206214905, "eval_rewards/chosen": -1.0692216157913208, "eval_rewards/margins": 0.8610891103744507, "eval_rewards/rejected": -1.9303104877471924, "eval_runtime": 95.9852, "eval_samples_per_second": 18.94, "eval_steps_per_second": 0.302, "step": 300 }, { "epoch": 0.7110091743119266, "grad_norm": 23.93680772701649, "learning_rate": 1.1699198087116588e-07, "logits/chosen": 0.4794091284275055, "logits/rejected": 1.1960773468017578, "logps/chosen": -373.32708740234375, "logps/rejected": -457.6255798339844, "loss": 0.4872, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9246217012405396, "rewards/margins": 0.8957231640815735, "rewards/rejected": -1.8203446865081787, "step": 310 }, { "epoch": 0.7339449541284404, "grad_norm": 33.75818947226454, "learning_rate": 1.00472367377196e-07, "logits/chosen": 0.50605309009552, "logits/rejected": 1.3813035488128662, "logps/chosen": -395.261474609375, "logps/rejected": -462.6114196777344, "loss": 0.4904, "rewards/accuracies": 0.75, "rewards/chosen": -1.1317824125289917, "rewards/margins": 0.8041917085647583, "rewards/rejected": -1.93597412109375, "step": 320 }, { "epoch": 0.7568807339449541, "grad_norm": 26.64078079133333, "learning_rate": 8.49126331382102e-08, "logits/chosen": 0.7998399138450623, "logits/rejected": 1.5950431823730469, "logps/chosen": -358.0443420410156, "logps/rejected": -450.14898681640625, "loss": 0.4977, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0693919658660889, "rewards/margins": 0.917527973651886, "rewards/rejected": -1.9869201183319092, "step": 330 }, { "epoch": 0.7798165137614679, "grad_norm": 25.809036872331053, "learning_rate": 7.041266247556812e-08, "logits/chosen": 0.4978507161140442, "logits/rejected": 1.611681580543518, "logps/chosen": -386.6365661621094, "logps/rejected": -486.8155212402344, "loss": 0.4861, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9396215677261353, "rewards/margins": 1.1603189706802368, "rewards/rejected": -2.099940538406372, "step": 340 }, { "epoch": 0.8027522935779816, "grad_norm": 25.432021542271517, "learning_rate": 5.706553665319955e-08, "logits/chosen": 0.37401479482650757, "logits/rejected": 1.426959753036499, "logps/chosen": -393.84173583984375, "logps/rejected": -441.216796875, "loss": 0.5113, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0162795782089233, "rewards/margins": 0.8663312792778015, "rewards/rejected": -1.8826109170913696, "step": 350 }, { "epoch": 0.8027522935779816, "eval_logits/chosen": 0.28273966908454895, "eval_logits/rejected": 1.4599332809448242, "eval_logps/chosen": -384.1430969238281, "eval_logps/rejected": -436.3081359863281, "eval_loss": 0.4952593147754669, "eval_rewards/accuracies": 0.7284482717514038, "eval_rewards/chosen": -0.9905301332473755, "eval_rewards/margins": 0.9073269963264465, "eval_rewards/rejected": -1.8978571891784668, "eval_runtime": 98.0468, "eval_samples_per_second": 18.542, "eval_steps_per_second": 0.296, "step": 350 }, { "epoch": 0.8256880733944955, "grad_norm": 24.878023970166044, "learning_rate": 4.4956936350761005e-08, "logits/chosen": 0.011499330401420593, "logits/rejected": 1.0976569652557373, "logps/chosen": -381.95184326171875, "logps/rejected": -447.99334716796875, "loss": 0.4998, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8815473318099976, "rewards/margins": 0.9280586242675781, "rewards/rejected": -1.8096059560775757, "step": 360 }, { "epoch": 0.8486238532110092, "grad_norm": 27.48602383662752, "learning_rate": 3.416459164418123e-08, "logits/chosen": 0.2708785831928253, "logits/rejected": 0.9552985429763794, "logps/chosen": -397.5840148925781, "logps/rejected": -466.92999267578125, "loss": 0.4821, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9426911473274231, "rewards/margins": 0.8755933046340942, "rewards/rejected": -1.8182843923568726, "step": 370 }, { "epoch": 0.8715596330275229, "grad_norm": 22.77493294832679, "learning_rate": 2.475778302439524e-08, "logits/chosen": 0.3190244436264038, "logits/rejected": 1.6189016103744507, "logps/chosen": -393.47637939453125, "logps/rejected": -460.20428466796875, "loss": 0.5039, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0399200916290283, "rewards/margins": 0.9450514912605286, "rewards/rejected": -1.9849714040756226, "step": 380 }, { "epoch": 0.8944954128440367, "grad_norm": 21.22789089972862, "learning_rate": 1.6796896657433805e-08, "logits/chosen": 0.4111763834953308, "logits/rejected": 1.2173652648925781, "logps/chosen": -375.0528869628906, "logps/rejected": -447.7088317871094, "loss": 0.5199, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.9053140878677368, "rewards/margins": 0.9649880528450012, "rewards/rejected": -1.8703022003173828, "step": 390 }, { "epoch": 0.9174311926605505, "grad_norm": 25.945198405726252, "learning_rate": 1.0333036740834855e-08, "logits/chosen": 0.3489355742931366, "logits/rejected": 1.733727216720581, "logps/chosen": -385.9686584472656, "logps/rejected": -407.83514404296875, "loss": 0.5006, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9153203964233398, "rewards/margins": 0.9544361233711243, "rewards/rejected": -1.8697564601898193, "step": 400 }, { "epoch": 0.9174311926605505, "eval_logits/chosen": 0.33653396368026733, "eval_logits/rejected": 1.545719861984253, "eval_logps/chosen": -384.1485290527344, "eval_logps/rejected": -438.1469421386719, "eval_loss": 0.49427667260169983, "eval_rewards/accuracies": 0.732758641242981, "eval_rewards/chosen": -0.9905844330787659, "eval_rewards/margins": 0.9256603717803955, "eval_rewards/rejected": -1.9162448644638062, "eval_runtime": 95.1799, "eval_samples_per_second": 19.101, "eval_steps_per_second": 0.305, "step": 400 }, { "epoch": 0.9403669724770642, "grad_norm": 29.414671515939542, "learning_rate": 5.4076974448211685e-09, "logits/chosen": 0.3652718663215637, "logits/rejected": 1.4336962699890137, "logps/chosen": -407.9559326171875, "logps/rejected": -456.4903259277344, "loss": 0.486, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9434130787849426, "rewards/margins": 0.8941701650619507, "rewards/rejected": -1.837583303451538, "step": 410 }, { "epoch": 0.963302752293578, "grad_norm": 23.283884603445415, "learning_rate": 2.052496544188487e-09, "logits/chosen": 0.2882634997367859, "logits/rejected": 1.3258743286132812, "logps/chosen": -375.098876953125, "logps/rejected": -462.24639892578125, "loss": 0.4823, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0388104915618896, "rewards/margins": 0.9280673861503601, "rewards/rejected": -1.9668779373168945, "step": 420 }, { "epoch": 0.9862385321100917, "grad_norm": 32.32447531053772, "learning_rate": 2.889724508297886e-10, "logits/chosen": 0.6452657580375671, "logits/rejected": 1.3607791662216187, "logps/chosen": -378.0513916015625, "logps/rejected": -468.52301025390625, "loss": 0.504, "rewards/accuracies": 0.75, "rewards/chosen": -1.1194813251495361, "rewards/margins": 0.9433539509773254, "rewards/rejected": -2.062835216522217, "step": 430 }, { "epoch": 1.0, "step": 436, "total_flos": 0.0, "train_loss": 0.5444534902178914, "train_runtime": 11753.731, "train_samples_per_second": 4.744, "train_steps_per_second": 0.037 } ], "logging_steps": 10, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }