{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988571428571429, "eval_steps": 50, "global_step": 437, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022857142857142857, "grad_norm": 6.795239341624469, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.700852632522583, "logits/rejected": -2.6250014305114746, "logps/chosen": -301.27313232421875, "logps/rejected": -281.78619384765625, "loss": 0.6931, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 0.0001308169448748231, "rewards/margins": 0.0004958957433700562, "rewards/rejected": -0.00036507885670289397, "step": 10 }, { "epoch": 0.045714285714285714, "grad_norm": 5.31428372226332, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.6415421962738037, "logits/rejected": -2.606222629547119, "logps/chosen": -278.8970642089844, "logps/rejected": -254.64749145507812, "loss": 0.6924, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0027411712799221277, "rewards/margins": 0.001525188097730279, "rewards/rejected": 0.001215982949361205, "step": 20 }, { "epoch": 0.06857142857142857, "grad_norm": 5.9664481153189435, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.638169765472412, "logits/rejected": -2.617159843444824, "logps/chosen": -263.23223876953125, "logps/rejected": -263.40374755859375, "loss": 0.6883, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.014508177526295185, "rewards/margins": 0.00861530750989914, "rewards/rejected": 0.0058928681537508965, "step": 30 }, { "epoch": 0.09142857142857143, "grad_norm": 6.667336557428276, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.648975372314453, "logits/rejected": -2.585244655609131, "logps/chosen": -290.2044372558594, "logps/rejected": -268.3276062011719, "loss": 0.6785, "rewards/accuracies": 0.59375, "rewards/chosen": 0.037928324192762375, "rewards/margins": 0.044891245663166046, "rewards/rejected": -0.006962914951145649, "step": 40 }, { "epoch": 0.11428571428571428, "grad_norm": 9.813117329804816, "learning_rate": 4.997124959943201e-07, "logits/chosen": -2.6792047023773193, "logits/rejected": -2.5978188514709473, "logps/chosen": -293.65264892578125, "logps/rejected": -254.2649688720703, "loss": 0.6663, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.026576850563287735, "rewards/margins": 0.10058300197124481, "rewards/rejected": -0.07400616258382797, "step": 50 }, { "epoch": 0.11428571428571428, "eval_logits/chosen": -2.541201591491699, "eval_logits/rejected": -2.4377598762512207, "eval_logps/chosen": -276.20166015625, "eval_logps/rejected": -235.61155700683594, "eval_loss": 0.6532372832298279, "eval_rewards/accuracies": 0.6896551847457886, "eval_rewards/chosen": -0.005977254826575518, "eval_rewards/margins": 0.15937723219394684, "eval_rewards/rejected": -0.16535447537899017, "eval_runtime": 91.1786, "eval_samples_per_second": 20.081, "eval_steps_per_second": 0.318, "step": 50 }, { "epoch": 0.13714285714285715, "grad_norm": 8.50170881260791, "learning_rate": 4.979579212164186e-07, "logits/chosen": -2.5797510147094727, "logits/rejected": -2.472832202911377, "logps/chosen": -293.24212646484375, "logps/rejected": -275.13885498046875, "loss": 0.646, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.12736138701438904, "rewards/margins": 0.1385059803724289, "rewards/rejected": -0.2658673822879791, "step": 60 }, { "epoch": 0.16, "grad_norm": 9.027696167666651, "learning_rate": 4.946196886175515e-07, "logits/chosen": -2.5882785320281982, "logits/rejected": -2.539330005645752, "logps/chosen": -293.43145751953125, "logps/rejected": -300.1482849121094, "loss": 0.6244, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17653189599514008, "rewards/margins": 0.22868318855762482, "rewards/rejected": -0.4052151143550873, "step": 70 }, { "epoch": 0.18285714285714286, "grad_norm": 10.603734895101157, "learning_rate": 4.897191188239667e-07, "logits/chosen": -2.623680591583252, "logits/rejected": -2.5742952823638916, "logps/chosen": -285.3603820800781, "logps/rejected": -306.60211181640625, "loss": 0.6123, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.18331322073936462, "rewards/margins": 0.3296189308166504, "rewards/rejected": -0.5129320621490479, "step": 80 }, { "epoch": 0.2057142857142857, "grad_norm": 15.25024463895093, "learning_rate": 4.832875107981763e-07, "logits/chosen": -2.6875650882720947, "logits/rejected": -2.6345021724700928, "logps/chosen": -295.8832092285156, "logps/rejected": -313.13983154296875, "loss": 0.6191, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.19597890973091125, "rewards/margins": 0.37993985414505005, "rewards/rejected": -0.5759187340736389, "step": 90 }, { "epoch": 0.22857142857142856, "grad_norm": 12.34704700331818, "learning_rate": 4.753659419387223e-07, "logits/chosen": -2.679297685623169, "logits/rejected": -2.5944952964782715, "logps/chosen": -330.1031799316406, "logps/rejected": -318.5290832519531, "loss": 0.6051, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.43716782331466675, "rewards/margins": 0.4136085510253906, "rewards/rejected": -0.8507764935493469, "step": 100 }, { "epoch": 0.22857142857142856, "eval_logits/chosen": -2.493269920349121, "eval_logits/rejected": -2.3746213912963867, "eval_logps/chosen": -356.7496643066406, "eval_logps/rejected": -346.2107238769531, "eval_loss": 0.612766683101654, "eval_rewards/accuracies": 0.7112069129943848, "eval_rewards/chosen": -0.8114572167396545, "eval_rewards/margins": 0.4598887860774994, "eval_rewards/rejected": -1.2713459730148315, "eval_runtime": 90.1893, "eval_samples_per_second": 20.302, "eval_steps_per_second": 0.322, "step": 100 }, { "epoch": 0.25142857142857145, "grad_norm": 13.527645485553371, "learning_rate": 4.660050057270191e-07, "logits/chosen": -2.269178628921509, "logits/rejected": -2.168506622314453, "logps/chosen": -387.71820068359375, "logps/rejected": -411.88427734375, "loss": 0.5873, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7141460180282593, "rewards/margins": 0.4256005883216858, "rewards/rejected": -1.1397466659545898, "step": 110 }, { "epoch": 0.2742857142857143, "grad_norm": 17.046687026346753, "learning_rate": 4.5526448859687144e-07, "logits/chosen": -1.293348789215088, "logits/rejected": -0.927165687084198, "logps/chosen": -381.55316162109375, "logps/rejected": -354.0766906738281, "loss": 0.5755, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7064869403839111, "rewards/margins": 0.5171381831169128, "rewards/rejected": -1.2236251831054688, "step": 120 }, { "epoch": 0.29714285714285715, "grad_norm": 19.945418074044913, "learning_rate": 4.432129880904388e-07, "logits/chosen": -0.14520399272441864, "logits/rejected": 0.31017133593559265, "logps/chosen": -394.09820556640625, "logps/rejected": -395.47674560546875, "loss": 0.5448, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8894233703613281, "rewards/margins": 0.5606414675712585, "rewards/rejected": -1.4500648975372314, "step": 130 }, { "epoch": 0.32, "grad_norm": 23.269894199893105, "learning_rate": 4.299274747394055e-07, "logits/chosen": 0.3922499716281891, "logits/rejected": 0.7626418471336365, "logps/chosen": -402.1969299316406, "logps/rejected": -436.99725341796875, "loss": 0.5611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8674923777580261, "rewards/margins": 0.783365786075592, "rewards/rejected": -1.6508581638336182, "step": 140 }, { "epoch": 0.34285714285714286, "grad_norm": 18.255646968547413, "learning_rate": 4.1549280046953653e-07, "logits/chosen": -0.056426752358675, "logits/rejected": 0.6437274813652039, "logps/chosen": -360.7496032714844, "logps/rejected": -432.40399169921875, "loss": 0.5375, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7376368641853333, "rewards/margins": 0.8234134912490845, "rewards/rejected": -1.5610501766204834, "step": 150 }, { "epoch": 0.34285714285714286, "eval_logits/chosen": 0.20309801399707794, "eval_logits/rejected": 1.3727048635482788, "eval_logps/chosen": -358.7465515136719, "eval_logps/rejected": -413.1859436035156, "eval_loss": 0.5486596822738647, "eval_rewards/accuracies": 0.767241358757019, "eval_rewards/chosen": -0.8314265012741089, "eval_rewards/margins": 1.1096714735031128, "eval_rewards/rejected": -1.9410980939865112, "eval_runtime": 90.1892, "eval_samples_per_second": 20.302, "eval_steps_per_second": 0.322, "step": 150 }, { "epoch": 0.3657142857142857, "grad_norm": 20.04208599875873, "learning_rate": 4.000011566683401e-07, "logits/chosen": 0.4694085121154785, "logits/rejected": 1.3121615648269653, "logps/chosen": -412.69488525390625, "logps/rejected": -459.99188232421875, "loss": 0.548, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1335456371307373, "rewards/margins": 0.9099456071853638, "rewards/rejected": -2.0434913635253906, "step": 160 }, { "epoch": 0.38857142857142857, "grad_norm": 22.516250506361718, "learning_rate": 3.8355148537705047e-07, "logits/chosen": 0.1457391083240509, "logits/rejected": 0.8692816495895386, "logps/chosen": -395.64947509765625, "logps/rejected": -417.6402282714844, "loss": 0.5469, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9154456257820129, "rewards/margins": 0.6036463379859924, "rewards/rejected": -1.5190918445587158, "step": 170 }, { "epoch": 0.4114285714285714, "grad_norm": 20.567923807377685, "learning_rate": 3.662488473675315e-07, "logits/chosen": 0.6181103587150574, "logits/rejected": 1.7128187417984009, "logps/chosen": -436.68780517578125, "logps/rejected": -469.717041015625, "loss": 0.5551, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.065079927444458, "rewards/margins": 1.0325844287872314, "rewards/rejected": -2.0976643562316895, "step": 180 }, { "epoch": 0.4342857142857143, "grad_norm": 20.909674986872478, "learning_rate": 3.48203751140067e-07, "logits/chosen": 1.2501403093338013, "logits/rejected": 2.2078864574432373, "logps/chosen": -380.656982421875, "logps/rejected": -409.70556640625, "loss": 0.5412, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0439938306808472, "rewards/margins": 0.7080799341201782, "rewards/rejected": -1.7520736455917358, "step": 190 }, { "epoch": 0.45714285714285713, "grad_norm": 21.67729434989596, "learning_rate": 3.2953144712759537e-07, "logits/chosen": 0.7689538598060608, "logits/rejected": 1.9063518047332764, "logps/chosen": -359.4909362792969, "logps/rejected": -411.184814453125, "loss": 0.5435, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9114822149276733, "rewards/margins": 0.91156005859375, "rewards/rejected": -1.8230421543121338, "step": 200 }, { "epoch": 0.45714285714285713, "eval_logits/chosen": 1.3441277742385864, "eval_logits/rejected": 2.707573652267456, "eval_logps/chosen": -374.5489501953125, "eval_logps/rejected": -426.7857971191406, "eval_loss": 0.5358834266662598, "eval_rewards/accuracies": 0.7543103694915771, "eval_rewards/chosen": -0.9894503355026245, "eval_rewards/margins": 1.087646245956421, "eval_rewards/rejected": -2.077096462249756, "eval_runtime": 90.1648, "eval_samples_per_second": 20.307, "eval_steps_per_second": 0.322, "step": 200 }, { "epoch": 0.48, "grad_norm": 19.28945802551147, "learning_rate": 3.103511916141658e-07, "logits/chosen": 1.5224826335906982, "logits/rejected": 2.394577741622925, "logps/chosen": -385.7353210449219, "logps/rejected": -451.604248046875, "loss": 0.5372, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2228174209594727, "rewards/margins": 0.8404253125190735, "rewards/rejected": -2.0632426738739014, "step": 210 }, { "epoch": 0.5028571428571429, "grad_norm": 24.47080032118637, "learning_rate": 2.9078548506882117e-07, "logits/chosen": 1.5350468158721924, "logits/rejected": 2.541968822479248, "logps/chosen": -425.51287841796875, "logps/rejected": -466.1084899902344, "loss": 0.5604, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4044690132141113, "rewards/margins": 0.794781506061554, "rewards/rejected": -2.1992506980895996, "step": 220 }, { "epoch": 0.5257142857142857, "grad_norm": 20.61426463626924, "learning_rate": 2.709592897595191e-07, "logits/chosen": 1.438730001449585, "logits/rejected": 2.638312816619873, "logps/chosen": -390.794189453125, "logps/rejected": -433.10406494140625, "loss": 0.5311, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0973504781723022, "rewards/margins": 0.8408235311508179, "rewards/rejected": -1.9381740093231201, "step": 230 }, { "epoch": 0.5485714285714286, "grad_norm": 31.905445593128672, "learning_rate": 2.509992316440332e-07, "logits/chosen": 1.2066385746002197, "logits/rejected": 2.3449177742004395, "logps/chosen": -413.14825439453125, "logps/rejected": -506.625, "loss": 0.5256, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.1443836688995361, "rewards/margins": 1.2076470851898193, "rewards/rejected": -2.3520307540893555, "step": 240 }, { "epoch": 0.5714285714285714, "grad_norm": 18.97837160736367, "learning_rate": 2.3103279163519918e-07, "logits/chosen": 0.9885716438293457, "logits/rejected": 1.7852414846420288, "logps/chosen": -384.52496337890625, "logps/rejected": -472.253662109375, "loss": 0.5433, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0389870405197144, "rewards/margins": 0.9815452694892883, "rewards/rejected": -2.0205321311950684, "step": 250 }, { "epoch": 0.5714285714285714, "eval_logits/chosen": 0.858768880367279, "eval_logits/rejected": 2.412114381790161, "eval_logps/chosen": -365.6370544433594, "eval_logps/rejected": -425.7062683105469, "eval_loss": 0.528998613357544, "eval_rewards/accuracies": 0.7629310488700867, "eval_rewards/chosen": -0.9003310203552246, "eval_rewards/margins": 1.1659703254699707, "eval_rewards/rejected": -2.0663013458251953, "eval_runtime": 90.3653, "eval_samples_per_second": 20.262, "eval_steps_per_second": 0.321, "step": 250 }, { "epoch": 0.5942857142857143, "grad_norm": 21.94464499251825, "learning_rate": 2.1118749140573358e-07, "logits/chosen": 1.5066580772399902, "logits/rejected": 2.079137086868286, "logps/chosen": -411.3843688964844, "logps/rejected": -482.978515625, "loss": 0.5408, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3159770965576172, "rewards/margins": 0.7803784608840942, "rewards/rejected": -2.096355438232422, "step": 260 }, { "epoch": 0.6171428571428571, "grad_norm": 23.287724561115347, "learning_rate": 1.9159007893272703e-07, "logits/chosen": 1.869363784790039, "logits/rejected": 3.169628620147705, "logps/chosen": -400.696533203125, "logps/rejected": -456.28155517578125, "loss": 0.517, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2817071676254272, "rewards/margins": 0.9760338664054871, "rewards/rejected": -2.2577412128448486, "step": 270 }, { "epoch": 0.64, "grad_norm": 26.1145325639797, "learning_rate": 1.7236571898357766e-07, "logits/chosen": 2.085681438446045, "logits/rejected": 2.909884214401245, "logps/chosen": -402.3949890136719, "logps/rejected": -493.7689514160156, "loss": 0.5287, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2935690879821777, "rewards/margins": 1.0130523443222046, "rewards/rejected": -2.3066213130950928, "step": 280 }, { "epoch": 0.6628571428571428, "grad_norm": 28.3817297395316, "learning_rate": 1.5363719371356882e-07, "logits/chosen": 1.904044508934021, "logits/rejected": 2.7162575721740723, "logps/chosen": -424.409912109375, "logps/rejected": -482.04913330078125, "loss": 0.5285, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2350399494171143, "rewards/margins": 0.9299663305282593, "rewards/rejected": -2.165006399154663, "step": 290 }, { "epoch": 0.6857142857142857, "grad_norm": 17.85129410356221, "learning_rate": 1.3552411848071565e-07, "logits/chosen": 1.697782278060913, "logits/rejected": 3.180041551589966, "logps/chosen": -419.85028076171875, "logps/rejected": -478.419677734375, "loss": 0.5194, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1794109344482422, "rewards/margins": 1.078364610671997, "rewards/rejected": -2.2577755451202393, "step": 300 }, { "epoch": 0.6857142857142857, "eval_logits/chosen": 1.7084869146347046, "eval_logits/rejected": 3.311720132827759, "eval_logps/chosen": -371.6744689941406, "eval_logps/rejected": -439.6499938964844, "eval_loss": 0.5213173031806946, "eval_rewards/accuracies": 0.7715517282485962, "eval_rewards/chosen": -0.9607052206993103, "eval_rewards/margins": 1.2450333833694458, "eval_rewards/rejected": -2.2057385444641113, "eval_runtime": 89.9422, "eval_samples_per_second": 20.358, "eval_steps_per_second": 0.322, "step": 300 }, { "epoch": 0.7085714285714285, "grad_norm": 22.76802438882901, "learning_rate": 1.1814217788631473e-07, "logits/chosen": 1.900792121887207, "logits/rejected": 2.7918269634246826, "logps/chosen": -372.843994140625, "logps/rejected": -442.9312438964844, "loss": 0.5285, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1653985977172852, "rewards/margins": 0.8919604420661926, "rewards/rejected": -2.057358980178833, "step": 310 }, { "epoch": 0.7314285714285714, "grad_norm": 19.73975657149685, "learning_rate": 1.0160238692045331e-07, "logits/chosen": 2.1896469593048096, "logits/rejected": 2.8715972900390625, "logps/chosen": -380.424560546875, "logps/rejected": -454.2293395996094, "loss": 0.536, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3614219427108765, "rewards/margins": 0.7709897756576538, "rewards/rejected": -2.132411479949951, "step": 320 }, { "epoch": 0.7542857142857143, "grad_norm": 29.56922781200817, "learning_rate": 8.601038193139438e-08, "logits/chosen": 1.6053155660629272, "logits/rejected": 2.692516565322876, "logps/chosen": -416.57342529296875, "logps/rejected": -465.4991760253906, "loss": 0.5313, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1735525131225586, "rewards/margins": 1.003560185432434, "rewards/rejected": -2.177112579345703, "step": 330 }, { "epoch": 0.7771428571428571, "grad_norm": 18.098670935967576, "learning_rate": 7.146574594727572e-08, "logits/chosen": 2.0766067504882812, "logits/rejected": 2.8303616046905518, "logps/chosen": -387.4620361328125, "logps/rejected": -468.67718505859375, "loss": 0.5193, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2151105403900146, "rewards/margins": 1.0514241456985474, "rewards/rejected": -2.2665345668792725, "step": 340 }, { "epoch": 0.8, "grad_norm": 20.794164513921476, "learning_rate": 5.8061372659157306e-08, "logits/chosen": 1.6319509744644165, "logits/rejected": 2.7972917556762695, "logps/chosen": -412.102783203125, "logps/rejected": -458.27191162109375, "loss": 0.5325, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2239553928375244, "rewards/margins": 0.8157873153686523, "rewards/rejected": -2.0397427082061768, "step": 350 }, { "epoch": 0.8, "eval_logits/chosen": 2.0842368602752686, "eval_logits/rejected": 3.6707816123962402, "eval_logps/chosen": -389.46490478515625, "eval_logps/rejected": -456.7085266113281, "eval_loss": 0.5216463804244995, "eval_rewards/accuracies": 0.7629310488700867, "eval_rewards/chosen": -1.1386092901229858, "eval_rewards/margins": 1.237714409828186, "eval_rewards/rejected": -2.3763234615325928, "eval_runtime": 89.8616, "eval_samples_per_second": 20.376, "eval_steps_per_second": 0.323, "step": 350 }, { "epoch": 0.8228571428571428, "grad_norm": 18.184259604484346, "learning_rate": 4.5882873127531614e-08, "logits/chosen": 1.648209810256958, "logits/rejected": 2.9181623458862305, "logps/chosen": -407.1295166015625, "logps/rejected": -477.27447509765625, "loss": 0.5146, "rewards/accuracies": 0.78125, "rewards/chosen": -1.217245101928711, "rewards/margins": 1.044634222984314, "rewards/rejected": -2.2618794441223145, "step": 360 }, { "epoch": 0.8457142857142858, "grad_norm": 19.108285818305696, "learning_rate": 3.500802900154412e-08, "logits/chosen": 1.801898717880249, "logits/rejected": 3.196338176727295, "logps/chosen": -383.25311279296875, "logps/rejected": -463.01727294921875, "loss": 0.5188, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1219167709350586, "rewards/margins": 1.1241002082824707, "rewards/rejected": -2.2460172176361084, "step": 370 }, { "epoch": 0.8685714285714285, "grad_norm": 23.620382836684982, "learning_rate": 2.550629574310309e-08, "logits/chosen": 1.4818474054336548, "logits/rejected": 2.90739107131958, "logps/chosen": -453.0061950683594, "logps/rejected": -476.94830322265625, "loss": 0.5263, "rewards/accuracies": 0.71875, "rewards/chosen": -1.349498987197876, "rewards/margins": 0.8459898233413696, "rewards/rejected": -2.195488691329956, "step": 380 }, { "epoch": 0.8914285714285715, "grad_norm": 22.14239335519297, "learning_rate": 1.7438359028687983e-08, "logits/chosen": 1.8351167440414429, "logits/rejected": 2.6260292530059814, "logps/chosen": -425.75128173828125, "logps/rejected": -503.3841857910156, "loss": 0.5275, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1437828540802002, "rewards/margins": 0.9423319697380066, "rewards/rejected": -2.0861151218414307, "step": 390 }, { "epoch": 0.9142857142857143, "grad_norm": 32.206706951444914, "learning_rate": 1.0855747162029361e-08, "logits/chosen": 2.132110357284546, "logits/rejected": 2.6392226219177246, "logps/chosen": -411.29962158203125, "logps/rejected": -477.0232849121094, "loss": 0.5483, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3256638050079346, "rewards/margins": 0.7788330316543579, "rewards/rejected": -2.104496955871582, "step": 400 }, { "epoch": 0.9142857142857143, "eval_logits/chosen": 2.117452621459961, "eval_logits/rejected": 3.7050397396087646, "eval_logps/chosen": -386.83795166015625, "eval_logps/rejected": -455.0307312011719, "eval_loss": 0.520908772945404, "eval_rewards/accuracies": 0.767241358757019, "eval_rewards/chosen": -1.112339973449707, "eval_rewards/margins": 1.2472059726715088, "eval_rewards/rejected": -2.3595457077026367, "eval_runtime": 90.8703, "eval_samples_per_second": 20.15, "eval_steps_per_second": 0.319, "step": 400 }, { "epoch": 0.9371428571428572, "grad_norm": 21.9038704574243, "learning_rate": 5.8005019731033615e-09, "logits/chosen": 1.9021247625350952, "logits/rejected": 2.9709084033966064, "logps/chosen": -423.39990234375, "logps/rejected": -478.46929931640625, "loss": 0.5184, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3521995544433594, "rewards/margins": 0.8351926803588867, "rewards/rejected": -2.187392234802246, "step": 410 }, { "epoch": 0.96, "grad_norm": 21.54473906200769, "learning_rate": 2.3049103053431886e-09, "logits/chosen": 1.8090896606445312, "logits/rejected": 3.297045946121216, "logps/chosen": -384.42333984375, "logps/rejected": -458.969482421875, "loss": 0.522, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0167523622512817, "rewards/margins": 1.2330738306045532, "rewards/rejected": -2.249825954437256, "step": 420 }, { "epoch": 0.9828571428571429, "grad_norm": 22.41955699037185, "learning_rate": 3.9129780600541397e-10, "logits/chosen": 2.2351975440979004, "logits/rejected": 3.178173065185547, "logps/chosen": -401.39642333984375, "logps/rejected": -481.4127502441406, "loss": 0.5214, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.191645622253418, "rewards/margins": 0.9857856631278992, "rewards/rejected": -2.177431344985962, "step": 430 }, { "epoch": 0.9988571428571429, "step": 437, "total_flos": 0.0, "train_loss": 0.5630035629534339, "train_runtime": 11387.5716, "train_samples_per_second": 4.918, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 437, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }