{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9900896649362907, "eval_steps": 500, "global_step": 396, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 36.62869333113681, "learning_rate": 1.25e-08, "logps/chosen": -45.61046600341797, "logps/rejected": -53.26762390136719, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 1.611132025718689, "losses/total": 0.6931471824645996, "ref_logps/chosen": -45.61046600341797, "ref_logps/rejected": -53.26762390136719, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 42.54885701113561, "learning_rate": 2.5e-08, "logps/chosen": -41.86249542236328, "logps/rejected": -46.448211669921875, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 1.5374380350112915, "losses/total": 0.6931471824645996, "ref_logps/chosen": -41.86249542236328, "ref_logps/rejected": -46.448211669921875, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.02, "grad_norm": 39.00442507634352, "learning_rate": 3.75e-08, "logps/chosen": -45.93993377685547, "logps/rejected": -51.82826614379883, "loss": 0.6911, "losses/dpo": 0.6855990290641785, "losses/sft": 1.6630754470825195, "losses/total": 0.6855990290641785, "ref_logps/chosen": -45.96686935424805, "ref_logps/rejected": -51.811519622802734, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.0026936656795442104, "rewards/margins": 0.0043686856515705585, "rewards/rejected": -0.0016750202048569918, "step": 3 }, { "epoch": 0.03, "grad_norm": 26.85932008129627, "learning_rate": 5e-08, "logps/chosen": -42.3924560546875, "logps/rejected": -48.834312438964844, "loss": 0.6914, "losses/dpo": 0.6885390877723694, "losses/sft": 1.7307008504867554, "losses/total": 0.6885390877723694, "ref_logps/chosen": -42.41312789916992, "ref_logps/rejected": -48.81687545776367, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0020673051476478577, "rewards/margins": 0.003811263246461749, "rewards/rejected": -0.0017439575167372823, "step": 4 }, { "epoch": 0.04, "grad_norm": 27.143923382855128, "learning_rate": 6.25e-08, "logps/chosen": -41.313140869140625, "logps/rejected": -47.7628173828125, "loss": 0.6928, "losses/dpo": 0.6861088275909424, "losses/sft": 1.5048630237579346, "losses/total": 0.6861088275909424, "ref_logps/chosen": -41.3216667175293, "ref_logps/rejected": -47.76300048828125, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.0008526199380867183, "rewards/margins": 0.0008343784138560295, "rewards/rejected": 1.8241582438349724e-05, "step": 5 }, { "epoch": 0.05, "grad_norm": 28.003490160385883, "learning_rate": 7.5e-08, "logps/chosen": -43.02016067504883, "logps/rejected": -48.82761764526367, "loss": 0.6937, "losses/dpo": 0.6989647150039673, "losses/sft": 1.5936439037322998, "losses/total": 0.6989647150039673, "ref_logps/chosen": -43.02461242675781, "ref_logps/rejected": -48.84097671508789, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.0004450226842891425, "rewards/margins": -0.0008908101008273661, "rewards/rejected": 0.0013358332216739655, "step": 6 }, { "epoch": 0.05, "grad_norm": 34.50235061147626, "learning_rate": 8.75e-08, "logps/chosen": -45.01873016357422, "logps/rejected": -49.4879264831543, "loss": 0.6943, "losses/dpo": 0.6984776258468628, "losses/sft": 1.502335548400879, "losses/total": 0.6984776258468628, "ref_logps/chosen": -45.0099983215332, "ref_logps/rejected": -49.500762939453125, "rewards/accuracies": 0.4765625, "rewards/chosen": -0.0008732563583180308, "rewards/margins": -0.002156583359465003, "rewards/rejected": 0.0012833268847316504, "step": 7 }, { "epoch": 0.06, "grad_norm": 30.70052702392896, "learning_rate": 1e-07, "logps/chosen": -44.85755920410156, "logps/rejected": -48.57131576538086, "loss": 0.6936, "losses/dpo": 0.6966960430145264, "losses/sft": 1.7092387676239014, "losses/total": 0.6966960430145264, "ref_logps/chosen": -44.8560905456543, "ref_logps/rejected": -48.575496673583984, "rewards/accuracies": 0.515625, "rewards/chosen": -0.00014672009274363518, "rewards/margins": -0.00056473194854334, "rewards/rejected": 0.00041801203042268753, "step": 8 }, { "epoch": 0.07, "grad_norm": 33.16879367731749, "learning_rate": 1.125e-07, "logps/chosen": -35.955230712890625, "logps/rejected": -46.44932556152344, "loss": 0.6948, "losses/dpo": 0.6989741325378418, "losses/sft": 1.477859616279602, "losses/total": 0.6989741325378418, "ref_logps/chosen": -35.95939636230469, "ref_logps/rejected": -46.48387908935547, "rewards/accuracies": 0.453125, "rewards/chosen": 0.0004162658005952835, "rewards/margins": -0.003039113013073802, "rewards/rejected": 0.003455378580838442, "step": 9 }, { "epoch": 0.08, "grad_norm": 50.63241601482298, "learning_rate": 1.25e-07, "logps/chosen": -47.975284576416016, "logps/rejected": -53.06822967529297, "loss": 0.695, "losses/dpo": 0.69514000415802, "losses/sft": 1.6934856176376343, "losses/total": 0.69514000415802, "ref_logps/chosen": -47.96134567260742, "ref_logps/rejected": -53.08888626098633, "rewards/accuracies": 0.4296875, "rewards/chosen": -0.001393341924995184, "rewards/margins": -0.003458950901404023, "rewards/rejected": 0.002065609209239483, "step": 10 }, { "epoch": 0.08, "grad_norm": 30.253970893581076, "learning_rate": 1.375e-07, "logps/chosen": -42.77811050415039, "logps/rejected": -47.56971740722656, "loss": 0.6935, "losses/dpo": 0.6940985321998596, "losses/sft": 1.6380270719528198, "losses/total": 0.6940985321998596, "ref_logps/chosen": -42.81279754638672, "ref_logps/rejected": -47.608062744140625, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0034681265242397785, "rewards/margins": -0.0003662836679723114, "rewards/rejected": 0.003834410337731242, "step": 11 }, { "epoch": 0.09, "grad_norm": 27.952400827978398, "learning_rate": 1.5e-07, "logps/chosen": -42.66459655761719, "logps/rejected": -51.37031173706055, "loss": 0.6931, "losses/dpo": 0.6945158243179321, "losses/sft": 1.6248236894607544, "losses/total": 0.6945158243179321, "ref_logps/chosen": -42.65010070800781, "ref_logps/rejected": -51.35215759277344, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014495223294943571, "rewards/margins": 0.00036603002808988094, "rewards/rejected": -0.0018155521247535944, "step": 12 }, { "epoch": 0.1, "grad_norm": 40.83336397601219, "learning_rate": 1.625e-07, "logps/chosen": -47.69675827026367, "logps/rejected": -53.5283203125, "loss": 0.6919, "losses/dpo": 0.6918787956237793, "losses/sft": 1.78733491897583, "losses/total": 0.6918787956237793, "ref_logps/chosen": -47.74595260620117, "ref_logps/rejected": -53.549530029296875, "rewards/accuracies": 0.5, "rewards/chosen": 0.004919426515698433, "rewards/margins": 0.002798424568027258, "rewards/rejected": 0.002121001947671175, "step": 13 }, { "epoch": 0.11, "grad_norm": 27.578090909874792, "learning_rate": 1.75e-07, "logps/chosen": -41.67103576660156, "logps/rejected": -48.25772476196289, "loss": 0.6924, "losses/dpo": 0.6962527632713318, "losses/sft": 1.7687420845031738, "losses/total": 0.6962527632713318, "ref_logps/chosen": -41.667564392089844, "ref_logps/rejected": -48.236515045166016, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.0003475874837022275, "rewards/margins": 0.0017732965061441064, "rewards/rejected": -0.002120884135365486, "step": 14 }, { "epoch": 0.11, "grad_norm": 28.741690779816427, "learning_rate": 1.875e-07, "logps/chosen": -43.92478942871094, "logps/rejected": -49.33970260620117, "loss": 0.6957, "losses/dpo": 0.7017595171928406, "losses/sft": 1.6299105882644653, "losses/total": 0.7017595171928406, "ref_logps/chosen": -43.88625717163086, "ref_logps/rejected": -49.34822463989258, "rewards/accuracies": 0.453125, "rewards/chosen": -0.003853556467220187, "rewards/margins": -0.0047055985778570175, "rewards/rejected": 0.0008520419942215085, "step": 15 }, { "epoch": 0.12, "grad_norm": 32.30543117981525, "learning_rate": 2e-07, "logps/chosen": -42.70464324951172, "logps/rejected": -47.57340621948242, "loss": 0.6941, "losses/dpo": 0.6879873871803284, "losses/sft": 1.699347734451294, "losses/total": 0.6879873871803284, "ref_logps/chosen": -42.67546463012695, "ref_logps/rejected": -47.56122589111328, "rewards/accuracies": 0.484375, "rewards/chosen": -0.002917766571044922, "rewards/margins": -0.0016999886138364673, "rewards/rejected": -0.0012177781900390983, "step": 16 }, { "epoch": 0.13, "grad_norm": 28.63281166682985, "learning_rate": 2.1249999999999998e-07, "logps/chosen": -41.62788391113281, "logps/rejected": -51.75229263305664, "loss": 0.6926, "losses/dpo": 0.6939769983291626, "losses/sft": 1.6408178806304932, "losses/total": 0.6939769983291626, "ref_logps/chosen": -41.612945556640625, "ref_logps/rejected": -51.722015380859375, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.0014935210347175598, "rewards/margins": 0.0015340042300522327, "rewards/rejected": -0.00302752573043108, "step": 17 }, { "epoch": 0.14, "grad_norm": 36.3280812943497, "learning_rate": 2.25e-07, "logps/chosen": -45.48798751831055, "logps/rejected": -51.9184455871582, "loss": 0.6934, "losses/dpo": 0.6971999406814575, "losses/sft": 1.5770013332366943, "losses/total": 0.6971999406814575, "ref_logps/chosen": -45.48552322387695, "ref_logps/rejected": -51.91908264160156, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.00024626130471006036, "rewards/margins": -0.0003098389133810997, "rewards/rejected": 6.357775419019163e-05, "step": 18 }, { "epoch": 0.14, "grad_norm": 28.786810535348106, "learning_rate": 2.3749999999999998e-07, "logps/chosen": -42.64775466918945, "logps/rejected": -50.45472717285156, "loss": 0.6914, "losses/dpo": 0.6857149600982666, "losses/sft": 1.5922735929489136, "losses/total": 0.6857149600982666, "ref_logps/chosen": -42.64891815185547, "ref_logps/rejected": -50.41812515258789, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.00011651660315692425, "rewards/margins": 0.003776898607611656, "rewards/rejected": -0.0036603824701160192, "step": 19 }, { "epoch": 0.15, "grad_norm": 35.068203163344556, "learning_rate": 2.5e-07, "logps/chosen": -47.786109924316406, "logps/rejected": -54.04900360107422, "loss": 0.6928, "losses/dpo": 0.7033498287200928, "losses/sft": 1.6538355350494385, "losses/total": 0.7033498287200928, "ref_logps/chosen": -47.77252197265625, "ref_logps/rejected": -54.026241302490234, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.0013585876440629363, "rewards/margins": 0.0009179831249639392, "rewards/rejected": -0.002276570536196232, "step": 20 }, { "epoch": 0.16, "grad_norm": 43.33971320402607, "learning_rate": 2.625e-07, "logps/chosen": -42.39413070678711, "logps/rejected": -46.84844970703125, "loss": 0.6927, "losses/dpo": 0.6936322450637817, "losses/sft": 1.5914438962936401, "losses/total": 0.6936322450637817, "ref_logps/chosen": -42.36817169189453, "ref_logps/rejected": -46.81178665161133, "rewards/accuracies": 0.5, "rewards/chosen": -0.0025956789031624794, "rewards/margins": 0.0010705376043915749, "rewards/rejected": -0.0036662165075540543, "step": 21 }, { "epoch": 0.17, "grad_norm": 30.844607036268723, "learning_rate": 2.75e-07, "logps/chosen": -42.504478454589844, "logps/rejected": -50.62303924560547, "loss": 0.6892, "losses/dpo": 0.6893997192382812, "losses/sft": 1.7737653255462646, "losses/total": 0.6893997192382812, "ref_logps/chosen": -42.531009674072266, "ref_logps/rejected": -50.56878662109375, "rewards/accuracies": 0.625, "rewards/chosen": 0.0026530339382588863, "rewards/margins": 0.008078843355178833, "rewards/rejected": -0.005425809882581234, "step": 22 }, { "epoch": 0.17, "grad_norm": 60.152706494875815, "learning_rate": 2.8749999999999995e-07, "logps/chosen": -44.01787567138672, "logps/rejected": -50.38100814819336, "loss": 0.6954, "losses/dpo": 0.691292405128479, "losses/sft": 1.4661805629730225, "losses/total": 0.691292405128479, "ref_logps/chosen": -43.98109817504883, "ref_logps/rejected": -50.386802673339844, "rewards/accuracies": 0.4609375, "rewards/chosen": -0.0036771753802895546, "rewards/margins": -0.004256190732121468, "rewards/rejected": 0.0005790154682472348, "step": 23 }, { "epoch": 0.18, "grad_norm": 35.45751349259834, "learning_rate": 3e-07, "logps/chosen": -45.31752395629883, "logps/rejected": -53.34571838378906, "loss": 0.6936, "losses/dpo": 0.6907269358634949, "losses/sft": 1.7325340509414673, "losses/total": 0.6907269358634949, "ref_logps/chosen": -45.26490783691406, "ref_logps/rejected": -53.29972839355469, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.0052614156156778336, "rewards/margins": -0.0006628208793699741, "rewards/rejected": -0.004598594270646572, "step": 24 }, { "epoch": 0.19, "grad_norm": 33.901573981343496, "learning_rate": 3.1249999999999997e-07, "logps/chosen": -42.9231071472168, "logps/rejected": -50.88954544067383, "loss": 0.6928, "losses/dpo": 0.6915858387947083, "losses/sft": 1.6041566133499146, "losses/total": 0.6915858387947083, "ref_logps/chosen": -42.87569046020508, "ref_logps/rejected": -50.831993103027344, "rewards/accuracies": 0.4609375, "rewards/chosen": -0.0047416831366717815, "rewards/margins": 0.0010139658115804195, "rewards/rejected": -0.005755649879574776, "step": 25 }, { "epoch": 0.2, "grad_norm": 48.83007887751364, "learning_rate": 3.25e-07, "logps/chosen": -47.005882263183594, "logps/rejected": -53.044517517089844, "loss": 0.6904, "losses/dpo": 0.6913453936576843, "losses/sft": 1.683605670928955, "losses/total": 0.6913453936576843, "ref_logps/chosen": -46.938358306884766, "ref_logps/rejected": -52.9183349609375, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.006752187851816416, "rewards/margins": 0.0058664847165346146, "rewards/rejected": -0.012618672102689743, "step": 26 }, { "epoch": 0.2, "grad_norm": 27.121079423094894, "learning_rate": 3.375e-07, "logps/chosen": -41.84714889526367, "logps/rejected": -50.05345153808594, "loss": 0.6911, "losses/dpo": 0.6876205801963806, "losses/sft": 1.42634117603302, "losses/total": 0.6876205801963806, "ref_logps/chosen": -41.82341003417969, "ref_logps/rejected": -49.98626708984375, "rewards/accuracies": 0.578125, "rewards/chosen": -0.0023737833835184574, "rewards/margins": 0.004344451241195202, "rewards/rejected": -0.006718234624713659, "step": 27 }, { "epoch": 0.21, "grad_norm": 46.49682946441541, "learning_rate": 3.5e-07, "logps/chosen": -45.08810043334961, "logps/rejected": -51.73465347290039, "loss": 0.6908, "losses/dpo": 0.6922367811203003, "losses/sft": 1.5698598623275757, "losses/total": 0.6922367811203003, "ref_logps/chosen": -45.04021453857422, "ref_logps/rejected": -51.63726043701172, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.004788544494658709, "rewards/margins": 0.0049506318755447865, "rewards/rejected": -0.009739176370203495, "step": 28 }, { "epoch": 0.22, "grad_norm": 37.01702048881829, "learning_rate": 3.6249999999999997e-07, "logps/chosen": -41.04375076293945, "logps/rejected": -48.00986862182617, "loss": 0.6927, "losses/dpo": 0.6946566104888916, "losses/sft": 1.500718593597412, "losses/total": 0.6946566104888916, "ref_logps/chosen": -41.03290557861328, "ref_logps/rejected": -47.987754821777344, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.0010843857889994979, "rewards/margins": 0.0011270248796790838, "rewards/rejected": -0.0022114107850939035, "step": 29 }, { "epoch": 0.23, "grad_norm": 32.62375662873659, "learning_rate": 3.75e-07, "logps/chosen": -47.39645767211914, "logps/rejected": -51.44966506958008, "loss": 0.6915, "losses/dpo": 0.6964210271835327, "losses/sft": 1.6298069953918457, "losses/total": 0.6964210271835327, "ref_logps/chosen": -47.29740905761719, "ref_logps/rejected": -51.31504821777344, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.009905603714287281, "rewards/margins": 0.0035561914555728436, "rewards/rejected": -0.013461794704198837, "step": 30 }, { "epoch": 0.23, "grad_norm": 30.43849818213995, "learning_rate": 3.875e-07, "logps/chosen": -43.67304992675781, "logps/rejected": -47.03245162963867, "loss": 0.6908, "losses/dpo": 0.686834454536438, "losses/sft": 1.6353853940963745, "losses/total": 0.686834454536438, "ref_logps/chosen": -43.62676239013672, "ref_logps/rejected": -46.93518829345703, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.004628042224794626, "rewards/margins": 0.005098389461636543, "rewards/rejected": -0.009726430289447308, "step": 31 }, { "epoch": 0.24, "grad_norm": 61.8544371472138, "learning_rate": 4e-07, "logps/chosen": -45.38051223754883, "logps/rejected": -55.16764831542969, "loss": 0.6901, "losses/dpo": 0.6919147968292236, "losses/sft": 1.6552284955978394, "losses/total": 0.6919147968292236, "ref_logps/chosen": -45.261253356933594, "ref_logps/rejected": -54.984039306640625, "rewards/accuracies": 0.59375, "rewards/chosen": -0.011925606057047844, "rewards/margins": 0.0064354524947702885, "rewards/rejected": -0.018361059948801994, "step": 32 }, { "epoch": 0.25, "grad_norm": 31.01419653261721, "learning_rate": 4.1249999999999997e-07, "logps/chosen": -43.31538009643555, "logps/rejected": -50.30097198486328, "loss": 0.691, "losses/dpo": 0.688191831111908, "losses/sft": 1.5828959941864014, "losses/total": 0.688191831111908, "ref_logps/chosen": -43.17654800415039, "ref_logps/rejected": -50.11687469482422, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.013883114792406559, "rewards/margins": 0.004527126904577017, "rewards/rejected": -0.01841024123132229, "step": 33 }, { "epoch": 0.26, "grad_norm": 35.01965206830056, "learning_rate": 4.2499999999999995e-07, "logps/chosen": -42.3648681640625, "logps/rejected": -54.2373046875, "loss": 0.6906, "losses/dpo": 0.6907436847686768, "losses/sft": 1.4812008142471313, "losses/total": 0.6907436847686768, "ref_logps/chosen": -42.26384735107422, "ref_logps/rejected": -54.082027435302734, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.010102039203047752, "rewards/margins": 0.005425349343568087, "rewards/rejected": -0.015527388080954552, "step": 34 }, { "epoch": 0.26, "grad_norm": 42.23881769572077, "learning_rate": 4.375e-07, "logps/chosen": -44.468162536621094, "logps/rejected": -52.50397872924805, "loss": 0.687, "losses/dpo": 0.6782665848731995, "losses/sft": 1.6573729515075684, "losses/total": 0.6782665848731995, "ref_logps/chosen": -44.36711883544922, "ref_logps/rejected": -52.27581024169922, "rewards/accuracies": 0.671875, "rewards/chosen": -0.01010376587510109, "rewards/margins": 0.012713316828012466, "rewards/rejected": -0.022817078977823257, "step": 35 }, { "epoch": 0.27, "grad_norm": 26.700732932432807, "learning_rate": 4.5e-07, "logps/chosen": -44.475521087646484, "logps/rejected": -50.47951889038086, "loss": 0.6889, "losses/dpo": 0.6929667592048645, "losses/sft": 1.500726580619812, "losses/total": 0.6929667592048645, "ref_logps/chosen": -44.35942840576172, "ref_logps/rejected": -50.275177001953125, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.011609955690801144, "rewards/margins": 0.008824457414448261, "rewards/rejected": -0.020434413105249405, "step": 36 }, { "epoch": 0.28, "grad_norm": 27.61858224618308, "learning_rate": 4.625e-07, "logps/chosen": -43.92258834838867, "logps/rejected": -48.82867431640625, "loss": 0.6877, "losses/dpo": 0.6838952898979187, "losses/sft": 1.6169151067733765, "losses/total": 0.6838952898979187, "ref_logps/chosen": -43.79357147216797, "ref_logps/rejected": -48.58594512939453, "rewards/accuracies": 0.625, "rewards/chosen": -0.01290148589760065, "rewards/margins": 0.01137150265276432, "rewards/rejected": -0.024272989481687546, "step": 37 }, { "epoch": 0.29, "grad_norm": 33.719746144096646, "learning_rate": 4.7499999999999995e-07, "logps/chosen": -42.070594787597656, "logps/rejected": -47.68130111694336, "loss": 0.6882, "losses/dpo": 0.6955613493919373, "losses/sft": 1.6116496324539185, "losses/total": 0.6955613493919373, "ref_logps/chosen": -41.907012939453125, "ref_logps/rejected": -47.41321563720703, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.016358288004994392, "rewards/margins": 0.010450540110468864, "rewards/rejected": -0.026808828115463257, "step": 38 }, { "epoch": 0.29, "grad_norm": 28.619582221306466, "learning_rate": 4.875e-07, "logps/chosen": -39.9019660949707, "logps/rejected": -46.925193786621094, "loss": 0.6862, "losses/dpo": 0.6819075345993042, "losses/sft": 1.469438076019287, "losses/total": 0.6819075345993042, "ref_logps/chosen": -39.801448822021484, "ref_logps/rejected": -46.680477142333984, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.010051984339952469, "rewards/margins": 0.01442030631005764, "rewards/rejected": -0.024472292512655258, "step": 39 }, { "epoch": 0.3, "grad_norm": 33.234080277590806, "learning_rate": 5e-07, "logps/chosen": -41.56800842285156, "logps/rejected": -50.79865264892578, "loss": 0.6853, "losses/dpo": 0.6727162003517151, "losses/sft": 1.5431392192840576, "losses/total": 0.6727162003517151, "ref_logps/chosen": -41.45555877685547, "ref_logps/rejected": -50.52134323120117, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.011244192719459534, "rewards/margins": 0.01648637279868126, "rewards/rejected": -0.027730563655495644, "step": 40 }, { "epoch": 0.31, "grad_norm": 30.640267946291875, "learning_rate": 4.985955056179775e-07, "logps/chosen": -48.3279914855957, "logps/rejected": -53.94540786743164, "loss": 0.6841, "losses/dpo": 0.6733967661857605, "losses/sft": 1.5284626483917236, "losses/total": 0.6733967661857605, "ref_logps/chosen": -48.16774368286133, "ref_logps/rejected": -53.59696960449219, "rewards/accuracies": 0.671875, "rewards/chosen": -0.016024738550186157, "rewards/margins": 0.018818693235516548, "rewards/rejected": -0.034843433648347855, "step": 41 }, { "epoch": 0.32, "grad_norm": 34.92574816226864, "learning_rate": 4.97191011235955e-07, "logps/chosen": -42.43712615966797, "logps/rejected": -48.71431350708008, "loss": 0.689, "losses/dpo": 0.6903843879699707, "losses/sft": 1.396918535232544, "losses/total": 0.6903843879699707, "ref_logps/chosen": -42.228118896484375, "ref_logps/rejected": -48.41496658325195, "rewards/accuracies": 0.59375, "rewards/chosen": -0.020901169627904892, "rewards/margins": 0.009032947942614555, "rewards/rejected": -0.029934115707874298, "step": 42 }, { "epoch": 0.32, "grad_norm": 33.22674015475124, "learning_rate": 4.957865168539325e-07, "logps/chosen": -47.800575256347656, "logps/rejected": -52.92036437988281, "loss": 0.6853, "losses/dpo": 0.6714830994606018, "losses/sft": 1.4566082954406738, "losses/total": 0.6714830994606018, "ref_logps/chosen": -47.58274841308594, "ref_logps/rejected": -52.53615951538086, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.021782677620649338, "rewards/margins": 0.016637606546282768, "rewards/rejected": -0.03842028230428696, "step": 43 }, { "epoch": 0.33, "grad_norm": 27.946919148116255, "learning_rate": 4.943820224719101e-07, "logps/chosen": -45.53565216064453, "logps/rejected": -48.98249816894531, "loss": 0.6852, "losses/dpo": 0.6921014189720154, "losses/sft": 1.7539501190185547, "losses/total": 0.6921014189720154, "ref_logps/chosen": -45.313297271728516, "ref_logps/rejected": -48.59336471557617, "rewards/accuracies": 0.609375, "rewards/chosen": -0.02223561517894268, "rewards/margins": 0.016677962616086006, "rewards/rejected": -0.038913577795028687, "step": 44 }, { "epoch": 0.34, "grad_norm": 52.217701396260416, "learning_rate": 4.929775280898877e-07, "logps/chosen": -49.74474334716797, "logps/rejected": -53.14861297607422, "loss": 0.6884, "losses/dpo": 0.6738002896308899, "losses/sft": 1.830601692199707, "losses/total": 0.6738002896308899, "ref_logps/chosen": -49.402587890625, "ref_logps/rejected": -52.700096130371094, "rewards/accuracies": 0.625, "rewards/chosen": -0.03421623632311821, "rewards/margins": 0.010634900070726871, "rewards/rejected": -0.044851139187812805, "step": 45 }, { "epoch": 0.35, "grad_norm": 38.73518237743624, "learning_rate": 4.915730337078651e-07, "logps/chosen": -43.27875518798828, "logps/rejected": -49.50944900512695, "loss": 0.6812, "losses/dpo": 0.6921982765197754, "losses/sft": 1.5264118909835815, "losses/total": 0.6921982765197754, "ref_logps/chosen": -43.157135009765625, "ref_logps/rejected": -49.134822845458984, "rewards/accuracies": 0.609375, "rewards/chosen": -0.012161856517195702, "rewards/margins": 0.025300683453679085, "rewards/rejected": -0.037462539970874786, "step": 46 }, { "epoch": 0.35, "grad_norm": 40.37558403892761, "learning_rate": 4.901685393258427e-07, "logps/chosen": -39.27291488647461, "logps/rejected": -46.131248474121094, "loss": 0.6804, "losses/dpo": 0.6849299073219299, "losses/sft": 1.6318694353103638, "losses/total": 0.6849299073219299, "ref_logps/chosen": -39.05021667480469, "ref_logps/rejected": -45.63941192626953, "rewards/accuracies": 0.5625, "rewards/chosen": -0.022269288077950478, "rewards/margins": 0.02691444754600525, "rewards/rejected": -0.049183737486600876, "step": 47 }, { "epoch": 0.36, "grad_norm": 42.28625216499906, "learning_rate": 4.887640449438202e-07, "logps/chosen": -47.6192741394043, "logps/rejected": -54.06623458862305, "loss": 0.6727, "losses/dpo": 0.6601410508155823, "losses/sft": 1.5543663501739502, "losses/total": 0.6601410508155823, "ref_logps/chosen": -47.447486877441406, "ref_logps/rejected": -53.463829040527344, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.01717934012413025, "rewards/margins": 0.04306170716881752, "rewards/rejected": -0.06024104729294777, "step": 48 }, { "epoch": 0.37, "grad_norm": 27.412767032270754, "learning_rate": 4.873595505617978e-07, "logps/chosen": -45.029903411865234, "logps/rejected": -50.96942138671875, "loss": 0.6821, "losses/dpo": 0.6688118577003479, "losses/sft": 1.7650786638259888, "losses/total": 0.6688118577003479, "ref_logps/chosen": -44.74635314941406, "ref_logps/rejected": -50.44926452636719, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.028354784473776817, "rewards/margins": 0.023661181330680847, "rewards/rejected": -0.052015963941812515, "step": 49 }, { "epoch": 0.38, "grad_norm": 31.933558413809592, "learning_rate": 4.859550561797752e-07, "logps/chosen": -42.08510208129883, "logps/rejected": -51.267513275146484, "loss": 0.6813, "losses/dpo": 0.6841184496879578, "losses/sft": 1.7661519050598145, "losses/total": 0.6841184496879578, "ref_logps/chosen": -41.938785552978516, "ref_logps/rejected": -50.863258361816406, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.014631946571171284, "rewards/margins": 0.025793185457587242, "rewards/rejected": -0.0404251329600811, "step": 50 }, { "epoch": 0.39, "grad_norm": 28.54466589478064, "learning_rate": 4.845505617977528e-07, "logps/chosen": -37.07756805419922, "logps/rejected": -48.509490966796875, "loss": 0.6777, "losses/dpo": 0.6732208132743835, "losses/sft": 1.4192291498184204, "losses/total": 0.6732208132743835, "ref_logps/chosen": -36.8990364074707, "ref_logps/rejected": -48.00566101074219, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.01785309799015522, "rewards/margins": 0.03252986818552017, "rewards/rejected": -0.05038297548890114, "step": 51 }, { "epoch": 0.39, "grad_norm": 28.772069984820575, "learning_rate": 4.831460674157303e-07, "logps/chosen": -46.38223648071289, "logps/rejected": -52.930992126464844, "loss": 0.6797, "losses/dpo": 0.6885812282562256, "losses/sft": 1.7472295761108398, "losses/total": 0.6885812282562256, "ref_logps/chosen": -46.08771514892578, "ref_logps/rejected": -52.34654998779297, "rewards/accuracies": 0.640625, "rewards/chosen": -0.029451746493577957, "rewards/margins": 0.028992656618356705, "rewards/rejected": -0.05844440683722496, "step": 52 }, { "epoch": 0.4, "grad_norm": 49.39759455716963, "learning_rate": 4.817415730337078e-07, "logps/chosen": -46.49875259399414, "logps/rejected": -55.329402923583984, "loss": 0.6764, "losses/dpo": 0.6864849328994751, "losses/sft": 1.6644362211227417, "losses/total": 0.6864849328994751, "ref_logps/chosen": -46.24521255493164, "ref_logps/rejected": -54.71065139770508, "rewards/accuracies": 0.6875, "rewards/chosen": -0.025354178622364998, "rewards/margins": 0.036520570516586304, "rewards/rejected": -0.061874743551015854, "step": 53 }, { "epoch": 0.41, "grad_norm": 39.8971887312985, "learning_rate": 4.803370786516854e-07, "logps/chosen": -47.50967025756836, "logps/rejected": -49.979393005371094, "loss": 0.6807, "losses/dpo": 0.6969994306564331, "losses/sft": 1.813864827156067, "losses/total": 0.6969994306564331, "ref_logps/chosen": -47.27796173095703, "ref_logps/rejected": -49.47468948364258, "rewards/accuracies": 0.671875, "rewards/chosen": -0.023170819506049156, "rewards/margins": 0.02729947119951248, "rewards/rejected": -0.05047028511762619, "step": 54 }, { "epoch": 0.42, "grad_norm": 47.86880334698939, "learning_rate": 4.789325842696629e-07, "logps/chosen": -41.838748931884766, "logps/rejected": -52.32083511352539, "loss": 0.6688, "losses/dpo": 0.6723834276199341, "losses/sft": 1.736802577972412, "losses/total": 0.6723834276199341, "ref_logps/chosen": -41.597434997558594, "ref_logps/rejected": -51.555294036865234, "rewards/accuracies": 0.671875, "rewards/chosen": -0.02413160167634487, "rewards/margins": 0.052422601729631424, "rewards/rejected": -0.07655420899391174, "step": 55 }, { "epoch": 0.42, "grad_norm": 30.31390317956311, "learning_rate": 4.775280898876405e-07, "logps/chosen": -43.37042999267578, "logps/rejected": -51.014381408691406, "loss": 0.6753, "losses/dpo": 0.6652502417564392, "losses/sft": 1.5465013980865479, "losses/total": 0.6652502417564392, "ref_logps/chosen": -43.08806610107422, "ref_logps/rejected": -50.34709167480469, "rewards/accuracies": 0.6875, "rewards/chosen": -0.028236446902155876, "rewards/margins": 0.038493067026138306, "rewards/rejected": -0.06672951579093933, "step": 56 }, { "epoch": 0.43, "grad_norm": 38.398516586557875, "learning_rate": 4.7612359550561797e-07, "logps/chosen": -47.867427825927734, "logps/rejected": -52.33705520629883, "loss": 0.6699, "losses/dpo": 0.680155336856842, "losses/sft": 1.7409940958023071, "losses/total": 0.680155336856842, "ref_logps/chosen": -47.60477066040039, "ref_logps/rejected": -51.57525634765625, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.026265887543559074, "rewards/margins": 0.049913693219423294, "rewards/rejected": -0.07617958635091782, "step": 57 }, { "epoch": 0.44, "grad_norm": 30.579136530590517, "learning_rate": 4.747191011235955e-07, "logps/chosen": -40.424930572509766, "logps/rejected": -49.33341979980469, "loss": 0.671, "losses/dpo": 0.6583426594734192, "losses/sft": 1.5045228004455566, "losses/total": 0.6583426594734192, "ref_logps/chosen": -40.30132293701172, "ref_logps/rejected": -48.72381591796875, "rewards/accuracies": 0.65625, "rewards/chosen": -0.012360622175037861, "rewards/margins": 0.04859950393438339, "rewards/rejected": -0.06096012517809868, "step": 58 }, { "epoch": 0.45, "grad_norm": 28.432386818337964, "learning_rate": 4.7331460674157303e-07, "logps/chosen": -44.42695236206055, "logps/rejected": -51.38760757446289, "loss": 0.6742, "losses/dpo": 0.6734617948532104, "losses/sft": 1.4930431842803955, "losses/total": 0.6734617948532104, "ref_logps/chosen": -44.15483474731445, "ref_logps/rejected": -50.686981201171875, "rewards/accuracies": 0.625, "rewards/chosen": -0.027211949229240417, "rewards/margins": 0.0428510457277298, "rewards/rejected": -0.07006299495697021, "step": 59 }, { "epoch": 0.45, "grad_norm": 60.94536925039147, "learning_rate": 4.7191011235955054e-07, "logps/chosen": -40.73780059814453, "logps/rejected": -48.45012283325195, "loss": 0.6711, "losses/dpo": 0.6793003678321838, "losses/sft": 1.452972173690796, "losses/total": 0.6793003678321838, "ref_logps/chosen": -40.51582336425781, "ref_logps/rejected": -47.74919891357422, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.022197561338543892, "rewards/margins": 0.04789496585726738, "rewards/rejected": -0.07009252905845642, "step": 60 }, { "epoch": 0.46, "grad_norm": 30.098575027566017, "learning_rate": 4.705056179775281e-07, "logps/chosen": -44.314781188964844, "logps/rejected": -53.01373291015625, "loss": 0.667, "losses/dpo": 0.6188409328460693, "losses/sft": 1.5232850313186646, "losses/total": 0.6188409328460693, "ref_logps/chosen": -44.05820846557617, "ref_logps/rejected": -52.162532806396484, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.02565758302807808, "rewards/margins": 0.05946283042430878, "rewards/rejected": -0.08512041717767715, "step": 61 }, { "epoch": 0.47, "grad_norm": 42.29525778771706, "learning_rate": 4.691011235955056e-07, "logps/chosen": -44.62049865722656, "logps/rejected": -52.77370834350586, "loss": 0.663, "losses/dpo": 0.6641362905502319, "losses/sft": 1.681021809577942, "losses/total": 0.6641362905502319, "ref_logps/chosen": -44.312557220458984, "ref_logps/rejected": -51.80582809448242, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.0307940524071455, "rewards/margins": 0.06599360704421997, "rewards/rejected": -0.09678765386343002, "step": 62 }, { "epoch": 0.48, "grad_norm": 38.95200855192441, "learning_rate": 4.6769662921348315e-07, "logps/chosen": -43.67677307128906, "logps/rejected": -54.40755081176758, "loss": 0.6646, "losses/dpo": 0.7088499069213867, "losses/sft": 1.5133750438690186, "losses/total": 0.7088499069213867, "ref_logps/chosen": -43.41708755493164, "ref_logps/rejected": -53.509361267089844, "rewards/accuracies": 0.71875, "rewards/chosen": -0.02596930041909218, "rewards/margins": 0.0638495609164238, "rewards/rejected": -0.08981885015964508, "step": 63 }, { "epoch": 0.48, "grad_norm": 43.49914139074744, "learning_rate": 4.662921348314606e-07, "logps/chosen": -42.34618377685547, "logps/rejected": -52.88496398925781, "loss": 0.6606, "losses/dpo": 0.6652852296829224, "losses/sft": 1.4501088857650757, "losses/total": 0.6652852296829224, "ref_logps/chosen": -42.11153030395508, "ref_logps/rejected": -51.93799591064453, "rewards/accuracies": 0.734375, "rewards/chosen": -0.02346588298678398, "rewards/margins": 0.07123082876205444, "rewards/rejected": -0.09469670802354813, "step": 64 }, { "epoch": 0.49, "grad_norm": 41.40541221321436, "learning_rate": 4.6488764044943816e-07, "logps/chosen": -44.95636749267578, "logps/rejected": -53.083003997802734, "loss": 0.6693, "losses/dpo": 0.6824743747711182, "losses/sft": 1.6903133392333984, "losses/total": 0.6824743747711182, "ref_logps/chosen": -44.57103729248047, "ref_logps/rejected": -52.162689208984375, "rewards/accuracies": 0.640625, "rewards/chosen": -0.03853331878781319, "rewards/margins": 0.053498368710279465, "rewards/rejected": -0.09203169494867325, "step": 65 }, { "epoch": 0.5, "grad_norm": 45.593336133234075, "learning_rate": 4.634831460674157e-07, "logps/chosen": -44.58454132080078, "logps/rejected": -52.93248748779297, "loss": 0.6544, "losses/dpo": 0.6102422475814819, "losses/sft": 1.4271743297576904, "losses/total": 0.6102422475814819, "ref_logps/chosen": -44.33665466308594, "ref_logps/rejected": -51.8321647644043, "rewards/accuracies": 0.671875, "rewards/chosen": -0.02478918805718422, "rewards/margins": 0.08524337410926819, "rewards/rejected": -0.11003255099058151, "step": 66 }, { "epoch": 0.51, "grad_norm": 34.1452175972297, "learning_rate": 4.620786516853932e-07, "logps/chosen": -41.57750701904297, "logps/rejected": -47.80601119995117, "loss": 0.6695, "losses/dpo": 0.6731228828430176, "losses/sft": 1.6916165351867676, "losses/total": 0.6731228828430176, "ref_logps/chosen": -41.26310348510742, "ref_logps/rejected": -46.96986770629883, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03143973648548126, "rewards/margins": 0.052174754440784454, "rewards/rejected": -0.08361449092626572, "step": 67 }, { "epoch": 0.51, "grad_norm": 30.892032365612796, "learning_rate": 4.606741573033708e-07, "logps/chosen": -45.23195266723633, "logps/rejected": -49.975162506103516, "loss": 0.667, "losses/dpo": 0.6530002355575562, "losses/sft": 1.7935758829116821, "losses/total": 0.6530002355575562, "ref_logps/chosen": -44.913612365722656, "ref_logps/rejected": -49.06536102294922, "rewards/accuracies": 0.625, "rewards/chosen": -0.03183369338512421, "rewards/margins": 0.05914711952209473, "rewards/rejected": -0.09098081290721893, "step": 68 }, { "epoch": 0.52, "grad_norm": 29.867366301916732, "learning_rate": 4.592696629213483e-07, "logps/chosen": -45.25506591796875, "logps/rejected": -53.421966552734375, "loss": 0.6631, "losses/dpo": 0.6531996726989746, "losses/sft": 1.8156472444534302, "losses/total": 0.6531996726989746, "ref_logps/chosen": -44.84506607055664, "ref_logps/rejected": -52.32767105102539, "rewards/accuracies": 0.65625, "rewards/chosen": -0.041000161319971085, "rewards/margins": 0.0684298723936081, "rewards/rejected": -0.10943003743886948, "step": 69 }, { "epoch": 0.53, "grad_norm": 32.663525839020494, "learning_rate": 4.5786516853932584e-07, "logps/chosen": -48.02471160888672, "logps/rejected": -54.52970504760742, "loss": 0.664, "losses/dpo": 0.6819270849227905, "losses/sft": 1.675918698310852, "losses/total": 0.6819270849227905, "ref_logps/chosen": -47.463623046875, "ref_logps/rejected": -53.28987503051758, "rewards/accuracies": 0.71875, "rewards/chosen": -0.05610860884189606, "rewards/margins": 0.06787437945604324, "rewards/rejected": -0.1239829957485199, "step": 70 }, { "epoch": 0.54, "grad_norm": 34.381880294779506, "learning_rate": 4.5646067415730334e-07, "logps/chosen": -44.1163330078125, "logps/rejected": -56.365657806396484, "loss": 0.6536, "losses/dpo": 0.643732488155365, "losses/sft": 1.5392124652862549, "losses/total": 0.643732488155365, "ref_logps/chosen": -43.79933166503906, "ref_logps/rejected": -55.17345428466797, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.03169972077012062, "rewards/margins": 0.08752122521400452, "rewards/rejected": -0.11922094225883484, "step": 71 }, { "epoch": 0.54, "grad_norm": 57.46694353711977, "learning_rate": 4.550561797752809e-07, "logps/chosen": -44.212608337402344, "logps/rejected": -46.585025787353516, "loss": 0.6645, "losses/dpo": 0.6590207815170288, "losses/sft": 1.775944709777832, "losses/total": 0.6590207815170288, "ref_logps/chosen": -44.05052947998047, "ref_logps/rejected": -45.757301330566406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.01620814949274063, "rewards/margins": 0.06656460464000702, "rewards/rejected": -0.08277275413274765, "step": 72 }, { "epoch": 0.55, "grad_norm": 39.60971373679397, "learning_rate": 4.536516853932584e-07, "logps/chosen": -46.99342727661133, "logps/rejected": -54.59370803833008, "loss": 0.6551, "losses/dpo": 0.6424433588981628, "losses/sft": 1.875294804573059, "losses/total": 0.6424433588981628, "ref_logps/chosen": -46.555843353271484, "ref_logps/rejected": -53.273963928222656, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.04375863075256348, "rewards/margins": 0.08821592479944229, "rewards/rejected": -0.13197456300258636, "step": 73 }, { "epoch": 0.56, "grad_norm": 32.17492390510902, "learning_rate": 4.522471910112359e-07, "logps/chosen": -47.51396179199219, "logps/rejected": -51.763668060302734, "loss": 0.6614, "losses/dpo": 0.6861087083816528, "losses/sft": 1.6838195323944092, "losses/total": 0.6861087083816528, "ref_logps/chosen": -47.04558563232422, "ref_logps/rejected": -50.54121398925781, "rewards/accuracies": 0.671875, "rewards/chosen": -0.04683758318424225, "rewards/margins": 0.07540743052959442, "rewards/rejected": -0.12224502861499786, "step": 74 }, { "epoch": 0.57, "grad_norm": 31.236104643579832, "learning_rate": 4.5084269662921347e-07, "logps/chosen": -44.16737747192383, "logps/rejected": -51.12425994873047, "loss": 0.6662, "losses/dpo": 0.6428414583206177, "losses/sft": 1.7553492784500122, "losses/total": 0.6428414583206177, "ref_logps/chosen": -43.686458587646484, "ref_logps/rejected": -49.97422409057617, "rewards/accuracies": 0.6875, "rewards/chosen": -0.048091523349285126, "rewards/margins": 0.066912442445755, "rewards/rejected": -0.11500395834445953, "step": 75 }, { "epoch": 0.57, "grad_norm": 29.89168698308929, "learning_rate": 4.4943820224719097e-07, "logps/chosen": -42.67256546020508, "logps/rejected": -50.444793701171875, "loss": 0.6498, "losses/dpo": 0.6251261830329895, "losses/sft": 1.5899189710617065, "losses/total": 0.6251261830329895, "ref_logps/chosen": -42.400543212890625, "ref_logps/rejected": -49.179664611816406, "rewards/accuracies": 0.703125, "rewards/chosen": -0.027202490717172623, "rewards/margins": 0.09931023418903351, "rewards/rejected": -0.12651273608207703, "step": 76 }, { "epoch": 0.58, "grad_norm": 43.52771811475026, "learning_rate": 4.4803370786516853e-07, "logps/chosen": -45.70376205444336, "logps/rejected": -54.491477966308594, "loss": 0.6549, "losses/dpo": 0.6449969410896301, "losses/sft": 1.8385146856307983, "losses/total": 0.6449969410896301, "ref_logps/chosen": -45.164833068847656, "ref_logps/rejected": -53.02450180053711, "rewards/accuracies": 0.59375, "rewards/chosen": -0.05389354005455971, "rewards/margins": 0.09280404448509216, "rewards/rejected": -0.14669758081436157, "step": 77 }, { "epoch": 0.59, "grad_norm": 44.11248074607741, "learning_rate": 4.4662921348314603e-07, "logps/chosen": -46.73276901245117, "logps/rejected": -56.035160064697266, "loss": 0.6516, "losses/dpo": 0.6755825877189636, "losses/sft": 1.562668800354004, "losses/total": 0.6755825877189636, "ref_logps/chosen": -46.348121643066406, "ref_logps/rejected": -54.65354537963867, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.03846465423703194, "rewards/margins": 0.09969727694988251, "rewards/rejected": -0.13816192746162415, "step": 78 }, { "epoch": 0.6, "grad_norm": 31.054545788321466, "learning_rate": 4.452247191011236e-07, "logps/chosen": -45.705867767333984, "logps/rejected": -49.91191482543945, "loss": 0.6618, "losses/dpo": 0.6332757472991943, "losses/sft": 1.6961500644683838, "losses/total": 0.6332757472991943, "ref_logps/chosen": -45.05665588378906, "ref_logps/rejected": -48.505367279052734, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.064921073615551, "rewards/margins": 0.07573364675045013, "rewards/rejected": -0.14065471291542053, "step": 79 }, { "epoch": 0.6, "grad_norm": 34.50070905778543, "learning_rate": 4.438202247191011e-07, "logps/chosen": -40.36265182495117, "logps/rejected": -52.85662841796875, "loss": 0.6463, "losses/dpo": 0.6560146808624268, "losses/sft": 1.6940882205963135, "losses/total": 0.6560146808624268, "ref_logps/chosen": -40.00170135498047, "ref_logps/rejected": -51.41786193847656, "rewards/accuracies": 0.71875, "rewards/chosen": -0.03609542176127434, "rewards/margins": 0.10778236389160156, "rewards/rejected": -0.1438777893781662, "step": 80 }, { "epoch": 0.61, "grad_norm": 94.68799629881404, "learning_rate": 4.4241573033707865e-07, "logps/chosen": -45.90070343017578, "logps/rejected": -52.97509765625, "loss": 0.6653, "losses/dpo": 0.6292118430137634, "losses/sft": 1.6726936101913452, "losses/total": 0.6292118430137634, "ref_logps/chosen": -45.422325134277344, "ref_logps/rejected": -51.77912902832031, "rewards/accuracies": 0.59375, "rewards/chosen": -0.047837354242801666, "rewards/margins": 0.07175900042057037, "rewards/rejected": -0.11959635466337204, "step": 81 }, { "epoch": 0.62, "grad_norm": 41.912927437158785, "learning_rate": 4.410112359550562e-07, "logps/chosen": -49.49458312988281, "logps/rejected": -56.47850036621094, "loss": 0.644, "losses/dpo": 0.67254638671875, "losses/sft": 1.7711910009384155, "losses/total": 0.67254638671875, "ref_logps/chosen": -48.75238800048828, "ref_logps/rejected": -54.56370162963867, "rewards/accuracies": 0.703125, "rewards/chosen": -0.07421913743019104, "rewards/margins": 0.11726155877113342, "rewards/rejected": -0.19148069620132446, "step": 82 }, { "epoch": 0.63, "grad_norm": 30.809075785390206, "learning_rate": 4.3960674157303366e-07, "logps/chosen": -46.50284957885742, "logps/rejected": -54.540199279785156, "loss": 0.6526, "losses/dpo": 0.6762425303459167, "losses/sft": 1.7156425714492798, "losses/total": 0.6762425303459167, "ref_logps/chosen": -45.8099365234375, "ref_logps/rejected": -52.88389205932617, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.06929101794958115, "rewards/margins": 0.09634006023406982, "rewards/rejected": -0.16563107073307037, "step": 83 }, { "epoch": 0.63, "grad_norm": 25.79579133353353, "learning_rate": 4.382022471910112e-07, "logps/chosen": -45.86671829223633, "logps/rejected": -53.8139533996582, "loss": 0.6557, "losses/dpo": 0.6797949075698853, "losses/sft": 1.6344847679138184, "losses/total": 0.6797949075698853, "ref_logps/chosen": -45.232337951660156, "ref_logps/rejected": -52.22455596923828, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.06343795359134674, "rewards/margins": 0.09550120681524277, "rewards/rejected": -0.1589391529560089, "step": 84 }, { "epoch": 0.64, "grad_norm": 30.162700210836963, "learning_rate": 4.367977528089887e-07, "logps/chosen": -41.93394088745117, "logps/rejected": -56.48595428466797, "loss": 0.6162, "losses/dpo": 0.6293738484382629, "losses/sft": 1.5654246807098389, "losses/total": 0.6293738484382629, "ref_logps/chosen": -41.77614212036133, "ref_logps/rejected": -54.580291748046875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.015780135989189148, "rewards/margins": 0.17478588223457336, "rewards/rejected": -0.19056600332260132, "step": 85 }, { "epoch": 0.65, "grad_norm": 34.41680518091824, "learning_rate": 4.353932584269663e-07, "logps/chosen": -44.43855667114258, "logps/rejected": -50.914955139160156, "loss": 0.6547, "losses/dpo": 0.6206750869750977, "losses/sft": 1.5246466398239136, "losses/total": 0.6206750869750977, "ref_logps/chosen": -43.69452667236328, "ref_logps/rejected": -49.168739318847656, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07440284639596939, "rewards/margins": 0.1002187430858612, "rewards/rejected": -0.17462158203125, "step": 86 }, { "epoch": 0.66, "grad_norm": 27.892274644301406, "learning_rate": 4.339887640449438e-07, "logps/chosen": -43.253910064697266, "logps/rejected": -52.30147933959961, "loss": 0.6428, "losses/dpo": 0.6658166646957397, "losses/sft": 1.6765803098678589, "losses/total": 0.6658166646957397, "ref_logps/chosen": -42.67121124267578, "ref_logps/rejected": -50.533077239990234, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.05826949700713158, "rewards/margins": 0.11857100576162338, "rewards/rejected": -0.17684051394462585, "step": 87 }, { "epoch": 0.66, "grad_norm": 55.16292803311009, "learning_rate": 4.3258426966292134e-07, "logps/chosen": -42.007484436035156, "logps/rejected": -50.0811767578125, "loss": 0.6326, "losses/dpo": 0.6056747436523438, "losses/sft": 1.7411773204803467, "losses/total": 0.6056747436523438, "ref_logps/chosen": -41.487525939941406, "ref_logps/rejected": -48.109779357910156, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.05199640989303589, "rewards/margins": 0.14514391124248505, "rewards/rejected": -0.19714030623435974, "step": 88 }, { "epoch": 0.67, "grad_norm": 34.6356034521475, "learning_rate": 4.311797752808989e-07, "logps/chosen": -40.15419006347656, "logps/rejected": -47.538421630859375, "loss": 0.6457, "losses/dpo": 0.680642306804657, "losses/sft": 1.6460747718811035, "losses/total": 0.680642306804657, "ref_logps/chosen": -40.11595153808594, "ref_logps/rejected": -46.310447692871094, "rewards/accuracies": 0.625, "rewards/chosen": -0.0038233580999076366, "rewards/margins": 0.11897383630275726, "rewards/rejected": -0.12279720604419708, "step": 89 }, { "epoch": 0.68, "grad_norm": 28.939544893174496, "learning_rate": 4.297752808988764e-07, "logps/chosen": -44.99380874633789, "logps/rejected": -54.135902404785156, "loss": 0.6341, "losses/dpo": 0.6257071495056152, "losses/sft": 1.6170375347137451, "losses/total": 0.6257071495056152, "ref_logps/chosen": -44.28892517089844, "ref_logps/rejected": -51.92372131347656, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07048848271369934, "rewards/margins": 0.15072941780090332, "rewards/rejected": -0.22121790051460266, "step": 90 }, { "epoch": 0.69, "grad_norm": 30.82562962007261, "learning_rate": 4.2837078651685396e-07, "logps/chosen": -45.05669403076172, "logps/rejected": -55.15739059448242, "loss": 0.6389, "losses/dpo": 0.6449102163314819, "losses/sft": 1.5784144401550293, "losses/total": 0.6449102163314819, "ref_logps/chosen": -44.50265121459961, "ref_logps/rejected": -53.139549255371094, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.05540444701910019, "rewards/margins": 0.14638003706932068, "rewards/rejected": -0.20178449153900146, "step": 91 }, { "epoch": 0.69, "grad_norm": 28.365260891219513, "learning_rate": 4.269662921348314e-07, "logps/chosen": -42.3452033996582, "logps/rejected": -46.581642150878906, "loss": 0.643, "losses/dpo": 0.6168180704116821, "losses/sft": 1.4108332395553589, "losses/total": 0.6168180704116821, "ref_logps/chosen": -41.79684066772461, "ref_logps/rejected": -44.82728576660156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05483608320355415, "rewards/margins": 0.12060024589300156, "rewards/rejected": -0.17543631792068481, "step": 92 }, { "epoch": 0.7, "grad_norm": 43.62597997275563, "learning_rate": 4.2556179775280896e-07, "logps/chosen": -44.032196044921875, "logps/rejected": -54.29043960571289, "loss": 0.6472, "losses/dpo": 0.6641404628753662, "losses/sft": 1.6307967901229858, "losses/total": 0.6641404628753662, "ref_logps/chosen": -43.36088943481445, "ref_logps/rejected": -52.424129486083984, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06713112443685532, "rewards/margins": 0.11949995160102844, "rewards/rejected": -0.18663106858730316, "step": 93 }, { "epoch": 0.71, "grad_norm": 36.79572482997297, "learning_rate": 4.2415730337078647e-07, "logps/chosen": -43.960914611816406, "logps/rejected": -52.42353820800781, "loss": 0.6262, "losses/dpo": 0.6416357755661011, "losses/sft": 1.7492291927337646, "losses/total": 0.6416357755661011, "ref_logps/chosen": -43.40652084350586, "ref_logps/rejected": -50.285953521728516, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.05543944239616394, "rewards/margins": 0.158319354057312, "rewards/rejected": -0.21375878155231476, "step": 94 }, { "epoch": 0.72, "grad_norm": 33.202156694587515, "learning_rate": 4.22752808988764e-07, "logps/chosen": -45.4493408203125, "logps/rejected": -56.69847869873047, "loss": 0.6157, "losses/dpo": 0.6609561443328857, "losses/sft": 1.7604572772979736, "losses/total": 0.6609561443328857, "ref_logps/chosen": -44.836978912353516, "ref_logps/rejected": -54.17456817626953, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.061236314475536346, "rewards/margins": 0.19115474820137024, "rewards/rejected": -0.2523910701274872, "step": 95 }, { "epoch": 0.72, "grad_norm": 46.956478939957535, "learning_rate": 4.2134831460674153e-07, "logps/chosen": -46.54857635498047, "logps/rejected": -50.4640007019043, "loss": 0.6663, "losses/dpo": 0.7251628637313843, "losses/sft": 1.7952070236206055, "losses/total": 0.7251628637313843, "ref_logps/chosen": -45.67692947387695, "ref_logps/rejected": -48.77769470214844, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.08716444671154022, "rewards/margins": 0.08146625012159348, "rewards/rejected": -0.1686306893825531, "step": 96 }, { "epoch": 0.73, "grad_norm": 27.879034879340576, "learning_rate": 4.199438202247191e-07, "logps/chosen": -50.488792419433594, "logps/rejected": -56.18730163574219, "loss": 0.6429, "losses/dpo": 0.6116083264350891, "losses/sft": 1.8164161443710327, "losses/total": 0.6116083264350891, "ref_logps/chosen": -49.370967864990234, "ref_logps/rejected": -53.6693229675293, "rewards/accuracies": 0.671875, "rewards/chosen": -0.1117820143699646, "rewards/margins": 0.14001640677452087, "rewards/rejected": -0.25179845094680786, "step": 97 }, { "epoch": 0.74, "grad_norm": 33.6451235312571, "learning_rate": 4.1853932584269664e-07, "logps/chosen": -45.30095672607422, "logps/rejected": -52.77676773071289, "loss": 0.6276, "losses/dpo": 0.6388231515884399, "losses/sft": 1.5362550020217896, "losses/total": 0.6388231515884399, "ref_logps/chosen": -44.66321563720703, "ref_logps/rejected": -50.480506896972656, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.06377451121807098, "rewards/margins": 0.1658514142036438, "rewards/rejected": -0.2296259105205536, "step": 98 }, { "epoch": 0.75, "grad_norm": 28.084665755795335, "learning_rate": 4.1713483146067415e-07, "logps/chosen": -44.78543472290039, "logps/rejected": -56.6446533203125, "loss": 0.6214, "losses/dpo": 0.6161714792251587, "losses/sft": 1.6041640043258667, "losses/total": 0.6161714792251587, "ref_logps/chosen": -44.02279281616211, "ref_logps/rejected": -54.028587341308594, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.07626420259475708, "rewards/margins": 0.18534201383590698, "rewards/rejected": -0.26160621643066406, "step": 99 }, { "epoch": 0.76, "grad_norm": 27.824171173031214, "learning_rate": 4.157303370786517e-07, "logps/chosen": -40.95489501953125, "logps/rejected": -46.958560943603516, "loss": 0.6418, "losses/dpo": 0.7102890610694885, "losses/sft": 1.5851924419403076, "losses/total": 0.7102890610694885, "ref_logps/chosen": -40.45050048828125, "ref_logps/rejected": -45.10783004760742, "rewards/accuracies": 0.703125, "rewards/chosen": -0.050439272075891495, "rewards/margins": 0.1346338987350464, "rewards/rejected": -0.18507316708564758, "step": 100 }, { "epoch": 0.76, "grad_norm": 22.348657654837517, "learning_rate": 4.1432584269662915e-07, "logps/chosen": -44.739830017089844, "logps/rejected": -51.873111724853516, "loss": 0.6331, "losses/dpo": 0.6109752058982849, "losses/sft": 1.5853087902069092, "losses/total": 0.6109752058982849, "ref_logps/chosen": -43.880859375, "ref_logps/rejected": -49.45735549926758, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.08589743077754974, "rewards/margins": 0.15567803382873535, "rewards/rejected": -0.2415754646062851, "step": 101 }, { "epoch": 0.77, "grad_norm": 24.803923367751008, "learning_rate": 4.129213483146067e-07, "logps/chosen": -51.41362380981445, "logps/rejected": -62.211700439453125, "loss": 0.6025, "losses/dpo": 0.5558424592018127, "losses/sft": 1.6526005268096924, "losses/total": 0.5558424592018127, "ref_logps/chosen": -50.46706771850586, "ref_logps/rejected": -59.001731872558594, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.09465555101633072, "rewards/margins": 0.2263410985469818, "rewards/rejected": -0.32099664211273193, "step": 102 }, { "epoch": 0.78, "grad_norm": 44.04074369963754, "learning_rate": 4.115168539325842e-07, "logps/chosen": -44.422515869140625, "logps/rejected": -54.969276428222656, "loss": 0.6243, "losses/dpo": 0.724815845489502, "losses/sft": 1.8968292474746704, "losses/total": 0.724815845489502, "ref_logps/chosen": -43.54924774169922, "ref_logps/rejected": -52.170406341552734, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.08732666820287704, "rewards/margins": 0.19255991280078888, "rewards/rejected": -0.2798865735530853, "step": 103 }, { "epoch": 0.79, "grad_norm": 28.752734258876266, "learning_rate": 4.1011235955056177e-07, "logps/chosen": -44.18345260620117, "logps/rejected": -49.56830596923828, "loss": 0.6416, "losses/dpo": 0.6475918889045715, "losses/sft": 1.6624279022216797, "losses/total": 0.6475918889045715, "ref_logps/chosen": -43.107295989990234, "ref_logps/rejected": -47.10633850097656, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.10761551558971405, "rewards/margins": 0.1385812610387802, "rewards/rejected": -0.24619677662849426, "step": 104 }, { "epoch": 0.79, "grad_norm": 33.831465154352095, "learning_rate": 4.0870786516853933e-07, "logps/chosen": -43.108551025390625, "logps/rejected": -58.27730941772461, "loss": 0.6136, "losses/dpo": 0.5613527297973633, "losses/sft": 1.4752910137176514, "losses/total": 0.5613527297973633, "ref_logps/chosen": -42.15351104736328, "ref_logps/rejected": -55.20923614501953, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.09550447016954422, "rewards/margins": 0.21130229532718658, "rewards/rejected": -0.3068067729473114, "step": 105 }, { "epoch": 0.8, "grad_norm": 35.519256908131155, "learning_rate": 4.0730337078651683e-07, "logps/chosen": -43.02156066894531, "logps/rejected": -55.324951171875, "loss": 0.6201, "losses/dpo": 0.6104274392127991, "losses/sft": 1.5403259992599487, "losses/total": 0.6104274392127991, "ref_logps/chosen": -41.601776123046875, "ref_logps/rejected": -52.04058837890625, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.1419784277677536, "rewards/margins": 0.18645837903022766, "rewards/rejected": -0.32843682169914246, "step": 106 }, { "epoch": 0.81, "grad_norm": 53.3277918124601, "learning_rate": 4.058988764044944e-07, "logps/chosen": -47.68224334716797, "logps/rejected": -53.78023147583008, "loss": 0.6289, "losses/dpo": 0.6174871325492859, "losses/sft": 1.8614567518234253, "losses/total": 0.6174871325492859, "ref_logps/chosen": -46.320743560791016, "ref_logps/rejected": -50.71996307373047, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13615000247955322, "rewards/margins": 0.1698770523071289, "rewards/rejected": -0.30602702498435974, "step": 107 }, { "epoch": 0.82, "grad_norm": 31.759411700974844, "learning_rate": 4.044943820224719e-07, "logps/chosen": -42.972476959228516, "logps/rejected": -51.017364501953125, "loss": 0.6242, "losses/dpo": 0.6567478775978088, "losses/sft": 1.450127363204956, "losses/total": 0.6567478775978088, "ref_logps/chosen": -41.97654342651367, "ref_logps/rejected": -48.165870666503906, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.0995931327342987, "rewards/margins": 0.18555624783039093, "rewards/rejected": -0.28514939546585083, "step": 108 }, { "epoch": 0.82, "grad_norm": 28.26714751384284, "learning_rate": 4.0308988764044945e-07, "logps/chosen": -46.48244094848633, "logps/rejected": -52.38172149658203, "loss": 0.6315, "losses/dpo": 0.644914984703064, "losses/sft": 1.9009453058242798, "losses/total": 0.644914984703064, "ref_logps/chosen": -45.111820220947266, "ref_logps/rejected": -49.31619644165039, "rewards/accuracies": 0.640625, "rewards/chosen": -0.13706238567829132, "rewards/margins": 0.16949015855789185, "rewards/rejected": -0.306552529335022, "step": 109 }, { "epoch": 0.83, "grad_norm": 33.75202583852142, "learning_rate": 4.0168539325842696e-07, "logps/chosen": -41.2758674621582, "logps/rejected": -47.755577087402344, "loss": 0.6217, "losses/dpo": 0.5321424007415771, "losses/sft": 1.5920504331588745, "losses/total": 0.5321424007415771, "ref_logps/chosen": -40.613494873046875, "ref_logps/rejected": -45.30800247192383, "rewards/accuracies": 0.703125, "rewards/chosen": -0.06623756885528564, "rewards/margins": 0.17852002382278442, "rewards/rejected": -0.24475759267807007, "step": 110 }, { "epoch": 0.84, "grad_norm": 32.14832326330206, "learning_rate": 4.0028089887640446e-07, "logps/chosen": -47.61855697631836, "logps/rejected": -55.30910110473633, "loss": 0.6332, "losses/dpo": 0.583430826663971, "losses/sft": 1.489072561264038, "losses/total": 0.583430826663971, "ref_logps/chosen": -46.05831527709961, "ref_logps/rejected": -52.14054489135742, "rewards/accuracies": 0.640625, "rewards/chosen": -0.15602374076843262, "rewards/margins": 0.16083230078220367, "rewards/rejected": -0.3168560266494751, "step": 111 }, { "epoch": 0.85, "grad_norm": 31.57358548790049, "learning_rate": 3.9887640449438196e-07, "logps/chosen": -45.96358871459961, "logps/rejected": -55.025814056396484, "loss": 0.6109, "losses/dpo": 0.6196195483207703, "losses/sft": 1.564924716949463, "losses/total": 0.6196195483207703, "ref_logps/chosen": -44.80457305908203, "ref_logps/rejected": -51.71221923828125, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.11590145528316498, "rewards/margins": 0.2154581993818283, "rewards/rejected": -0.3313596248626709, "step": 112 }, { "epoch": 0.85, "grad_norm": 38.54568144555463, "learning_rate": 3.974719101123595e-07, "logps/chosen": -47.64299392700195, "logps/rejected": -53.921478271484375, "loss": 0.6209, "losses/dpo": 0.5124274492263794, "losses/sft": 1.7563964128494263, "losses/total": 0.5124274492263794, "ref_logps/chosen": -46.44425582885742, "ref_logps/rejected": -50.830631256103516, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11987365782260895, "rewards/margins": 0.18921077251434326, "rewards/rejected": -0.309084415435791, "step": 113 }, { "epoch": 0.86, "grad_norm": 44.38467729184257, "learning_rate": 3.960674157303371e-07, "logps/chosen": -47.03258514404297, "logps/rejected": -58.012779235839844, "loss": 0.6173, "losses/dpo": 0.5673906803131104, "losses/sft": 1.667620301246643, "losses/total": 0.5673906803131104, "ref_logps/chosen": -45.46345138549805, "ref_logps/rejected": -54.370609283447266, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.156913161277771, "rewards/margins": 0.207303985953331, "rewards/rejected": -0.3642171621322632, "step": 114 }, { "epoch": 0.87, "grad_norm": 33.99376872525972, "learning_rate": 3.946629213483146e-07, "logps/chosen": -46.81057357788086, "logps/rejected": -56.618961334228516, "loss": 0.6259, "losses/dpo": 0.6754878163337708, "losses/sft": 1.7688114643096924, "losses/total": 0.6754878163337708, "ref_logps/chosen": -45.29435348510742, "ref_logps/rejected": -53.197017669677734, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.15162216126918793, "rewards/margins": 0.1905716210603714, "rewards/rejected": -0.3421937823295593, "step": 115 }, { "epoch": 0.88, "grad_norm": 45.99855268647248, "learning_rate": 3.9325842696629214e-07, "logps/chosen": -45.30148696899414, "logps/rejected": -54.94722366333008, "loss": 0.6164, "losses/dpo": 0.6377050876617432, "losses/sft": 1.524662733078003, "losses/total": 0.6377050876617432, "ref_logps/chosen": -43.51927185058594, "ref_logps/rejected": -50.9721565246582, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17822162806987762, "rewards/margins": 0.21928510069847107, "rewards/rejected": -0.3975067138671875, "step": 116 }, { "epoch": 0.88, "grad_norm": 41.17338423581018, "learning_rate": 3.9185393258426964e-07, "logps/chosen": -45.32805633544922, "logps/rejected": -51.43210220336914, "loss": 0.6437, "losses/dpo": 0.7014100551605225, "losses/sft": 1.579314112663269, "losses/total": 0.7014100551605225, "ref_logps/chosen": -43.525184631347656, "ref_logps/rejected": -48.19099426269531, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1802871823310852, "rewards/margins": 0.14382438361644745, "rewards/rejected": -0.32411155104637146, "step": 117 }, { "epoch": 0.89, "grad_norm": 27.8265676276597, "learning_rate": 3.904494382022472e-07, "logps/chosen": -43.54722213745117, "logps/rejected": -54.876502990722656, "loss": 0.6222, "losses/dpo": 0.5896421670913696, "losses/sft": 1.6924169063568115, "losses/total": 0.5896421670913696, "ref_logps/chosen": -41.962684631347656, "ref_logps/rejected": -51.233131408691406, "rewards/accuracies": 0.671875, "rewards/chosen": -0.15845410525798798, "rewards/margins": 0.20588305592536926, "rewards/rejected": -0.36433714628219604, "step": 118 }, { "epoch": 0.9, "grad_norm": 47.283331920582164, "learning_rate": 3.890449438202247e-07, "logps/chosen": -43.16476058959961, "logps/rejected": -51.12811279296875, "loss": 0.6058, "losses/dpo": 0.56672203540802, "losses/sft": 1.653482437133789, "losses/total": 0.56672203540802, "ref_logps/chosen": -41.7398567199707, "ref_logps/rejected": -47.340763092041016, "rewards/accuracies": 0.734375, "rewards/chosen": -0.14249008893966675, "rewards/margins": 0.2362438291311264, "rewards/rejected": -0.37873390316963196, "step": 119 }, { "epoch": 0.91, "grad_norm": 42.23676105863976, "learning_rate": 3.876404494382022e-07, "logps/chosen": -46.959041595458984, "logps/rejected": -56.4009895324707, "loss": 0.6, "losses/dpo": 0.6254112720489502, "losses/sft": 1.895226240158081, "losses/total": 0.6254112720489502, "ref_logps/chosen": -45.28234100341797, "ref_logps/rejected": -52.07709503173828, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.16767022013664246, "rewards/margins": 0.26471948623657227, "rewards/rejected": -0.4323897063732147, "step": 120 }, { "epoch": 0.91, "grad_norm": 31.040496703925, "learning_rate": 3.8623595505617977e-07, "logps/chosen": -45.887542724609375, "logps/rejected": -56.216007232666016, "loss": 0.6079, "losses/dpo": 0.5990518927574158, "losses/sft": 1.8975269794464111, "losses/total": 0.5990518927574158, "ref_logps/chosen": -44.2087287902832, "ref_logps/rejected": -52.141265869140625, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1678813397884369, "rewards/margins": 0.23959317803382874, "rewards/rejected": -0.40747445821762085, "step": 121 }, { "epoch": 0.92, "grad_norm": 34.03309764270509, "learning_rate": 3.8483146067415727e-07, "logps/chosen": -46.077545166015625, "logps/rejected": -56.32755661010742, "loss": 0.6238, "losses/dpo": 0.6925325393676758, "losses/sft": 1.9379180669784546, "losses/total": 0.6925325393676758, "ref_logps/chosen": -44.14793395996094, "ref_logps/rejected": -52.27103042602539, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.1929616928100586, "rewards/margins": 0.21269084513187408, "rewards/rejected": -0.40565258264541626, "step": 122 }, { "epoch": 0.93, "grad_norm": 45.012980715946, "learning_rate": 3.834269662921348e-07, "logps/chosen": -47.34892272949219, "logps/rejected": -57.689414978027344, "loss": 0.6057, "losses/dpo": 0.5639334917068481, "losses/sft": 1.930764079093933, "losses/total": 0.5639334917068481, "ref_logps/chosen": -45.15814971923828, "ref_logps/rejected": -53.0362548828125, "rewards/accuracies": 0.640625, "rewards/chosen": -0.2190774381160736, "rewards/margins": 0.24623852968215942, "rewards/rejected": -0.46531593799591064, "step": 123 }, { "epoch": 0.94, "grad_norm": 39.42019915706305, "learning_rate": 3.8202247191011233e-07, "logps/chosen": -47.43856430053711, "logps/rejected": -56.883419036865234, "loss": 0.6086, "losses/dpo": 0.6014137268066406, "losses/sft": 1.7882441282272339, "losses/total": 0.6014137268066406, "ref_logps/chosen": -45.34437942504883, "ref_logps/rejected": -52.417259216308594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2094189077615738, "rewards/margins": 0.23719672858715057, "rewards/rejected": -0.446615606546402, "step": 124 }, { "epoch": 0.94, "grad_norm": 37.961322378000254, "learning_rate": 3.806179775280899e-07, "logps/chosen": -55.014442443847656, "logps/rejected": -55.28057098388672, "loss": 0.6781, "losses/dpo": 0.8218836784362793, "losses/sft": 1.8995018005371094, "losses/total": 0.8218836784362793, "ref_logps/chosen": -52.459068298339844, "ref_logps/rejected": -51.623104095458984, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.25553739070892334, "rewards/margins": 0.11020897328853607, "rewards/rejected": -0.3657463788986206, "step": 125 }, { "epoch": 0.95, "grad_norm": 25.50244194867636, "learning_rate": 3.792134831460674e-07, "logps/chosen": -48.14237976074219, "logps/rejected": -59.1024055480957, "loss": 0.6076, "losses/dpo": 0.5760467648506165, "losses/sft": 1.813592791557312, "losses/total": 0.5760467648506165, "ref_logps/chosen": -46.02626037597656, "ref_logps/rejected": -54.49489974975586, "rewards/accuracies": 0.734375, "rewards/chosen": -0.21161216497421265, "rewards/margins": 0.2491389513015747, "rewards/rejected": -0.46075111627578735, "step": 126 }, { "epoch": 0.96, "grad_norm": 35.18309703758917, "learning_rate": 3.7780898876404495e-07, "logps/chosen": -49.95979690551758, "logps/rejected": -55.34828567504883, "loss": 0.6359, "losses/dpo": 0.6715194582939148, "losses/sft": 1.588201880455017, "losses/total": 0.6715194582939148, "ref_logps/chosen": -47.51445007324219, "ref_logps/rejected": -50.85495376586914, "rewards/accuracies": 0.640625, "rewards/chosen": -0.24453480541706085, "rewards/margins": 0.20479881763458252, "rewards/rejected": -0.44933363795280457, "step": 127 }, { "epoch": 0.97, "grad_norm": 37.001708979703196, "learning_rate": 3.7640449438202245e-07, "logps/chosen": -49.846168518066406, "logps/rejected": -57.822391510009766, "loss": 0.6309, "losses/dpo": 0.5826254487037659, "losses/sft": 1.7025587558746338, "losses/total": 0.5826254487037659, "ref_logps/chosen": -47.4237060546875, "ref_logps/rejected": -53.42058181762695, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.24224618077278137, "rewards/margins": 0.19793462753295898, "rewards/rejected": -0.4401807487010956, "step": 128 }, { "epoch": 0.97, "grad_norm": 36.44841731243101, "learning_rate": 3.75e-07, "logps/chosen": -45.78166580200195, "logps/rejected": -56.52864456176758, "loss": 0.6329, "losses/dpo": 0.8350533246994019, "losses/sft": 1.8745198249816895, "losses/total": 0.8350533246994019, "ref_logps/chosen": -43.792232513427734, "ref_logps/rejected": -52.539459228515625, "rewards/accuracies": 0.625, "rewards/chosen": -0.19894394278526306, "rewards/margins": 0.1999754160642624, "rewards/rejected": -0.39891934394836426, "step": 129 }, { "epoch": 0.98, "grad_norm": 50.60942892089312, "learning_rate": 3.735955056179775e-07, "logps/chosen": -48.31587600708008, "logps/rejected": -60.06913757324219, "loss": 0.6035, "losses/dpo": 0.5076871514320374, "losses/sft": 1.7596516609191895, "losses/total": 0.5076871514320374, "ref_logps/chosen": -45.997005462646484, "ref_logps/rejected": -54.95665740966797, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.23188745975494385, "rewards/margins": 0.27936017513275146, "rewards/rejected": -0.5112476348876953, "step": 130 }, { "epoch": 0.99, "grad_norm": 29.87502692929142, "learning_rate": 3.72191011235955e-07, "logps/chosen": -42.0343017578125, "logps/rejected": -56.264400482177734, "loss": 0.5636, "losses/dpo": 0.490623414516449, "losses/sft": 1.8188135623931885, "losses/total": 0.490623414516449, "ref_logps/chosen": -40.442779541015625, "ref_logps/rejected": -51.14723587036133, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.15915215015411377, "rewards/margins": 0.35256466269493103, "rewards/rejected": -0.5117168426513672, "step": 131 }, { "epoch": 1.0, "grad_norm": 26.26963874604301, "learning_rate": 3.707865168539326e-07, "logps/chosen": -44.3410530090332, "logps/rejected": -53.9102897644043, "loss": 0.6049, "losses/dpo": 0.5321304798126221, "losses/sft": 1.6258312463760376, "losses/total": 0.5321304798126221, "ref_logps/chosen": -42.459415435791016, "ref_logps/rejected": -49.608009338378906, "rewards/accuracies": 0.71875, "rewards/chosen": -0.18816326558589935, "rewards/margins": 0.24206531047821045, "rewards/rejected": -0.430228590965271, "step": 132 }, { "epoch": 1.0, "grad_norm": 31.07748233710503, "learning_rate": 3.693820224719101e-07, "logps/chosen": -48.638404846191406, "logps/rejected": -56.798377990722656, "loss": 0.604, "losses/dpo": 0.6897447109222412, "losses/sft": 2.0183496475219727, "losses/total": 0.6897447109222412, "ref_logps/chosen": -46.250240325927734, "ref_logps/rejected": -51.69140625, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.2388162910938263, "rewards/margins": 0.27188044786453247, "rewards/rejected": -0.5106967687606812, "step": 133 }, { "epoch": 1.01, "grad_norm": 32.621018283577015, "learning_rate": 3.6797752808988764e-07, "logps/chosen": -41.6754150390625, "logps/rejected": -53.74419021606445, "loss": 0.5423, "losses/dpo": 0.5601648688316345, "losses/sft": 1.5361900329589844, "losses/total": 0.5601648688316345, "ref_logps/chosen": -40.145145416259766, "ref_logps/rejected": -48.063804626464844, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.15302664041519165, "rewards/margins": 0.41501128673553467, "rewards/rejected": -0.5680379271507263, "step": 134 }, { "epoch": 1.02, "grad_norm": 41.923888909060814, "learning_rate": 3.6657303370786514e-07, "logps/chosen": -43.79188537597656, "logps/rejected": -52.1921272277832, "loss": 0.6089, "losses/dpo": 0.5927071571350098, "losses/sft": 1.3719282150268555, "losses/total": 0.5927071571350098, "ref_logps/chosen": -41.53467559814453, "ref_logps/rejected": -47.349117279052734, "rewards/accuracies": 0.734375, "rewards/chosen": -0.22572064399719238, "rewards/margins": 0.2585802674293518, "rewards/rejected": -0.4843009114265442, "step": 135 }, { "epoch": 1.03, "grad_norm": 43.628967234700895, "learning_rate": 3.651685393258427e-07, "logps/chosen": -45.88031005859375, "logps/rejected": -54.597625732421875, "loss": 0.5977, "losses/dpo": 0.6261985301971436, "losses/sft": 1.6914416551589966, "losses/total": 0.6261985301971436, "ref_logps/chosen": -43.884586334228516, "ref_logps/rejected": -49.83208465576172, "rewards/accuracies": 0.71875, "rewards/chosen": -0.19957265257835388, "rewards/margins": 0.2769812345504761, "rewards/rejected": -0.47655388712882996, "step": 136 }, { "epoch": 1.03, "grad_norm": 32.214505202780856, "learning_rate": 3.637640449438202e-07, "logps/chosen": -49.95244598388672, "logps/rejected": -59.78261184692383, "loss": 0.5843, "losses/dpo": 0.7390427589416504, "losses/sft": 1.696536660194397, "losses/total": 0.7390427589416504, "ref_logps/chosen": -47.5785026550293, "ref_logps/rejected": -54.22393035888672, "rewards/accuracies": 0.734375, "rewards/chosen": -0.23739397525787354, "rewards/margins": 0.31847497820854187, "rewards/rejected": -0.555868923664093, "step": 137 }, { "epoch": 1.04, "grad_norm": 35.14711060760973, "learning_rate": 3.6235955056179776e-07, "logps/chosen": -43.57805633544922, "logps/rejected": -51.13362503051758, "loss": 0.5695, "losses/dpo": 0.49799275398254395, "losses/sft": 1.743740439414978, "losses/total": 0.49799275398254395, "ref_logps/chosen": -41.74354934692383, "ref_logps/rejected": -46.016632080078125, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.18345096707344055, "rewards/margins": 0.3282485902309418, "rewards/rejected": -0.5116995573043823, "step": 138 }, { "epoch": 1.05, "grad_norm": 29.04827871613601, "learning_rate": 3.6095505617977526e-07, "logps/chosen": -49.01455307006836, "logps/rejected": -55.07649612426758, "loss": 0.586, "losses/dpo": 0.5866612195968628, "losses/sft": 1.6541554927825928, "losses/total": 0.5866612195968628, "ref_logps/chosen": -46.84767150878906, "ref_logps/rejected": -49.655704498291016, "rewards/accuracies": 0.703125, "rewards/chosen": -0.21668824553489685, "rewards/margins": 0.3253910541534424, "rewards/rejected": -0.5420793294906616, "step": 139 }, { "epoch": 1.06, "grad_norm": 38.80271868803454, "learning_rate": 3.5955056179775277e-07, "logps/chosen": -43.13740158081055, "logps/rejected": -55.07585906982422, "loss": 0.562, "losses/dpo": 0.6106088757514954, "losses/sft": 1.6797422170639038, "losses/total": 0.6106088757514954, "ref_logps/chosen": -41.61346435546875, "ref_logps/rejected": -50.03567123413086, "rewards/accuracies": 0.765625, "rewards/chosen": -0.1523941457271576, "rewards/margins": 0.351624995470047, "rewards/rejected": -0.5040191411972046, "step": 140 }, { "epoch": 1.06, "grad_norm": 31.513747529747132, "learning_rate": 3.581460674157303e-07, "logps/chosen": -46.31972885131836, "logps/rejected": -57.69581604003906, "loss": 0.5806, "losses/dpo": 0.6090304255485535, "losses/sft": 1.7538655996322632, "losses/total": 0.6090304255485535, "ref_logps/chosen": -44.57556915283203, "ref_logps/rejected": -52.764137268066406, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.17441636323928833, "rewards/margins": 0.3187510073184967, "rewards/rejected": -0.4931674003601074, "step": 141 }, { "epoch": 1.07, "grad_norm": 51.065834083982736, "learning_rate": 3.5674157303370783e-07, "logps/chosen": -48.51567077636719, "logps/rejected": -54.418060302734375, "loss": 0.6001, "losses/dpo": 0.6386189460754395, "losses/sft": 1.878273606300354, "losses/total": 0.6386189460754395, "ref_logps/chosen": -46.21207046508789, "ref_logps/rejected": -49.42289733886719, "rewards/accuracies": 0.671875, "rewards/chosen": -0.23036019504070282, "rewards/margins": 0.26915615797042847, "rewards/rejected": -0.4995163381099701, "step": 142 }, { "epoch": 1.08, "grad_norm": 29.093406410662645, "learning_rate": 3.553370786516854e-07, "logps/chosen": -49.36024856567383, "logps/rejected": -56.00803756713867, "loss": 0.5707, "losses/dpo": 0.4713801145553589, "losses/sft": 1.6630077362060547, "losses/total": 0.4713801145553589, "ref_logps/chosen": -47.340797424316406, "ref_logps/rejected": -50.417945861816406, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.20194481313228607, "rewards/margins": 0.3570638597011566, "rewards/rejected": -0.5590086579322815, "step": 143 }, { "epoch": 1.09, "grad_norm": 29.504151541319818, "learning_rate": 3.539325842696629e-07, "logps/chosen": -42.901329040527344, "logps/rejected": -56.28777313232422, "loss": 0.5543, "losses/dpo": 0.4904211461544037, "losses/sft": 1.8116967678070068, "losses/total": 0.4904211461544037, "ref_logps/chosen": -41.418701171875, "ref_logps/rejected": -50.88157653808594, "rewards/accuracies": 0.796875, "rewards/chosen": -0.14826291799545288, "rewards/margins": 0.3923571705818176, "rewards/rejected": -0.5406200885772705, "step": 144 }, { "epoch": 1.09, "grad_norm": 32.99428211070434, "learning_rate": 3.5252808988764045e-07, "logps/chosen": -45.01850891113281, "logps/rejected": -53.19471740722656, "loss": 0.5554, "losses/dpo": 0.631604790687561, "losses/sft": 1.6379601955413818, "losses/total": 0.631604790687561, "ref_logps/chosen": -43.15743637084961, "ref_logps/rejected": -47.4387321472168, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1861070990562439, "rewards/margins": 0.3894909918308258, "rewards/rejected": -0.5755980014801025, "step": 145 }, { "epoch": 1.1, "grad_norm": 40.98881999290143, "learning_rate": 3.51123595505618e-07, "logps/chosen": -47.00930404663086, "logps/rejected": -57.899696350097656, "loss": 0.5831, "losses/dpo": 0.5215938091278076, "losses/sft": 1.6712085008621216, "losses/total": 0.5215938091278076, "ref_logps/chosen": -44.51298522949219, "ref_logps/rejected": -51.731143951416016, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.2496318519115448, "rewards/margins": 0.36722302436828613, "rewards/rejected": -0.6168549656867981, "step": 146 }, { "epoch": 1.11, "grad_norm": 37.43481631966579, "learning_rate": 3.497191011235955e-07, "logps/chosen": -45.01251220703125, "logps/rejected": -57.83795166015625, "loss": 0.5693, "losses/dpo": 0.46049636602401733, "losses/sft": 1.6903917789459229, "losses/total": 0.46049636602401733, "ref_logps/chosen": -42.77463150024414, "ref_logps/rejected": -51.813594818115234, "rewards/accuracies": 0.71875, "rewards/chosen": -0.22378797829151154, "rewards/margins": 0.3786487281322479, "rewards/rejected": -0.6024366617202759, "step": 147 }, { "epoch": 1.12, "grad_norm": 50.27433964941223, "learning_rate": 3.48314606741573e-07, "logps/chosen": -49.43627166748047, "logps/rejected": -59.34334945678711, "loss": 0.5773, "losses/dpo": 0.5802911520004272, "losses/sft": 1.6989613771438599, "losses/total": 0.5802911520004272, "ref_logps/chosen": -47.04135513305664, "ref_logps/rejected": -53.43082046508789, "rewards/accuracies": 0.734375, "rewards/chosen": -0.23949074745178223, "rewards/margins": 0.35176190733909607, "rewards/rejected": -0.5912526845932007, "step": 148 }, { "epoch": 1.13, "grad_norm": 72.86074246642919, "learning_rate": 3.469101123595505e-07, "logps/chosen": -46.316532135009766, "logps/rejected": -52.775543212890625, "loss": 0.5892, "losses/dpo": 0.5617306232452393, "losses/sft": 1.6325223445892334, "losses/total": 0.5617306232452393, "ref_logps/chosen": -43.783203125, "ref_logps/rejected": -47.19033432006836, "rewards/accuracies": 0.703125, "rewards/chosen": -0.25333237648010254, "rewards/margins": 0.30518826842308044, "rewards/rejected": -0.5585206151008606, "step": 149 }, { "epoch": 1.13, "grad_norm": 36.655081217167435, "learning_rate": 3.4550561797752807e-07, "logps/chosen": -48.52875518798828, "logps/rejected": -64.00365447998047, "loss": 0.5271, "losses/dpo": 0.5213245153427124, "losses/sft": 1.6000676155090332, "losses/total": 0.5213245153427124, "ref_logps/chosen": -46.18851852416992, "ref_logps/rejected": -56.694435119628906, "rewards/accuracies": 0.78125, "rewards/chosen": -0.23402364552021027, "rewards/margins": 0.496898889541626, "rewards/rejected": -0.730922520160675, "step": 150 }, { "epoch": 1.14, "grad_norm": 23.49481418849143, "learning_rate": 3.441011235955056e-07, "logps/chosen": -47.38222122192383, "logps/rejected": -54.75856399536133, "loss": 0.5611, "losses/dpo": 0.4975886642932892, "losses/sft": 1.75065279006958, "losses/total": 0.4975886642932892, "ref_logps/chosen": -45.20569610595703, "ref_logps/rejected": -48.676143646240234, "rewards/accuracies": 0.765625, "rewards/chosen": -0.2176520675420761, "rewards/margins": 0.39058929681777954, "rewards/rejected": -0.6082413792610168, "step": 151 }, { "epoch": 1.15, "grad_norm": 62.95285274494457, "learning_rate": 3.4269662921348313e-07, "logps/chosen": -48.7578010559082, "logps/rejected": -54.0283317565918, "loss": 0.6122, "losses/dpo": 0.6478061079978943, "losses/sft": 1.9034008979797363, "losses/total": 0.6478061079978943, "ref_logps/chosen": -45.902217864990234, "ref_logps/rejected": -48.68632125854492, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.28555798530578613, "rewards/margins": 0.2486429512500763, "rewards/rejected": -0.5342009663581848, "step": 152 }, { "epoch": 1.16, "grad_norm": 30.616489322415532, "learning_rate": 3.4129213483146064e-07, "logps/chosen": -47.35772705078125, "logps/rejected": -62.0230712890625, "loss": 0.5364, "losses/dpo": 0.41913074254989624, "losses/sft": 1.5708346366882324, "losses/total": 0.41913074254989624, "ref_logps/chosen": -45.15509033203125, "ref_logps/rejected": -55.225074768066406, "rewards/accuracies": 0.765625, "rewards/chosen": -0.22026404738426208, "rewards/margins": 0.45953649282455444, "rewards/rejected": -0.6798006296157837, "step": 153 }, { "epoch": 1.16, "grad_norm": 61.471302399657155, "learning_rate": 3.398876404494382e-07, "logps/chosen": -47.444671630859375, "logps/rejected": -56.34770202636719, "loss": 0.59, "losses/dpo": 0.6416410803794861, "losses/sft": 2.0058481693267822, "losses/total": 0.6416410803794861, "ref_logps/chosen": -44.839866638183594, "ref_logps/rejected": -50.426788330078125, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2604803740978241, "rewards/margins": 0.33161091804504395, "rewards/rejected": -0.5920912623405457, "step": 154 }, { "epoch": 1.17, "grad_norm": 35.39641258928965, "learning_rate": 3.3848314606741575e-07, "logps/chosen": -50.47380828857422, "logps/rejected": -57.68503952026367, "loss": 0.5792, "losses/dpo": 0.5305228233337402, "losses/sft": 1.6867642402648926, "losses/total": 0.5305228233337402, "ref_logps/chosen": -47.216766357421875, "ref_logps/rejected": -51.042598724365234, "rewards/accuracies": 0.71875, "rewards/chosen": -0.32570481300354004, "rewards/margins": 0.3385399281978607, "rewards/rejected": -0.6642447113990784, "step": 155 }, { "epoch": 1.18, "grad_norm": 35.30752395690993, "learning_rate": 3.3707865168539325e-07, "logps/chosen": -45.446800231933594, "logps/rejected": -58.381805419921875, "loss": 0.5822, "losses/dpo": 0.533534049987793, "losses/sft": 1.663805365562439, "losses/total": 0.533534049987793, "ref_logps/chosen": -42.80461883544922, "ref_logps/rejected": -52.20515060424805, "rewards/accuracies": 0.703125, "rewards/chosen": -0.26421821117401123, "rewards/margins": 0.3534476161003113, "rewards/rejected": -0.6176658272743225, "step": 156 }, { "epoch": 1.19, "grad_norm": 46.17079386920438, "learning_rate": 3.356741573033708e-07, "logps/chosen": -41.31314468383789, "logps/rejected": -51.05323028564453, "loss": 0.579, "losses/dpo": 0.5948421359062195, "losses/sft": 1.7516489028930664, "losses/total": 0.5948421359062195, "ref_logps/chosen": -38.649818420410156, "ref_logps/rejected": -44.947181701660156, "rewards/accuracies": 0.75, "rewards/chosen": -0.26633283495903015, "rewards/margins": 0.34427201747894287, "rewards/rejected": -0.6106047630310059, "step": 157 }, { "epoch": 1.19, "grad_norm": 45.60430910092628, "learning_rate": 3.3426966292134826e-07, "logps/chosen": -41.923038482666016, "logps/rejected": -55.01686477661133, "loss": 0.5493, "losses/dpo": 0.5280421376228333, "losses/sft": 1.6715142726898193, "losses/total": 0.5280421376228333, "ref_logps/chosen": -39.835819244384766, "ref_logps/rejected": -48.61201095581055, "rewards/accuracies": 0.75, "rewards/chosen": -0.20872168242931366, "rewards/margins": 0.43176329135894775, "rewards/rejected": -0.6404849290847778, "step": 158 }, { "epoch": 1.2, "grad_norm": 28.40087306788928, "learning_rate": 3.328651685393258e-07, "logps/chosen": -47.166160583496094, "logps/rejected": -55.27490997314453, "loss": 0.5449, "losses/dpo": 0.5219501256942749, "losses/sft": 1.7682543992996216, "losses/total": 0.5219501256942749, "ref_logps/chosen": -44.67719268798828, "ref_logps/rejected": -48.80986785888672, "rewards/accuracies": 0.75, "rewards/chosen": -0.24889662861824036, "rewards/margins": 0.39760738611221313, "rewards/rejected": -0.6465040445327759, "step": 159 }, { "epoch": 1.21, "grad_norm": 35.44259972458591, "learning_rate": 3.314606741573033e-07, "logps/chosen": -44.4686279296875, "logps/rejected": -55.27857971191406, "loss": 0.532, "losses/dpo": 0.5422082543373108, "losses/sft": 1.8745040893554688, "losses/total": 0.5422082543373108, "ref_logps/chosen": -42.45107650756836, "ref_logps/rejected": -48.90138244628906, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2017555832862854, "rewards/margins": 0.4359641671180725, "rewards/rejected": -0.6377197504043579, "step": 160 }, { "epoch": 1.22, "grad_norm": 31.82269331934412, "learning_rate": 3.300561797752809e-07, "logps/chosen": -45.181365966796875, "logps/rejected": -59.50548553466797, "loss": 0.5146, "losses/dpo": 0.6100755333900452, "losses/sft": 1.9181486368179321, "losses/total": 0.6100755333900452, "ref_logps/chosen": -42.73472595214844, "ref_logps/rejected": -52.02288055419922, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.2446644902229309, "rewards/margins": 0.5035961866378784, "rewards/rejected": -0.7482606172561646, "step": 161 }, { "epoch": 1.22, "grad_norm": 33.32823767781406, "learning_rate": 3.2865168539325844e-07, "logps/chosen": -46.763404846191406, "logps/rejected": -58.536251068115234, "loss": 0.5721, "losses/dpo": 0.6071146130561829, "losses/sft": 1.7083139419555664, "losses/total": 0.6071146130561829, "ref_logps/chosen": -43.77151107788086, "ref_logps/rejected": -51.48747634887695, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.29918932914733887, "rewards/margins": 0.40568807721138, "rewards/rejected": -0.7048774361610413, "step": 162 }, { "epoch": 1.23, "grad_norm": 32.96046529889193, "learning_rate": 3.2724719101123594e-07, "logps/chosen": -45.392822265625, "logps/rejected": -62.101131439208984, "loss": 0.5005, "losses/dpo": 0.40495190024375916, "losses/sft": 1.654756784439087, "losses/total": 0.40495190024375916, "ref_logps/chosen": -43.05342483520508, "ref_logps/rejected": -54.216773986816406, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.23394013941287994, "rewards/margins": 0.5544958114624023, "rewards/rejected": -0.7884359359741211, "step": 163 }, { "epoch": 1.24, "grad_norm": 40.982570800467755, "learning_rate": 3.258426966292135e-07, "logps/chosen": -45.558998107910156, "logps/rejected": -56.33720397949219, "loss": 0.5855, "losses/dpo": 0.44497954845428467, "losses/sft": 1.4462865591049194, "losses/total": 0.44497954845428467, "ref_logps/chosen": -42.405147552490234, "ref_logps/rejected": -49.374412536621094, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.315385103225708, "rewards/margins": 0.38089412450790405, "rewards/rejected": -0.6962792277336121, "step": 164 }, { "epoch": 1.25, "grad_norm": 30.245610190960313, "learning_rate": 3.24438202247191e-07, "logps/chosen": -48.86178207397461, "logps/rejected": -61.23605728149414, "loss": 0.555, "losses/dpo": 0.5053444504737854, "losses/sft": 1.4914695024490356, "losses/total": 0.5053444504737854, "ref_logps/chosen": -45.83845138549805, "ref_logps/rejected": -53.94969177246094, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.30233362317085266, "rewards/margins": 0.4263032078742981, "rewards/rejected": -0.7286368012428284, "step": 165 }, { "epoch": 1.25, "grad_norm": 31.304620096158757, "learning_rate": 3.2303370786516856e-07, "logps/chosen": -44.00810623168945, "logps/rejected": -51.443626403808594, "loss": 0.5616, "losses/dpo": 0.6277773976325989, "losses/sft": 1.5761191844940186, "losses/total": 0.6277773976325989, "ref_logps/chosen": -41.996788024902344, "ref_logps/rejected": -45.35894012451172, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.20113208889961243, "rewards/margins": 0.40733644366264343, "rewards/rejected": -0.6084685325622559, "step": 166 }, { "epoch": 1.26, "grad_norm": 45.990090598533186, "learning_rate": 3.21629213483146e-07, "logps/chosen": -48.94554901123047, "logps/rejected": -59.59689712524414, "loss": 0.5767, "losses/dpo": 0.5611432194709778, "losses/sft": 1.8900790214538574, "losses/total": 0.5611432194709778, "ref_logps/chosen": -45.57502365112305, "ref_logps/rejected": -52.39255905151367, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.3370529115200043, "rewards/margins": 0.3833810091018677, "rewards/rejected": -0.7204338312149048, "step": 167 }, { "epoch": 1.27, "grad_norm": 42.155612007275295, "learning_rate": 3.2022471910112357e-07, "logps/chosen": -52.60509490966797, "logps/rejected": -58.510658264160156, "loss": 0.5634, "losses/dpo": 0.5901261568069458, "losses/sft": 1.9270809888839722, "losses/total": 0.5901261568069458, "ref_logps/chosen": -48.875816345214844, "ref_logps/rejected": -50.73295593261719, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.37292760610580444, "rewards/margins": 0.4048423171043396, "rewards/rejected": -0.7777699828147888, "step": 168 }, { "epoch": 1.28, "grad_norm": 32.65991933431383, "learning_rate": 3.1882022471910107e-07, "logps/chosen": -48.07525634765625, "logps/rejected": -54.59061050415039, "loss": 0.5867, "losses/dpo": 0.5589795112609863, "losses/sft": 1.7771300077438354, "losses/total": 0.5589795112609863, "ref_logps/chosen": -44.67513656616211, "ref_logps/rejected": -47.91284942626953, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.34001192450523376, "rewards/margins": 0.32776492834091187, "rewards/rejected": -0.667776882648468, "step": 169 }, { "epoch": 1.28, "grad_norm": 162.0352544440551, "learning_rate": 3.1741573033707863e-07, "logps/chosen": -49.67656707763672, "logps/rejected": -64.43726348876953, "loss": 0.5375, "losses/dpo": 0.41701579093933105, "losses/sft": 1.8181653022766113, "losses/total": 0.41701579093933105, "ref_logps/chosen": -45.8641357421875, "ref_logps/rejected": -56.07011032104492, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.381242960691452, "rewards/margins": 0.45547229051589966, "rewards/rejected": -0.8367152214050293, "step": 170 }, { "epoch": 1.29, "grad_norm": 61.38392402014484, "learning_rate": 3.160112359550562e-07, "logps/chosen": -50.97727966308594, "logps/rejected": -58.199005126953125, "loss": 0.5633, "losses/dpo": 0.44652259349823, "losses/sft": 1.7464041709899902, "losses/total": 0.44652259349823, "ref_logps/chosen": -47.13948059082031, "ref_logps/rejected": -50.15918731689453, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.38377973437309265, "rewards/margins": 0.4202021360397339, "rewards/rejected": -0.8039818406105042, "step": 171 }, { "epoch": 1.3, "grad_norm": 48.154314403463175, "learning_rate": 3.146067415730337e-07, "logps/chosen": -44.277122497558594, "logps/rejected": -57.93571853637695, "loss": 0.5442, "losses/dpo": 0.564382016658783, "losses/sft": 1.8270690441131592, "losses/total": 0.564382016658783, "ref_logps/chosen": -41.201141357421875, "ref_logps/rejected": -49.990234375, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3075975775718689, "rewards/margins": 0.48695093393325806, "rewards/rejected": -0.794548511505127, "step": 172 }, { "epoch": 1.31, "grad_norm": 31.17540396860089, "learning_rate": 3.1320224719101125e-07, "logps/chosen": -47.254661560058594, "logps/rejected": -53.067962646484375, "loss": 0.5979, "losses/dpo": 0.5169023275375366, "losses/sft": 1.6572108268737793, "losses/total": 0.5169023275375366, "ref_logps/chosen": -43.593841552734375, "ref_logps/rejected": -46.29331588745117, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.3660818934440613, "rewards/margins": 0.3113827109336853, "rewards/rejected": -0.6774646639823914, "step": 173 }, { "epoch": 1.31, "grad_norm": 35.50174408603536, "learning_rate": 3.1179775280898875e-07, "logps/chosen": -46.11635971069336, "logps/rejected": -61.60688400268555, "loss": 0.5559, "losses/dpo": 0.5453978776931763, "losses/sft": 1.6486693620681763, "losses/total": 0.5453978776931763, "ref_logps/chosen": -42.868385314941406, "ref_logps/rejected": -53.524627685546875, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.32479771971702576, "rewards/margins": 0.48342782258987427, "rewards/rejected": -0.8082255125045776, "step": 174 }, { "epoch": 1.32, "grad_norm": 31.767041431724614, "learning_rate": 3.103932584269663e-07, "logps/chosen": -48.18815994262695, "logps/rejected": -59.31452941894531, "loss": 0.5477, "losses/dpo": 0.4422985017299652, "losses/sft": 1.6327428817749023, "losses/total": 0.4422985017299652, "ref_logps/chosen": -44.846229553222656, "ref_logps/rejected": -51.60065460205078, "rewards/accuracies": 0.765625, "rewards/chosen": -0.33419346809387207, "rewards/margins": 0.43719419836997986, "rewards/rejected": -0.7713876962661743, "step": 175 }, { "epoch": 1.33, "grad_norm": 33.86190158158939, "learning_rate": 3.0898876404494376e-07, "logps/chosen": -46.19519805908203, "logps/rejected": -58.27720642089844, "loss": 0.541, "losses/dpo": 0.4748964309692383, "losses/sft": 1.7518017292022705, "losses/total": 0.4748964309692383, "ref_logps/chosen": -42.37709045410156, "ref_logps/rejected": -49.79399490356445, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3818108141422272, "rewards/margins": 0.4665103554725647, "rewards/rejected": -0.8483211398124695, "step": 176 }, { "epoch": 1.34, "grad_norm": 41.19050863338929, "learning_rate": 3.075842696629213e-07, "logps/chosen": -47.045963287353516, "logps/rejected": -58.000709533691406, "loss": 0.5714, "losses/dpo": 0.5359878540039062, "losses/sft": 1.7720417976379395, "losses/total": 0.5359878540039062, "ref_logps/chosen": -43.91377639770508, "ref_logps/rejected": -51.059993743896484, "rewards/accuracies": 0.71875, "rewards/chosen": -0.31321826577186584, "rewards/margins": 0.3808533847332001, "rewards/rejected": -0.6940716505050659, "step": 177 }, { "epoch": 1.34, "grad_norm": 34.0269006355117, "learning_rate": 3.0617977528089887e-07, "logps/chosen": -48.10038757324219, "logps/rejected": -55.35579299926758, "loss": 0.5864, "losses/dpo": 0.5006414651870728, "losses/sft": 1.949295163154602, "losses/total": 0.5006414651870728, "ref_logps/chosen": -43.92369079589844, "ref_logps/rejected": -47.48767852783203, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41766950488090515, "rewards/margins": 0.36914220452308655, "rewards/rejected": -0.7868117094039917, "step": 178 }, { "epoch": 1.35, "grad_norm": 35.07116160148418, "learning_rate": 3.047752808988764e-07, "logps/chosen": -46.21326446533203, "logps/rejected": -61.98849105834961, "loss": 0.5661, "losses/dpo": 0.4972117841243744, "losses/sft": 1.8025044202804565, "losses/total": 0.4972117841243744, "ref_logps/chosen": -42.587913513183594, "ref_logps/rejected": -54.19034957885742, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.36253488063812256, "rewards/margins": 0.41728001832962036, "rewards/rejected": -0.7798148393630981, "step": 179 }, { "epoch": 1.36, "grad_norm": 64.43892536816215, "learning_rate": 3.0337078651685393e-07, "logps/chosen": -47.79494857788086, "logps/rejected": -58.98815155029297, "loss": 0.5203, "losses/dpo": 0.5164665579795837, "losses/sft": 1.8320025205612183, "losses/total": 0.5164665579795837, "ref_logps/chosen": -44.44514083862305, "ref_logps/rejected": -50.60387420654297, "rewards/accuracies": 0.828125, "rewards/chosen": -0.33498066663742065, "rewards/margins": 0.5034468173980713, "rewards/rejected": -0.8384275436401367, "step": 180 }, { "epoch": 1.37, "grad_norm": 55.14335673723716, "learning_rate": 3.0196629213483144e-07, "logps/chosen": -50.4581298828125, "logps/rejected": -63.245201110839844, "loss": 0.5261, "losses/dpo": 0.4643549919128418, "losses/sft": 1.7297191619873047, "losses/total": 0.4643549919128418, "ref_logps/chosen": -46.76393127441406, "ref_logps/rejected": -53.96830749511719, "rewards/accuracies": 0.765625, "rewards/chosen": -0.36941999197006226, "rewards/margins": 0.5582692623138428, "rewards/rejected": -0.927689254283905, "step": 181 }, { "epoch": 1.37, "grad_norm": 31.75120231192772, "learning_rate": 3.00561797752809e-07, "logps/chosen": -47.86188507080078, "logps/rejected": -60.10704803466797, "loss": 0.5437, "losses/dpo": 0.4577639698982239, "losses/sft": 1.600904941558838, "losses/total": 0.4577639698982239, "ref_logps/chosen": -44.270652770996094, "ref_logps/rejected": -51.2730712890625, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.3591231405735016, "rewards/margins": 0.5242742896080017, "rewards/rejected": -0.8833975195884705, "step": 182 }, { "epoch": 1.38, "grad_norm": 40.01187770441235, "learning_rate": 2.991573033707865e-07, "logps/chosen": -45.88461685180664, "logps/rejected": -54.52734375, "loss": 0.546, "losses/dpo": 0.45084521174430847, "losses/sft": 1.8426713943481445, "losses/total": 0.45084521174430847, "ref_logps/chosen": -42.66088104248047, "ref_logps/rejected": -46.751834869384766, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.3223740756511688, "rewards/margins": 0.45517635345458984, "rewards/rejected": -0.7775503993034363, "step": 183 }, { "epoch": 1.39, "grad_norm": 28.32565045234733, "learning_rate": 2.9775280898876406e-07, "logps/chosen": -51.762855529785156, "logps/rejected": -59.046051025390625, "loss": 0.5759, "losses/dpo": 0.5901447534561157, "losses/sft": 1.8084410429000854, "losses/total": 0.5901447534561157, "ref_logps/chosen": -47.22824478149414, "ref_logps/rejected": -50.54530334472656, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.4534617066383362, "rewards/margins": 0.3966123163700104, "rewards/rejected": -0.850074052810669, "step": 184 }, { "epoch": 1.4, "grad_norm": 32.75069177448169, "learning_rate": 2.9634831460674156e-07, "logps/chosen": -46.012393951416016, "logps/rejected": -57.30610656738281, "loss": 0.5681, "losses/dpo": 0.541546106338501, "losses/sft": 1.5090305805206299, "losses/total": 0.541546106338501, "ref_logps/chosen": -42.05977249145508, "ref_logps/rejected": -49.48423385620117, "rewards/accuracies": 0.71875, "rewards/chosen": -0.39526206254959106, "rewards/margins": 0.38692545890808105, "rewards/rejected": -0.7821875810623169, "step": 185 }, { "epoch": 1.4, "grad_norm": 77.94742656856977, "learning_rate": 2.9494382022471906e-07, "logps/chosen": -46.43281936645508, "logps/rejected": -58.593360900878906, "loss": 0.5237, "losses/dpo": 0.6953313946723938, "losses/sft": 1.787532925605774, "losses/total": 0.6953313946723938, "ref_logps/chosen": -42.994911193847656, "ref_logps/rejected": -49.855743408203125, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.34379082918167114, "rewards/margins": 0.5299713611602783, "rewards/rejected": -0.8737622499465942, "step": 186 }, { "epoch": 1.41, "grad_norm": 32.59394106553326, "learning_rate": 2.935393258426966e-07, "logps/chosen": -44.59947967529297, "logps/rejected": -57.210365295410156, "loss": 0.5549, "losses/dpo": 0.6337884664535522, "losses/sft": 1.8655121326446533, "losses/total": 0.6337884664535522, "ref_logps/chosen": -41.383880615234375, "ref_logps/rejected": -49.22904968261719, "rewards/accuracies": 0.75, "rewards/chosen": -0.32155996561050415, "rewards/margins": 0.47657155990600586, "rewards/rejected": -0.79813152551651, "step": 187 }, { "epoch": 1.42, "grad_norm": 36.14222527068887, "learning_rate": 2.921348314606741e-07, "logps/chosen": -45.52980041503906, "logps/rejected": -59.41967010498047, "loss": 0.5155, "losses/dpo": 0.5147260427474976, "losses/sft": 1.8530042171478271, "losses/total": 0.5147260427474976, "ref_logps/chosen": -42.301246643066406, "ref_logps/rejected": -50.82106018066406, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3228548765182495, "rewards/margins": 0.5370060205459595, "rewards/rejected": -0.8598609566688538, "step": 188 }, { "epoch": 1.43, "grad_norm": 30.630961497137292, "learning_rate": 2.907303370786517e-07, "logps/chosen": -48.7333984375, "logps/rejected": -56.453704833984375, "loss": 0.5967, "losses/dpo": 0.5994954109191895, "losses/sft": 1.7053594589233398, "losses/total": 0.5994954109191895, "ref_logps/chosen": -44.93768310546875, "ref_logps/rejected": -49.572078704833984, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.37957125902175903, "rewards/margins": 0.3085916042327881, "rewards/rejected": -0.6881627440452576, "step": 189 }, { "epoch": 1.43, "grad_norm": 30.316992638253904, "learning_rate": 2.893258426966292e-07, "logps/chosen": -44.77424621582031, "logps/rejected": -58.78388977050781, "loss": 0.5401, "losses/dpo": 0.6450770497322083, "losses/sft": 1.7625247240066528, "losses/total": 0.6450770497322083, "ref_logps/chosen": -41.57470703125, "ref_logps/rejected": -50.766357421875, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3199542760848999, "rewards/margins": 0.48179924488067627, "rewards/rejected": -0.8017535209655762, "step": 190 }, { "epoch": 1.44, "grad_norm": 44.704805549949086, "learning_rate": 2.8792134831460674e-07, "logps/chosen": -50.617286682128906, "logps/rejected": -54.88397216796875, "loss": 0.5832, "losses/dpo": 0.6402714252471924, "losses/sft": 1.8502634763717651, "losses/total": 0.6402714252471924, "ref_logps/chosen": -46.51710891723633, "ref_logps/rejected": -46.80644226074219, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41001784801483154, "rewards/margins": 0.39773547649383545, "rewards/rejected": -0.8077532649040222, "step": 191 }, { "epoch": 1.45, "grad_norm": 39.36398982377757, "learning_rate": 2.8651685393258425e-07, "logps/chosen": -50.05918502807617, "logps/rejected": -59.800270080566406, "loss": 0.5726, "losses/dpo": 0.570289134979248, "losses/sft": 1.8627090454101562, "losses/total": 0.570289134979248, "ref_logps/chosen": -46.30699157714844, "ref_logps/rejected": -52.19733810424805, "rewards/accuracies": 0.703125, "rewards/chosen": -0.37521904706954956, "rewards/margins": 0.3850741386413574, "rewards/rejected": -0.760293185710907, "step": 192 }, { "epoch": 1.46, "grad_norm": 29.537825459501168, "learning_rate": 2.851123595505618e-07, "logps/chosen": -51.252479553222656, "logps/rejected": -62.581695556640625, "loss": 0.5106, "losses/dpo": 0.47966766357421875, "losses/sft": 2.082817554473877, "losses/total": 0.47966766357421875, "ref_logps/chosen": -47.88975524902344, "ref_logps/rejected": -53.814727783203125, "rewards/accuracies": 0.765625, "rewards/chosen": -0.33627209067344666, "rewards/margins": 0.5404240489006042, "rewards/rejected": -0.8766961097717285, "step": 193 }, { "epoch": 1.46, "grad_norm": 31.919051308589587, "learning_rate": 2.8370786516853936e-07, "logps/chosen": -49.25547790527344, "logps/rejected": -61.38949203491211, "loss": 0.5679, "losses/dpo": 0.5522312521934509, "losses/sft": 1.8139228820800781, "losses/total": 0.5522312521934509, "ref_logps/chosen": -45.594242095947266, "ref_logps/rejected": -53.25303268432617, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3661239743232727, "rewards/margins": 0.44752201437950134, "rewards/rejected": -0.8136459589004517, "step": 194 }, { "epoch": 1.47, "grad_norm": 51.0275405052851, "learning_rate": 2.823033707865168e-07, "logps/chosen": -43.275535583496094, "logps/rejected": -56.02321243286133, "loss": 0.5403, "losses/dpo": 0.40436556935310364, "losses/sft": 1.73399817943573, "losses/total": 0.40436556935310364, "ref_logps/chosen": -40.606422424316406, "ref_logps/rejected": -48.55220413208008, "rewards/accuracies": 0.71875, "rewards/chosen": -0.26691168546676636, "rewards/margins": 0.48018932342529297, "rewards/rejected": -0.7471009492874146, "step": 195 }, { "epoch": 1.48, "grad_norm": 56.00574208971232, "learning_rate": 2.8089887640449437e-07, "logps/chosen": -47.662986755371094, "logps/rejected": -59.25799560546875, "loss": 0.5371, "losses/dpo": 0.3397861421108246, "losses/sft": 1.6196953058242798, "losses/total": 0.3397861421108246, "ref_logps/chosen": -44.04248809814453, "ref_logps/rejected": -50.62962341308594, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.362050324678421, "rewards/margins": 0.5007868409156799, "rewards/rejected": -0.8628372550010681, "step": 196 }, { "epoch": 1.49, "grad_norm": 124.18849555191056, "learning_rate": 2.794943820224719e-07, "logps/chosen": -42.69883728027344, "logps/rejected": -59.230831146240234, "loss": 0.5358, "losses/dpo": 0.5106825232505798, "losses/sft": 1.860982894897461, "losses/total": 0.5106825232505798, "ref_logps/chosen": -38.96552276611328, "ref_logps/rejected": -50.499732971191406, "rewards/accuracies": 0.796875, "rewards/chosen": -0.37333178520202637, "rewards/margins": 0.4997779130935669, "rewards/rejected": -0.8731096386909485, "step": 197 }, { "epoch": 1.5, "grad_norm": 34.35010974976012, "learning_rate": 2.7808988764044943e-07, "logps/chosen": -44.7771110534668, "logps/rejected": -54.08772659301758, "loss": 0.6115, "losses/dpo": 0.5737531781196594, "losses/sft": 1.5354400873184204, "losses/total": 0.5737531781196594, "ref_logps/chosen": -40.72944259643555, "ref_logps/rejected": -46.860530853271484, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.4047670364379883, "rewards/margins": 0.3179532289505005, "rewards/rejected": -0.7227202653884888, "step": 198 }, { "epoch": 1.5, "grad_norm": 45.84026099289128, "learning_rate": 2.7668539325842694e-07, "logps/chosen": -44.04542922973633, "logps/rejected": -56.463775634765625, "loss": 0.5334, "losses/dpo": 0.5879970192909241, "losses/sft": 1.720085859298706, "losses/total": 0.5879970192909241, "ref_logps/chosen": -39.91609191894531, "ref_logps/rejected": -47.64347839355469, "rewards/accuracies": 0.75, "rewards/chosen": -0.41293373703956604, "rewards/margins": 0.46909624338150024, "rewards/rejected": -0.8820299506187439, "step": 199 }, { "epoch": 1.51, "grad_norm": 28.809322005950122, "learning_rate": 2.752808988764045e-07, "logps/chosen": -45.40152359008789, "logps/rejected": -62.26337432861328, "loss": 0.4963, "losses/dpo": 0.47846901416778564, "losses/sft": 1.7329440116882324, "losses/total": 0.47846901416778564, "ref_logps/chosen": -41.95344924926758, "ref_logps/rejected": -53.119686126708984, "rewards/accuracies": 0.84375, "rewards/chosen": -0.344807893037796, "rewards/margins": 0.569560706615448, "rewards/rejected": -0.9143686294555664, "step": 200 }, { "epoch": 1.52, "grad_norm": 63.538121725316046, "learning_rate": 2.73876404494382e-07, "logps/chosen": -47.31874465942383, "logps/rejected": -59.53126907348633, "loss": 0.5568, "losses/dpo": 0.43703195452690125, "losses/sft": 1.7420238256454468, "losses/total": 0.43703195452690125, "ref_logps/chosen": -43.84735107421875, "ref_logps/rejected": -51.49256134033203, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.34713926911354065, "rewards/margins": 0.45673197507858276, "rewards/rejected": -0.803871214389801, "step": 201 }, { "epoch": 1.53, "grad_norm": 38.224242158370366, "learning_rate": 2.7247191011235955e-07, "logps/chosen": -45.99871826171875, "logps/rejected": -61.235435485839844, "loss": 0.5111, "losses/dpo": 0.5159462094306946, "losses/sft": 1.8562134504318237, "losses/total": 0.5159462094306946, "ref_logps/chosen": -42.85459899902344, "ref_logps/rejected": -52.07539367675781, "rewards/accuracies": 0.828125, "rewards/chosen": -0.31441253423690796, "rewards/margins": 0.6015917658805847, "rewards/rejected": -0.9160042405128479, "step": 202 }, { "epoch": 1.53, "grad_norm": 30.6100428508092, "learning_rate": 2.710674157303371e-07, "logps/chosen": -48.29388427734375, "logps/rejected": -60.84185791015625, "loss": 0.5385, "losses/dpo": 0.42853569984436035, "losses/sft": 1.5987792015075684, "losses/total": 0.42853569984436035, "ref_logps/chosen": -44.793338775634766, "ref_logps/rejected": -52.13080596923828, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.35005441308021545, "rewards/margins": 0.5210508108139038, "rewards/rejected": -0.8711051940917969, "step": 203 }, { "epoch": 1.54, "grad_norm": 39.86846830158686, "learning_rate": 2.6966292134831456e-07, "logps/chosen": -46.5351676940918, "logps/rejected": -59.753639221191406, "loss": 0.5377, "losses/dpo": 0.4748055040836334, "losses/sft": 1.6433026790618896, "losses/total": 0.4748055040836334, "ref_logps/chosen": -42.88300704956055, "ref_logps/rejected": -50.99637985229492, "rewards/accuracies": 0.734375, "rewards/chosen": -0.36521610617637634, "rewards/margins": 0.5105096101760864, "rewards/rejected": -0.8757257461547852, "step": 204 }, { "epoch": 1.55, "grad_norm": 32.7669216206013, "learning_rate": 2.682584269662921e-07, "logps/chosen": -49.69789505004883, "logps/rejected": -63.739933013916016, "loss": 0.5161, "losses/dpo": 0.40675729513168335, "losses/sft": 1.7476611137390137, "losses/total": 0.40675729513168335, "ref_logps/chosen": -45.77500915527344, "ref_logps/rejected": -54.30220413208008, "rewards/accuracies": 0.78125, "rewards/chosen": -0.39228886365890503, "rewards/margins": 0.5514840483665466, "rewards/rejected": -0.9437730312347412, "step": 205 }, { "epoch": 1.56, "grad_norm": 42.49072710925156, "learning_rate": 2.668539325842696e-07, "logps/chosen": -47.16294860839844, "logps/rejected": -59.8863525390625, "loss": 0.5289, "losses/dpo": 0.4717567563056946, "losses/sft": 1.7502728700637817, "losses/total": 0.4717567563056946, "ref_logps/chosen": -43.35600280761719, "ref_logps/rejected": -51.03800582885742, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3806946575641632, "rewards/margins": 0.5041399002075195, "rewards/rejected": -0.8848345279693604, "step": 206 }, { "epoch": 1.56, "grad_norm": 28.098551510255913, "learning_rate": 2.654494382022472e-07, "logps/chosen": -51.461143493652344, "logps/rejected": -64.59919738769531, "loss": 0.5299, "losses/dpo": 0.4931321144104004, "losses/sft": 1.7711026668548584, "losses/total": 0.4931321144104004, "ref_logps/chosen": -47.48460006713867, "ref_logps/rejected": -55.880096435546875, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.3976547122001648, "rewards/margins": 0.4742559790611267, "rewards/rejected": -0.8719106912612915, "step": 207 }, { "epoch": 1.57, "grad_norm": 30.65688530023256, "learning_rate": 2.640449438202247e-07, "logps/chosen": -49.34468460083008, "logps/rejected": -63.70561599731445, "loss": 0.5083, "losses/dpo": 0.6479763984680176, "losses/sft": 1.9399117231369019, "losses/total": 0.6479763984680176, "ref_logps/chosen": -45.291812896728516, "ref_logps/rejected": -53.408966064453125, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.40528708696365356, "rewards/margins": 0.6243781447410583, "rewards/rejected": -1.029665231704712, "step": 208 }, { "epoch": 1.58, "grad_norm": 44.09912447914834, "learning_rate": 2.6264044943820224e-07, "logps/chosen": -45.324851989746094, "logps/rejected": -58.4406623840332, "loss": 0.5472, "losses/dpo": 0.4574645161628723, "losses/sft": 1.7848825454711914, "losses/total": 0.4574645161628723, "ref_logps/chosen": -41.625450134277344, "ref_logps/rejected": -49.83580017089844, "rewards/accuracies": 0.75, "rewards/chosen": -0.3699401021003723, "rewards/margins": 0.49054551124572754, "rewards/rejected": -0.8604856729507446, "step": 209 }, { "epoch": 1.59, "grad_norm": 32.77767153915824, "learning_rate": 2.612359550561798e-07, "logps/chosen": -43.616249084472656, "logps/rejected": -63.2964973449707, "loss": 0.5, "losses/dpo": 0.5640104413032532, "losses/sft": 1.6949553489685059, "losses/total": 0.5640104413032532, "ref_logps/chosen": -39.909263610839844, "ref_logps/rejected": -53.50825881958008, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3706982731819153, "rewards/margins": 0.6081258058547974, "rewards/rejected": -0.9788240194320679, "step": 210 }, { "epoch": 1.59, "grad_norm": 98.15979576097294, "learning_rate": 2.598314606741573e-07, "logps/chosen": -46.486968994140625, "logps/rejected": -61.16170120239258, "loss": 0.5229, "losses/dpo": 0.38548439741134644, "losses/sft": 1.6069914102554321, "losses/total": 0.38548439741134644, "ref_logps/chosen": -42.42946243286133, "ref_logps/rejected": -51.792518615722656, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4057506322860718, "rewards/margins": 0.5311680436134338, "rewards/rejected": -0.9369186162948608, "step": 211 }, { "epoch": 1.6, "grad_norm": 53.09149382525376, "learning_rate": 2.5842696629213486e-07, "logps/chosen": -46.488346099853516, "logps/rejected": -57.01716613769531, "loss": 0.5408, "losses/dpo": 0.4543725848197937, "losses/sft": 1.752640724182129, "losses/total": 0.4543725848197937, "ref_logps/chosen": -41.83614730834961, "ref_logps/rejected": -47.73747253417969, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4652196168899536, "rewards/margins": 0.46275004744529724, "rewards/rejected": -0.9279696941375732, "step": 212 }, { "epoch": 1.61, "grad_norm": 37.47375544440582, "learning_rate": 2.5702247191011236e-07, "logps/chosen": -49.63587951660156, "logps/rejected": -62.40115737915039, "loss": 0.5283, "losses/dpo": 0.6338381767272949, "losses/sft": 1.8126436471939087, "losses/total": 0.6338381767272949, "ref_logps/chosen": -45.24321746826172, "ref_logps/rejected": -52.804649353027344, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.4392660856246948, "rewards/margins": 0.5203849673271179, "rewards/rejected": -0.9596511125564575, "step": 213 }, { "epoch": 1.62, "grad_norm": 26.990285757104143, "learning_rate": 2.5561797752808987e-07, "logps/chosen": -45.917240142822266, "logps/rejected": -61.10861587524414, "loss": 0.5386, "losses/dpo": 0.4945739805698395, "losses/sft": 1.8248525857925415, "losses/total": 0.4945739805698395, "ref_logps/chosen": -41.92540740966797, "ref_logps/rejected": -51.52170181274414, "rewards/accuracies": 0.75, "rewards/chosen": -0.39918389916419983, "rewards/margins": 0.5595079064369202, "rewards/rejected": -0.9586918354034424, "step": 214 }, { "epoch": 1.62, "grad_norm": 32.608593813749025, "learning_rate": 2.5421348314606737e-07, "logps/chosen": -45.396209716796875, "logps/rejected": -58.39073181152344, "loss": 0.5317, "losses/dpo": 0.4668882489204407, "losses/sft": 1.718886375427246, "losses/total": 0.4668882489204407, "ref_logps/chosen": -40.89415740966797, "ref_logps/rejected": -48.5245361328125, "rewards/accuracies": 0.75, "rewards/chosen": -0.4502052962779999, "rewards/margins": 0.5364142656326294, "rewards/rejected": -0.9866195321083069, "step": 215 }, { "epoch": 1.63, "grad_norm": 56.98192968115574, "learning_rate": 2.5280898876404493e-07, "logps/chosen": -49.24924850463867, "logps/rejected": -60.58833312988281, "loss": 0.5617, "losses/dpo": 0.4475604295730591, "losses/sft": 1.698026418685913, "losses/total": 0.4475604295730591, "ref_logps/chosen": -44.60601806640625, "ref_logps/rejected": -51.03276443481445, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4643232226371765, "rewards/margins": 0.4912331700325012, "rewards/rejected": -0.9555563926696777, "step": 216 }, { "epoch": 1.64, "grad_norm": 41.423918556154725, "learning_rate": 2.5140449438202243e-07, "logps/chosen": -45.89344024658203, "logps/rejected": -61.97504425048828, "loss": 0.5017, "losses/dpo": 0.46613746881484985, "losses/sft": 1.733440637588501, "losses/total": 0.46613746881484985, "ref_logps/chosen": -41.85301208496094, "ref_logps/rejected": -51.947357177734375, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4040428400039673, "rewards/margins": 0.5987257957458496, "rewards/rejected": -1.002768635749817, "step": 217 }, { "epoch": 1.65, "grad_norm": 31.382696226938254, "learning_rate": 2.5e-07, "logps/chosen": -44.27947235107422, "logps/rejected": -58.333377838134766, "loss": 0.4861, "losses/dpo": 0.4522477388381958, "losses/sft": 1.7368268966674805, "losses/total": 0.4522477388381958, "ref_logps/chosen": -40.75891876220703, "ref_logps/rejected": -48.761932373046875, "rewards/accuracies": 0.859375, "rewards/chosen": -0.35205551981925964, "rewards/margins": 0.6050889492034912, "rewards/rejected": -0.9571444988250732, "step": 218 }, { "epoch": 1.65, "grad_norm": 37.84145121945044, "learning_rate": 2.485955056179775e-07, "logps/chosen": -50.229488372802734, "logps/rejected": -60.491233825683594, "loss": 0.5263, "losses/dpo": 0.5404773354530334, "losses/sft": 1.7611405849456787, "losses/total": 0.5404773354530334, "ref_logps/chosen": -46.2949104309082, "ref_logps/rejected": -50.83322525024414, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.3934575021266937, "rewards/margins": 0.5723429918289185, "rewards/rejected": -0.9658005237579346, "step": 219 }, { "epoch": 1.66, "grad_norm": 31.906557608856495, "learning_rate": 2.4719101123595505e-07, "logps/chosen": -49.52198028564453, "logps/rejected": -60.96833038330078, "loss": 0.5477, "losses/dpo": 0.48801130056381226, "losses/sft": 1.6951342821121216, "losses/total": 0.48801130056381226, "ref_logps/chosen": -45.28654479980469, "ref_logps/rejected": -51.84858322143555, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4235435426235199, "rewards/margins": 0.4884312152862549, "rewards/rejected": -0.9119747281074524, "step": 220 }, { "epoch": 1.67, "grad_norm": 28.645491194164865, "learning_rate": 2.4578651685393255e-07, "logps/chosen": -46.441986083984375, "logps/rejected": -60.918113708496094, "loss": 0.5109, "losses/dpo": 0.44131675362586975, "losses/sft": 1.6665538549423218, "losses/total": 0.44131675362586975, "ref_logps/chosen": -42.85387420654297, "ref_logps/rejected": -51.00918960571289, "rewards/accuracies": 0.734375, "rewards/chosen": -0.35881122946739197, "rewards/margins": 0.6320809721946716, "rewards/rejected": -0.9908921718597412, "step": 221 }, { "epoch": 1.68, "grad_norm": 36.02973192633452, "learning_rate": 2.443820224719101e-07, "logps/chosen": -48.31787872314453, "logps/rejected": -60.520626068115234, "loss": 0.5137, "losses/dpo": 0.541698694229126, "losses/sft": 1.9254802465438843, "losses/total": 0.541698694229126, "ref_logps/chosen": -43.673789978027344, "ref_logps/rejected": -49.71205139160156, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4644095301628113, "rewards/margins": 0.6164478063583374, "rewards/rejected": -1.0808573961257935, "step": 222 }, { "epoch": 1.68, "grad_norm": 30.675714107847632, "learning_rate": 2.429775280898876e-07, "logps/chosen": -49.2231559753418, "logps/rejected": -60.815208435058594, "loss": 0.5039, "losses/dpo": 0.5225088596343994, "losses/sft": 1.7494632005691528, "losses/total": 0.5225088596343994, "ref_logps/chosen": -44.78702163696289, "ref_logps/rejected": -50.05812072753906, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.44361379742622375, "rewards/margins": 0.6320949792861938, "rewards/rejected": -1.0757088661193848, "step": 223 }, { "epoch": 1.69, "grad_norm": 31.597392161725715, "learning_rate": 2.4157303370786517e-07, "logps/chosen": -51.469242095947266, "logps/rejected": -63.69596481323242, "loss": 0.5421, "losses/dpo": 0.5044399499893188, "losses/sft": 1.823853611946106, "losses/total": 0.5044399499893188, "ref_logps/chosen": -46.0730094909668, "ref_logps/rejected": -53.22626495361328, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.539623498916626, "rewards/margins": 0.5073459148406982, "rewards/rejected": -1.0469695329666138, "step": 224 }, { "epoch": 1.7, "grad_norm": 23.91725282640204, "learning_rate": 2.401685393258427e-07, "logps/chosen": -46.35022735595703, "logps/rejected": -56.71597671508789, "loss": 0.5304, "losses/dpo": 0.4879852533340454, "losses/sft": 1.820909023284912, "losses/total": 0.4879852533340454, "ref_logps/chosen": -42.42784881591797, "ref_logps/rejected": -47.75648880004883, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.39223790168762207, "rewards/margins": 0.503710925579071, "rewards/rejected": -0.8959488272666931, "step": 225 }, { "epoch": 1.71, "grad_norm": 32.603634749589574, "learning_rate": 2.3876404494382023e-07, "logps/chosen": -50.741065979003906, "logps/rejected": -63.19419479370117, "loss": 0.5178, "losses/dpo": 0.5530965328216553, "losses/sft": 1.8346784114837646, "losses/total": 0.5530965328216553, "ref_logps/chosen": -46.495025634765625, "ref_logps/rejected": -53.35783386230469, "rewards/accuracies": 0.75, "rewards/chosen": -0.42460453510284424, "rewards/margins": 0.5590318441390991, "rewards/rejected": -0.9836363792419434, "step": 226 }, { "epoch": 1.71, "grad_norm": 38.60107352667027, "learning_rate": 2.3735955056179774e-07, "logps/chosen": -48.30965805053711, "logps/rejected": -59.211700439453125, "loss": 0.5861, "losses/dpo": 0.5638729929924011, "losses/sft": 1.714339017868042, "losses/total": 0.5638729929924011, "ref_logps/chosen": -43.16453170776367, "ref_logps/rejected": -49.69733428955078, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5145125389099121, "rewards/margins": 0.4369240999221802, "rewards/rejected": -0.9514365792274475, "step": 227 }, { "epoch": 1.72, "grad_norm": 30.395931193640205, "learning_rate": 2.3595505617977527e-07, "logps/chosen": -47.91743850708008, "logps/rejected": -62.85133361816406, "loss": 0.4782, "losses/dpo": 0.4003780484199524, "losses/sft": 1.939863681793213, "losses/total": 0.4003780484199524, "ref_logps/chosen": -43.55271911621094, "ref_logps/rejected": -51.54827117919922, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.4364721179008484, "rewards/margins": 0.693834125995636, "rewards/rejected": -1.1303062438964844, "step": 228 }, { "epoch": 1.73, "grad_norm": 30.588932596057163, "learning_rate": 2.345505617977528e-07, "logps/chosen": -52.0521354675293, "logps/rejected": -64.27798461914062, "loss": 0.5114, "losses/dpo": 0.35463789105415344, "losses/sft": 1.745693564414978, "losses/total": 0.35463789105415344, "ref_logps/chosen": -47.34131622314453, "ref_logps/rejected": -53.40336227416992, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.47108158469200134, "rewards/margins": 0.6163800358772278, "rewards/rejected": -1.0874615907669067, "step": 229 }, { "epoch": 1.74, "grad_norm": 49.464630190767615, "learning_rate": 2.331460674157303e-07, "logps/chosen": -50.147239685058594, "logps/rejected": -61.73895263671875, "loss": 0.5456, "losses/dpo": 0.4513179361820221, "losses/sft": 1.973555088043213, "losses/total": 0.4513179361820221, "ref_logps/chosen": -45.20698547363281, "ref_logps/rejected": -51.76478958129883, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.4940255284309387, "rewards/margins": 0.5033905506134033, "rewards/rejected": -0.9974161386489868, "step": 230 }, { "epoch": 1.74, "grad_norm": 46.12149051355562, "learning_rate": 2.3174157303370786e-07, "logps/chosen": -51.27939987182617, "logps/rejected": -56.45414733886719, "loss": 0.5623, "losses/dpo": 0.5874257683753967, "losses/sft": 1.9866621494293213, "losses/total": 0.5874257683753967, "ref_logps/chosen": -46.1025390625, "ref_logps/rejected": -46.94108963012695, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5176857709884644, "rewards/margins": 0.4336201548576355, "rewards/rejected": -0.9513058662414551, "step": 231 }, { "epoch": 1.75, "grad_norm": 48.970871681732376, "learning_rate": 2.303370786516854e-07, "logps/chosen": -48.21266174316406, "logps/rejected": -59.834651947021484, "loss": 0.4737, "losses/dpo": 0.4422827363014221, "losses/sft": 1.857910394668579, "losses/total": 0.4422827363014221, "ref_logps/chosen": -44.37346267700195, "ref_logps/rejected": -49.12510681152344, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.38392019271850586, "rewards/margins": 0.6870340704917908, "rewards/rejected": -1.0709542036056519, "step": 232 }, { "epoch": 1.76, "grad_norm": 46.939250964453656, "learning_rate": 2.2893258426966292e-07, "logps/chosen": -51.792171478271484, "logps/rejected": -64.41178131103516, "loss": 0.5099, "losses/dpo": 0.441424697637558, "losses/sft": 1.914039134979248, "losses/total": 0.441424697637558, "ref_logps/chosen": -46.90717315673828, "ref_logps/rejected": -53.63972473144531, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.4884992241859436, "rewards/margins": 0.5887070298194885, "rewards/rejected": -1.0772062540054321, "step": 233 }, { "epoch": 1.77, "grad_norm": 38.40358700762194, "learning_rate": 2.2752808988764045e-07, "logps/chosen": -48.622169494628906, "logps/rejected": -61.32197570800781, "loss": 0.5272, "losses/dpo": 0.5013576149940491, "losses/sft": 2.1135737895965576, "losses/total": 0.5013576149940491, "ref_logps/chosen": -43.769927978515625, "ref_logps/rejected": -50.77492904663086, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.48522406816482544, "rewards/margins": 0.5694804191589355, "rewards/rejected": -1.0547044277191162, "step": 234 }, { "epoch": 1.77, "grad_norm": 26.909977071922068, "learning_rate": 2.2612359550561795e-07, "logps/chosen": -46.23896408081055, "logps/rejected": -57.86852264404297, "loss": 0.5217, "losses/dpo": 0.5148915648460388, "losses/sft": 1.7078696489334106, "losses/total": 0.5148915648460388, "ref_logps/chosen": -41.81940841674805, "ref_logps/rejected": -47.86483383178711, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4419558048248291, "rewards/margins": 0.5584129095077515, "rewards/rejected": -1.0003687143325806, "step": 235 }, { "epoch": 1.78, "grad_norm": 35.54472324448, "learning_rate": 2.2471910112359549e-07, "logps/chosen": -47.79429244995117, "logps/rejected": -61.29048156738281, "loss": 0.5591, "losses/dpo": 0.4298320412635803, "losses/sft": 1.7925626039505005, "losses/total": 0.4298320412635803, "ref_logps/chosen": -43.1577033996582, "ref_logps/rejected": -51.852821350097656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4636591374874115, "rewards/margins": 0.4801071882247925, "rewards/rejected": -0.9437662363052368, "step": 236 }, { "epoch": 1.79, "grad_norm": 26.89707018363376, "learning_rate": 2.2331460674157302e-07, "logps/chosen": -50.49708557128906, "logps/rejected": -67.8884506225586, "loss": 0.4965, "losses/dpo": 0.3560337424278259, "losses/sft": 1.9150320291519165, "losses/total": 0.3560337424278259, "ref_logps/chosen": -45.65766525268555, "ref_logps/rejected": -55.920318603515625, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4839417338371277, "rewards/margins": 0.712871253490448, "rewards/rejected": -1.1968129873275757, "step": 237 }, { "epoch": 1.8, "grad_norm": 33.289758441651166, "learning_rate": 2.2191011235955055e-07, "logps/chosen": -52.39525604248047, "logps/rejected": -67.31971740722656, "loss": 0.5416, "losses/dpo": 0.548444390296936, "losses/sft": 1.7192367315292358, "losses/total": 0.548444390296936, "ref_logps/chosen": -46.50968933105469, "ref_logps/rejected": -55.49256896972656, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5885568857192993, "rewards/margins": 0.594158947467804, "rewards/rejected": -1.1827157735824585, "step": 238 }, { "epoch": 1.8, "grad_norm": 33.718323383105826, "learning_rate": 2.205056179775281e-07, "logps/chosen": -47.30939483642578, "logps/rejected": -62.0474967956543, "loss": 0.5128, "losses/dpo": 0.5326447486877441, "losses/sft": 1.7835038900375366, "losses/total": 0.5326447486877441, "ref_logps/chosen": -42.9058952331543, "ref_logps/rejected": -50.751258850097656, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.4403497874736786, "rewards/margins": 0.6892733573913574, "rewards/rejected": -1.1296230554580688, "step": 239 }, { "epoch": 1.81, "grad_norm": 30.600452182349464, "learning_rate": 2.191011235955056e-07, "logps/chosen": -55.63336944580078, "logps/rejected": -62.522911071777344, "loss": 0.5754, "losses/dpo": 0.5076375603675842, "losses/sft": 1.9160699844360352, "losses/total": 0.5076375603675842, "ref_logps/chosen": -50.34322738647461, "ref_logps/rejected": -52.08367919921875, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5290137529373169, "rewards/margins": 0.5149096250534058, "rewards/rejected": -1.0439234972000122, "step": 240 }, { "epoch": 1.82, "grad_norm": 26.979680099380946, "learning_rate": 2.1769662921348314e-07, "logps/chosen": -50.20891571044922, "logps/rejected": -61.1137580871582, "loss": 0.5105, "losses/dpo": 0.6182360649108887, "losses/sft": 1.9309462308883667, "losses/total": 0.6182360649108887, "ref_logps/chosen": -44.912757873535156, "ref_logps/rejected": -49.37834167480469, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5296155214309692, "rewards/margins": 0.643925666809082, "rewards/rejected": -1.1735411882400513, "step": 241 }, { "epoch": 1.83, "grad_norm": 42.09299487617717, "learning_rate": 2.1629213483146067e-07, "logps/chosen": -46.089210510253906, "logps/rejected": -64.08358764648438, "loss": 0.4714, "losses/dpo": 0.29059717059135437, "losses/sft": 1.8697593212127686, "losses/total": 0.29059717059135437, "ref_logps/chosen": -41.43824768066406, "ref_logps/rejected": -52.116554260253906, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.46509572863578796, "rewards/margins": 0.7316084504127502, "rewards/rejected": -1.1967041492462158, "step": 242 }, { "epoch": 1.83, "grad_norm": 35.24869338904229, "learning_rate": 2.148876404494382e-07, "logps/chosen": -50.051204681396484, "logps/rejected": -58.113502502441406, "loss": 0.5638, "losses/dpo": 0.6333197951316833, "losses/sft": 1.6995980739593506, "losses/total": 0.6333197951316833, "ref_logps/chosen": -44.55199432373047, "ref_logps/rejected": -47.981117248535156, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5499215126037598, "rewards/margins": 0.4633174538612366, "rewards/rejected": -1.0132389068603516, "step": 243 }, { "epoch": 1.84, "grad_norm": 34.27329658279247, "learning_rate": 2.134831460674157e-07, "logps/chosen": -47.54253387451172, "logps/rejected": -61.82128143310547, "loss": 0.4842, "losses/dpo": 0.4167064428329468, "losses/sft": 1.7835010290145874, "losses/total": 0.4167064428329468, "ref_logps/chosen": -43.070247650146484, "ref_logps/rejected": -50.46671676635742, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.44722840189933777, "rewards/margins": 0.6882277131080627, "rewards/rejected": -1.1354560852050781, "step": 244 }, { "epoch": 1.85, "grad_norm": 27.01160297073374, "learning_rate": 2.1207865168539323e-07, "logps/chosen": -47.44729995727539, "logps/rejected": -63.50260543823242, "loss": 0.5261, "losses/dpo": 0.4638723134994507, "losses/sft": 1.8679862022399902, "losses/total": 0.4638723134994507, "ref_logps/chosen": -42.23263931274414, "ref_logps/rejected": -52.155338287353516, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5214660167694092, "rewards/margins": 0.6132616400718689, "rewards/rejected": -1.1347277164459229, "step": 245 }, { "epoch": 1.86, "grad_norm": 35.678092923643696, "learning_rate": 2.1067415730337076e-07, "logps/chosen": -52.759498596191406, "logps/rejected": -65.6281509399414, "loss": 0.5028, "losses/dpo": 0.42901700735092163, "losses/sft": 1.725085973739624, "losses/total": 0.42901700735092163, "ref_logps/chosen": -48.75430679321289, "ref_logps/rejected": -55.151390075683594, "rewards/accuracies": 0.78125, "rewards/chosen": -0.40051960945129395, "rewards/margins": 0.6471560001373291, "rewards/rejected": -1.047675609588623, "step": 246 }, { "epoch": 1.87, "grad_norm": 30.397279842718763, "learning_rate": 2.0926966292134832e-07, "logps/chosen": -51.19328689575195, "logps/rejected": -62.64244842529297, "loss": 0.5323, "losses/dpo": 0.49880561232566833, "losses/sft": 1.7792472839355469, "losses/total": 0.49880561232566833, "ref_logps/chosen": -45.65985870361328, "ref_logps/rejected": -51.4868278503418, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5533427000045776, "rewards/margins": 0.5622196793556213, "rewards/rejected": -1.1155624389648438, "step": 247 }, { "epoch": 1.87, "grad_norm": 45.5097118266015, "learning_rate": 2.0786516853932585e-07, "logps/chosen": -49.541988372802734, "logps/rejected": -61.682098388671875, "loss": 0.5238, "losses/dpo": 0.4883154630661011, "losses/sft": 1.8238506317138672, "losses/total": 0.4883154630661011, "ref_logps/chosen": -44.83251190185547, "ref_logps/rejected": -50.75908660888672, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.47094783186912537, "rewards/margins": 0.6213531494140625, "rewards/rejected": -1.0923008918762207, "step": 248 }, { "epoch": 1.88, "grad_norm": 54.11701694432662, "learning_rate": 2.0646067415730336e-07, "logps/chosen": -49.46725082397461, "logps/rejected": -65.84452056884766, "loss": 0.5033, "losses/dpo": 0.45639699697494507, "losses/sft": 1.6627399921417236, "losses/total": 0.45639699697494507, "ref_logps/chosen": -44.47664260864258, "ref_logps/rejected": -54.48002624511719, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4990614354610443, "rewards/margins": 0.6373879909515381, "rewards/rejected": -1.1364493370056152, "step": 249 }, { "epoch": 1.89, "grad_norm": 39.18800453651972, "learning_rate": 2.0505617977528089e-07, "logps/chosen": -49.84864807128906, "logps/rejected": -64.8356704711914, "loss": 0.4998, "losses/dpo": 0.35297900438308716, "losses/sft": 1.8552095890045166, "losses/total": 0.35297900438308716, "ref_logps/chosen": -44.61570739746094, "ref_logps/rejected": -53.41788864135742, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5232940912246704, "rewards/margins": 0.6184837818145752, "rewards/rejected": -1.141777753829956, "step": 250 }, { "epoch": 1.9, "grad_norm": 53.063345514186096, "learning_rate": 2.0365168539325842e-07, "logps/chosen": -52.15354919433594, "logps/rejected": -64.48795318603516, "loss": 0.5117, "losses/dpo": 0.5473852157592773, "losses/sft": 1.8542628288269043, "losses/total": 0.5473852157592773, "ref_logps/chosen": -47.43267059326172, "ref_logps/rejected": -53.22420883178711, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.4720878005027771, "rewards/margins": 0.6542870998382568, "rewards/rejected": -1.1263747215270996, "step": 251 }, { "epoch": 1.9, "grad_norm": 38.47591157415672, "learning_rate": 2.0224719101123595e-07, "logps/chosen": -53.605384826660156, "logps/rejected": -67.81199645996094, "loss": 0.55, "losses/dpo": 0.7230039238929749, "losses/sft": 1.776413083076477, "losses/total": 0.7230039238929749, "ref_logps/chosen": -47.54939270019531, "ref_logps/rejected": -56.33991241455078, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6055989861488342, "rewards/margins": 0.541609525680542, "rewards/rejected": -1.1472084522247314, "step": 252 }, { "epoch": 1.91, "grad_norm": 36.31487686417263, "learning_rate": 2.0084269662921348e-07, "logps/chosen": -49.754886627197266, "logps/rejected": -61.2078971862793, "loss": 0.5329, "losses/dpo": 0.4489409625530243, "losses/sft": 1.7098063230514526, "losses/total": 0.4489409625530243, "ref_logps/chosen": -44.37398147583008, "ref_logps/rejected": -50.33176803588867, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5380907654762268, "rewards/margins": 0.549521803855896, "rewards/rejected": -1.0876126289367676, "step": 253 }, { "epoch": 1.92, "grad_norm": 37.72289933274786, "learning_rate": 1.9943820224719098e-07, "logps/chosen": -45.93370819091797, "logps/rejected": -60.7377815246582, "loss": 0.5292, "losses/dpo": 0.544952392578125, "losses/sft": 1.8177616596221924, "losses/total": 0.544952392578125, "ref_logps/chosen": -40.96842575073242, "ref_logps/rejected": -50.381805419921875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4965280294418335, "rewards/margins": 0.5390690565109253, "rewards/rejected": -1.0355970859527588, "step": 254 }, { "epoch": 1.93, "grad_norm": 28.095927855332853, "learning_rate": 1.9803370786516854e-07, "logps/chosen": -49.28886795043945, "logps/rejected": -61.70311737060547, "loss": 0.5216, "losses/dpo": 0.5553957223892212, "losses/sft": 1.7436751127243042, "losses/total": 0.5553957223892212, "ref_logps/chosen": -43.87446594238281, "ref_logps/rejected": -50.54216766357422, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5414395928382874, "rewards/margins": 0.5746556520462036, "rewards/rejected": -1.1160951852798462, "step": 255 }, { "epoch": 1.93, "grad_norm": 26.518610674857435, "learning_rate": 1.9662921348314607e-07, "logps/chosen": -50.930755615234375, "logps/rejected": -61.84996032714844, "loss": 0.5227, "losses/dpo": 0.5520716309547424, "losses/sft": 1.8250830173492432, "losses/total": 0.5520716309547424, "ref_logps/chosen": -44.4235954284668, "ref_logps/rejected": -49.4667854309082, "rewards/accuracies": 0.734375, "rewards/chosen": -0.650715708732605, "rewards/margins": 0.5876013040542603, "rewards/rejected": -1.2383170127868652, "step": 256 }, { "epoch": 1.94, "grad_norm": 42.48935751013368, "learning_rate": 1.952247191011236e-07, "logps/chosen": -49.778446197509766, "logps/rejected": -64.61287689208984, "loss": 0.512, "losses/dpo": 0.5536674857139587, "losses/sft": 2.0023696422576904, "losses/total": 0.5536674857139587, "ref_logps/chosen": -44.07032012939453, "ref_logps/rejected": -52.4774284362793, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5708125233650208, "rewards/margins": 0.6427321434020996, "rewards/rejected": -1.2135446071624756, "step": 257 }, { "epoch": 1.95, "grad_norm": 30.767857339418306, "learning_rate": 1.938202247191011e-07, "logps/chosen": -50.44746017456055, "logps/rejected": -59.21159744262695, "loss": 0.5386, "losses/dpo": 0.622530460357666, "losses/sft": 1.7786061763763428, "losses/total": 0.622530460357666, "ref_logps/chosen": -44.406494140625, "ref_logps/rejected": -47.69208526611328, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.604095995426178, "rewards/margins": 0.5478551983833313, "rewards/rejected": -1.1519510746002197, "step": 258 }, { "epoch": 1.96, "grad_norm": 39.58431653415771, "learning_rate": 1.9241573033707863e-07, "logps/chosen": -53.419769287109375, "logps/rejected": -61.74285888671875, "loss": 0.579, "losses/dpo": 0.6418075561523438, "losses/sft": 1.892571210861206, "losses/total": 0.6418075561523438, "ref_logps/chosen": -47.58960723876953, "ref_logps/rejected": -51.29787826538086, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5830158591270447, "rewards/margins": 0.46148252487182617, "rewards/rejected": -1.0444984436035156, "step": 259 }, { "epoch": 1.96, "grad_norm": 43.39483291252752, "learning_rate": 1.9101123595505617e-07, "logps/chosen": -54.651546478271484, "logps/rejected": -60.20787048339844, "loss": 0.5722, "losses/dpo": 0.520094096660614, "losses/sft": 1.7925068140029907, "losses/total": 0.520094096660614, "ref_logps/chosen": -47.40108108520508, "ref_logps/rejected": -48.18630599975586, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.7250471711158752, "rewards/margins": 0.47710931301116943, "rewards/rejected": -1.202156662940979, "step": 260 }, { "epoch": 1.97, "grad_norm": 31.455191081868676, "learning_rate": 1.896067415730337e-07, "logps/chosen": -47.867706298828125, "logps/rejected": -65.19200134277344, "loss": 0.4765, "losses/dpo": 0.35312655568122864, "losses/sft": 1.778258204460144, "losses/total": 0.35312655568122864, "ref_logps/chosen": -42.26630783081055, "ref_logps/rejected": -52.318782806396484, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5601397752761841, "rewards/margins": 0.7271820306777954, "rewards/rejected": -1.2873218059539795, "step": 261 }, { "epoch": 1.98, "grad_norm": 50.132015095577955, "learning_rate": 1.8820224719101123e-07, "logps/chosen": -50.20945358276367, "logps/rejected": -61.504669189453125, "loss": 0.5005, "losses/dpo": 0.3500882387161255, "losses/sft": 1.8216397762298584, "losses/total": 0.3500882387161255, "ref_logps/chosen": -45.090126037597656, "ref_logps/rejected": -49.85264587402344, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.5119326114654541, "rewards/margins": 0.6532695293426514, "rewards/rejected": -1.165202260017395, "step": 262 }, { "epoch": 1.99, "grad_norm": 32.89448055287079, "learning_rate": 1.8679775280898876e-07, "logps/chosen": -50.863895416259766, "logps/rejected": -61.18482971191406, "loss": 0.5503, "losses/dpo": 0.7480425238609314, "losses/sft": 1.6698777675628662, "losses/total": 0.7480425238609314, "ref_logps/chosen": -45.225868225097656, "ref_logps/rejected": -50.2490348815918, "rewards/accuracies": 0.75, "rewards/chosen": -0.5638025403022766, "rewards/margins": 0.5297766923904419, "rewards/rejected": -1.0935792922973633, "step": 263 }, { "epoch": 1.99, "grad_norm": 49.82760285605674, "learning_rate": 1.853932584269663e-07, "logps/chosen": -52.50929260253906, "logps/rejected": -65.43942260742188, "loss": 0.5086, "losses/dpo": 0.5467610359191895, "losses/sft": 1.9075955152511597, "losses/total": 0.5467610359191895, "ref_logps/chosen": -46.77457046508789, "ref_logps/rejected": -53.229820251464844, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5734716653823853, "rewards/margins": 0.6474889516830444, "rewards/rejected": -1.2209606170654297, "step": 264 }, { "epoch": 2.0, "grad_norm": 40.68420339469038, "learning_rate": 1.8398876404494382e-07, "logps/chosen": -50.630332946777344, "logps/rejected": -57.15177917480469, "loss": 0.5427, "losses/dpo": 0.5700445771217346, "losses/sft": 1.945221185684204, "losses/total": 0.5700445771217346, "ref_logps/chosen": -45.045745849609375, "ref_logps/rejected": -46.36561584472656, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5584587454795837, "rewards/margins": 0.5201576948165894, "rewards/rejected": -1.0786163806915283, "step": 265 }, { "epoch": 2.01, "grad_norm": 40.29769166622327, "learning_rate": 1.8258426966292135e-07, "logps/chosen": -45.740116119384766, "logps/rejected": -63.191139221191406, "loss": 0.4826, "losses/dpo": 0.6156269907951355, "losses/sft": 1.712737798690796, "losses/total": 0.6156269907951355, "ref_logps/chosen": -41.031341552734375, "ref_logps/rejected": -50.87398910522461, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.47087764739990234, "rewards/margins": 0.7608367204666138, "rewards/rejected": -1.2317143678665161, "step": 266 }, { "epoch": 2.02, "grad_norm": 28.611042386664174, "learning_rate": 1.8117977528089888e-07, "logps/chosen": -53.56329345703125, "logps/rejected": -61.81393051147461, "loss": 0.5025, "losses/dpo": 0.41924646496772766, "losses/sft": 1.6672053337097168, "losses/total": 0.41924646496772766, "ref_logps/chosen": -48.44025421142578, "ref_logps/rejected": -49.9116325378418, "rewards/accuracies": 0.75, "rewards/chosen": -0.5123040676116943, "rewards/margins": 0.6779264211654663, "rewards/rejected": -1.190230369567871, "step": 267 }, { "epoch": 2.02, "grad_norm": 34.04672398738775, "learning_rate": 1.7977528089887638e-07, "logps/chosen": -51.082889556884766, "logps/rejected": -62.92805099487305, "loss": 0.4928, "losses/dpo": 0.46233507990837097, "losses/sft": 1.9342318773269653, "losses/total": 0.46233507990837097, "ref_logps/chosen": -45.434566497802734, "ref_logps/rejected": -50.70568084716797, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5648323893547058, "rewards/margins": 0.6574045419692993, "rewards/rejected": -1.22223699092865, "step": 268 }, { "epoch": 2.03, "grad_norm": 35.280744439444646, "learning_rate": 1.7837078651685391e-07, "logps/chosen": -51.44675064086914, "logps/rejected": -66.9733657836914, "loss": 0.5147, "losses/dpo": 0.41224586963653564, "losses/sft": 1.9395112991333008, "losses/total": 0.41224586963653564, "ref_logps/chosen": -45.02248764038086, "ref_logps/rejected": -53.97636032104492, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6424263119697571, "rewards/margins": 0.6572746634483337, "rewards/rejected": -1.2997009754180908, "step": 269 }, { "epoch": 2.04, "grad_norm": 48.89875325089662, "learning_rate": 1.7696629213483144e-07, "logps/chosen": -54.699039459228516, "logps/rejected": -62.34593963623047, "loss": 0.5415, "losses/dpo": 0.4496094286441803, "losses/sft": 1.9949281215667725, "losses/total": 0.4496094286441803, "ref_logps/chosen": -48.44892120361328, "ref_logps/rejected": -50.89592361450195, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.6250118613243103, "rewards/margins": 0.5199897885322571, "rewards/rejected": -1.1450016498565674, "step": 270 }, { "epoch": 2.05, "grad_norm": 39.898107119022896, "learning_rate": 1.75561797752809e-07, "logps/chosen": -54.30252456665039, "logps/rejected": -64.18143463134766, "loss": 0.5055, "losses/dpo": 0.40328285098075867, "losses/sft": 1.9478102922439575, "losses/total": 0.40328285098075867, "ref_logps/chosen": -47.79462814331055, "ref_logps/rejected": -51.18360137939453, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6507893800735474, "rewards/margins": 0.6489942669868469, "rewards/rejected": -1.2997835874557495, "step": 271 }, { "epoch": 2.05, "grad_norm": 22.64059059375221, "learning_rate": 1.741573033707865e-07, "logps/chosen": -46.404747009277344, "logps/rejected": -61.52473449707031, "loss": 0.4487, "losses/dpo": 0.3951578736305237, "losses/sft": 1.6330779790878296, "losses/total": 0.3951578736305237, "ref_logps/chosen": -42.159027099609375, "ref_logps/rejected": -49.41521072387695, "rewards/accuracies": 0.84375, "rewards/chosen": -0.42457205057144165, "rewards/margins": 0.7863802909851074, "rewards/rejected": -1.2109522819519043, "step": 272 }, { "epoch": 2.06, "grad_norm": 74.13926398461123, "learning_rate": 1.7275280898876404e-07, "logps/chosen": -54.33308410644531, "logps/rejected": -62.44175720214844, "loss": 0.5412, "losses/dpo": 0.5388703346252441, "losses/sft": 1.8410019874572754, "losses/total": 0.5388703346252441, "ref_logps/chosen": -48.44995880126953, "ref_logps/rejected": -50.38564682006836, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5883128046989441, "rewards/margins": 0.6172980666160583, "rewards/rejected": -1.205610990524292, "step": 273 }, { "epoch": 2.07, "grad_norm": 25.953857990200024, "learning_rate": 1.7134831460674157e-07, "logps/chosen": -51.31452560424805, "logps/rejected": -64.98090362548828, "loss": 0.4837, "losses/dpo": 0.31895995140075684, "losses/sft": 1.6441235542297363, "losses/total": 0.31895995140075684, "ref_logps/chosen": -45.34762954711914, "ref_logps/rejected": -52.30402755737305, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5966892242431641, "rewards/margins": 0.6709988117218018, "rewards/rejected": -1.2676880359649658, "step": 274 }, { "epoch": 2.08, "grad_norm": 71.73732592082969, "learning_rate": 1.699438202247191e-07, "logps/chosen": -46.54341125488281, "logps/rejected": -63.91550064086914, "loss": 0.5107, "losses/dpo": 0.7247325778007507, "losses/sft": 1.7304763793945312, "losses/total": 0.7247325778007507, "ref_logps/chosen": -41.79149627685547, "ref_logps/rejected": -52.10732650756836, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4751916527748108, "rewards/margins": 0.7056255340576172, "rewards/rejected": -1.1808171272277832, "step": 275 }, { "epoch": 2.08, "grad_norm": 34.295472371107984, "learning_rate": 1.6853932584269663e-07, "logps/chosen": -50.055667877197266, "logps/rejected": -66.54865264892578, "loss": 0.4738, "losses/dpo": 0.4991689920425415, "losses/sft": 1.7661676406860352, "losses/total": 0.4991689920425415, "ref_logps/chosen": -44.640960693359375, "ref_logps/rejected": -53.86970901489258, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5414707660675049, "rewards/margins": 0.7264235615730286, "rewards/rejected": -1.2678943872451782, "step": 276 }, { "epoch": 2.09, "grad_norm": 34.87966460155351, "learning_rate": 1.6713483146067413e-07, "logps/chosen": -48.71733856201172, "logps/rejected": -64.19945526123047, "loss": 0.4775, "losses/dpo": 0.505129873752594, "losses/sft": 1.8008291721343994, "losses/total": 0.505129873752594, "ref_logps/chosen": -43.19121170043945, "ref_logps/rejected": -51.47002410888672, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5526125431060791, "rewards/margins": 0.7203309535980225, "rewards/rejected": -1.2729436159133911, "step": 277 }, { "epoch": 2.1, "grad_norm": 45.47006477442562, "learning_rate": 1.6573033707865166e-07, "logps/chosen": -47.62105941772461, "logps/rejected": -63.732845306396484, "loss": 0.5054, "losses/dpo": 0.3483954668045044, "losses/sft": 1.818171501159668, "losses/total": 0.3483954668045044, "ref_logps/chosen": -42.47084426879883, "ref_logps/rejected": -52.08452606201172, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.5150218605995178, "rewards/margins": 0.6498096585273743, "rewards/rejected": -1.1648313999176025, "step": 278 }, { "epoch": 2.11, "grad_norm": 40.59540284830723, "learning_rate": 1.6432584269662922e-07, "logps/chosen": -53.00666809082031, "logps/rejected": -65.99346923828125, "loss": 0.4935, "losses/dpo": 0.5351928472518921, "losses/sft": 1.8947772979736328, "losses/total": 0.5351928472518921, "ref_logps/chosen": -47.00167465209961, "ref_logps/rejected": -52.903202056884766, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.6004994511604309, "rewards/margins": 0.7085265517234802, "rewards/rejected": -1.3090260028839111, "step": 279 }, { "epoch": 2.11, "grad_norm": 44.40349521391864, "learning_rate": 1.6292134831460675e-07, "logps/chosen": -49.065860748291016, "logps/rejected": -62.56683349609375, "loss": 0.4897, "losses/dpo": 0.504388689994812, "losses/sft": 1.9688255786895752, "losses/total": 0.504388689994812, "ref_logps/chosen": -43.66246795654297, "ref_logps/rejected": -50.512149810791016, "rewards/accuracies": 0.828125, "rewards/chosen": -0.540338933467865, "rewards/margins": 0.665129542350769, "rewards/rejected": -1.2054684162139893, "step": 280 }, { "epoch": 2.12, "grad_norm": 23.51093703837308, "learning_rate": 1.6151685393258428e-07, "logps/chosen": -45.374507904052734, "logps/rejected": -66.95557403564453, "loss": 0.4394, "losses/dpo": 0.3948534429073334, "losses/sft": 1.8182882070541382, "losses/total": 0.3948534429073334, "ref_logps/chosen": -40.80500793457031, "ref_logps/rejected": -54.17891311645508, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.4569500982761383, "rewards/margins": 0.8207160830497742, "rewards/rejected": -1.2776662111282349, "step": 281 }, { "epoch": 2.13, "grad_norm": 34.6015311606402, "learning_rate": 1.6011235955056178e-07, "logps/chosen": -49.55183029174805, "logps/rejected": -64.22444152832031, "loss": 0.4938, "losses/dpo": 0.4182536005973816, "losses/sft": 1.6650938987731934, "losses/total": 0.4182536005973816, "ref_logps/chosen": -44.061161041259766, "ref_logps/rejected": -52.01859664916992, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.549067497253418, "rewards/margins": 0.6715176701545715, "rewards/rejected": -1.2205852270126343, "step": 282 }, { "epoch": 2.14, "grad_norm": 42.42694864418512, "learning_rate": 1.5870786516853931e-07, "logps/chosen": -51.672611236572266, "logps/rejected": -62.758201599121094, "loss": 0.5346, "losses/dpo": 0.46626016497612, "losses/sft": 2.0574629306793213, "losses/total": 0.46626016497612, "ref_logps/chosen": -45.942771911621094, "ref_logps/rejected": -51.32615661621094, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5729840397834778, "rewards/margins": 0.5702201128005981, "rewards/rejected": -1.1432042121887207, "step": 283 }, { "epoch": 2.14, "grad_norm": 29.027080004323725, "learning_rate": 1.5730337078651685e-07, "logps/chosen": -50.05191421508789, "logps/rejected": -63.017738342285156, "loss": 0.496, "losses/dpo": 0.4836353063583374, "losses/sft": 1.66543447971344, "losses/total": 0.4836353063583374, "ref_logps/chosen": -44.71332550048828, "ref_logps/rejected": -51.09360885620117, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5338596701622009, "rewards/margins": 0.6585534811019897, "rewards/rejected": -1.192413330078125, "step": 284 }, { "epoch": 2.15, "grad_norm": 34.83162990004629, "learning_rate": 1.5589887640449438e-07, "logps/chosen": -47.1140022277832, "logps/rejected": -57.586875915527344, "loss": 0.5112, "losses/dpo": 0.3050788342952728, "losses/sft": 1.528511643409729, "losses/total": 0.3050788342952728, "ref_logps/chosen": -41.70233154296875, "ref_logps/rejected": -45.95048522949219, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.5411670207977295, "rewards/margins": 0.6224713325500488, "rewards/rejected": -1.1636384725570679, "step": 285 }, { "epoch": 2.16, "grad_norm": 28.81649556451591, "learning_rate": 1.5449438202247188e-07, "logps/chosen": -49.70873260498047, "logps/rejected": -70.10694122314453, "loss": 0.4689, "losses/dpo": 0.3297555446624756, "losses/sft": 1.699752926826477, "losses/total": 0.3297555446624756, "ref_logps/chosen": -43.79273986816406, "ref_logps/rejected": -56.316646575927734, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.5915996432304382, "rewards/margins": 0.7874290347099304, "rewards/rejected": -1.3790286779403687, "step": 286 }, { "epoch": 2.17, "grad_norm": 31.592751280817794, "learning_rate": 1.5308988764044944e-07, "logps/chosen": -51.86985778808594, "logps/rejected": -65.0584945678711, "loss": 0.4979, "losses/dpo": 0.42169618606567383, "losses/sft": 1.717678189277649, "losses/total": 0.42169618606567383, "ref_logps/chosen": -45.48046875, "ref_logps/rejected": -51.73884963989258, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6389389038085938, "rewards/margins": 0.6930255889892578, "rewards/rejected": -1.3319644927978516, "step": 287 }, { "epoch": 2.17, "grad_norm": 30.082866349200362, "learning_rate": 1.5168539325842697e-07, "logps/chosen": -51.86253356933594, "logps/rejected": -61.59529495239258, "loss": 0.474, "losses/dpo": 0.3721589148044586, "losses/sft": 1.9278478622436523, "losses/total": 0.3721589148044586, "ref_logps/chosen": -46.706539154052734, "ref_logps/rejected": -49.11457824707031, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.5155993700027466, "rewards/margins": 0.7324722409248352, "rewards/rejected": -1.2480716705322266, "step": 288 }, { "epoch": 2.18, "grad_norm": 38.21267003236946, "learning_rate": 1.502808988764045e-07, "logps/chosen": -47.701148986816406, "logps/rejected": -60.36317825317383, "loss": 0.4747, "losses/dpo": 0.37174510955810547, "losses/sft": 1.793306827545166, "losses/total": 0.37174510955810547, "ref_logps/chosen": -42.27632141113281, "ref_logps/rejected": -47.96172332763672, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5424827933311462, "rewards/margins": 0.6976625323295593, "rewards/rejected": -1.2401453256607056, "step": 289 }, { "epoch": 2.19, "grad_norm": 31.883129332858225, "learning_rate": 1.4887640449438203e-07, "logps/chosen": -54.1004524230957, "logps/rejected": -66.43025207519531, "loss": 0.4908, "losses/dpo": 0.4963166117668152, "losses/sft": 1.8474931716918945, "losses/total": 0.4963166117668152, "ref_logps/chosen": -48.27857208251953, "ref_logps/rejected": -54.03632736206055, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5821880102157593, "rewards/margins": 0.6572045087814331, "rewards/rejected": -1.2393925189971924, "step": 290 }, { "epoch": 2.2, "grad_norm": 37.0528302465769, "learning_rate": 1.4747191011235953e-07, "logps/chosen": -49.52764129638672, "logps/rejected": -59.30036926269531, "loss": 0.5284, "losses/dpo": 0.4408453702926636, "losses/sft": 1.8658891916275024, "losses/total": 0.4408453702926636, "ref_logps/chosen": -44.21025848388672, "ref_logps/rejected": -48.308738708496094, "rewards/accuracies": 0.75, "rewards/chosen": -0.5317389965057373, "rewards/margins": 0.5674237012863159, "rewards/rejected": -1.0991626977920532, "step": 291 }, { "epoch": 2.2, "grad_norm": 46.13505545748754, "learning_rate": 1.4606741573033706e-07, "logps/chosen": -47.20491027832031, "logps/rejected": -63.91752243041992, "loss": 0.4508, "losses/dpo": 0.5243589282035828, "losses/sft": 1.6923798322677612, "losses/total": 0.5243589282035828, "ref_logps/chosen": -41.78574752807617, "ref_logps/rejected": -50.202999114990234, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5419164299964905, "rewards/margins": 0.829535722732544, "rewards/rejected": -1.3714520931243896, "step": 292 }, { "epoch": 2.21, "grad_norm": 36.141477070418105, "learning_rate": 1.446629213483146e-07, "logps/chosen": -44.679988861083984, "logps/rejected": -66.16545104980469, "loss": 0.439, "losses/dpo": 0.3221483826637268, "losses/sft": 1.4501105546951294, "losses/total": 0.3221483826637268, "ref_logps/chosen": -39.050994873046875, "ref_logps/rejected": -52.02964782714844, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.5628998875617981, "rewards/margins": 0.8506803512573242, "rewards/rejected": -1.4135804176330566, "step": 293 }, { "epoch": 2.22, "grad_norm": 35.747370823625836, "learning_rate": 1.4325842696629212e-07, "logps/chosen": -46.824867248535156, "logps/rejected": -65.53140258789062, "loss": 0.4548, "losses/dpo": 0.4621339440345764, "losses/sft": 1.9849398136138916, "losses/total": 0.4621339440345764, "ref_logps/chosen": -41.530296325683594, "ref_logps/rejected": -51.96376037597656, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5294572114944458, "rewards/margins": 0.8273074626922607, "rewards/rejected": -1.3567646741867065, "step": 294 }, { "epoch": 2.23, "grad_norm": 34.62859249651339, "learning_rate": 1.4185393258426968e-07, "logps/chosen": -48.18260192871094, "logps/rejected": -61.21552276611328, "loss": 0.4496, "losses/dpo": 0.44690898060798645, "losses/sft": 1.6001710891723633, "losses/total": 0.44690898060798645, "ref_logps/chosen": -42.8167839050293, "ref_logps/rejected": -47.96897506713867, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.5365816354751587, "rewards/margins": 0.7880733013153076, "rewards/rejected": -1.3246548175811768, "step": 295 }, { "epoch": 2.24, "grad_norm": 61.98918726132186, "learning_rate": 1.4044943820224718e-07, "logps/chosen": -47.78983688354492, "logps/rejected": -63.95269012451172, "loss": 0.4524, "losses/dpo": 0.4150780439376831, "losses/sft": 1.7755818367004395, "losses/total": 0.4150780439376831, "ref_logps/chosen": -43.358665466308594, "ref_logps/rejected": -51.511165618896484, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.44311660528182983, "rewards/margins": 0.8010364174842834, "rewards/rejected": -1.2441529035568237, "step": 296 }, { "epoch": 2.24, "grad_norm": 42.66724658253497, "learning_rate": 1.3904494382022472e-07, "logps/chosen": -49.26057434082031, "logps/rejected": -58.405521392822266, "loss": 0.529, "losses/dpo": 0.456921249628067, "losses/sft": 2.0008864402770996, "losses/total": 0.456921249628067, "ref_logps/chosen": -43.675724029541016, "ref_logps/rejected": -47.02533721923828, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5584847331047058, "rewards/margins": 0.579533040523529, "rewards/rejected": -1.1380177736282349, "step": 297 }, { "epoch": 2.25, "grad_norm": 24.79805436165274, "learning_rate": 1.3764044943820225e-07, "logps/chosen": -45.54669189453125, "logps/rejected": -61.581424713134766, "loss": 0.4435, "losses/dpo": 0.3618351221084595, "losses/sft": 1.6873468160629272, "losses/total": 0.3618351221084595, "ref_logps/chosen": -40.4007453918457, "ref_logps/rejected": -48.153865814208984, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5145938992500305, "rewards/margins": 0.8281623125076294, "rewards/rejected": -1.3427562713623047, "step": 298 }, { "epoch": 2.26, "grad_norm": 42.8985819272972, "learning_rate": 1.3623595505617978e-07, "logps/chosen": -51.75535583496094, "logps/rejected": -66.66900634765625, "loss": 0.4645, "losses/dpo": 0.38914433121681213, "losses/sft": 1.7742490768432617, "losses/total": 0.38914433121681213, "ref_logps/chosen": -44.84712600708008, "ref_logps/rejected": -51.77956771850586, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6908228993415833, "rewards/margins": 0.7981210947036743, "rewards/rejected": -1.4889440536499023, "step": 299 }, { "epoch": 2.27, "grad_norm": 40.30363175732382, "learning_rate": 1.3483146067415728e-07, "logps/chosen": -51.6724853515625, "logps/rejected": -63.59320831298828, "loss": 0.4949, "losses/dpo": 0.38013771176338196, "losses/sft": 1.8500171899795532, "losses/total": 0.38013771176338196, "ref_logps/chosen": -45.77328109741211, "ref_logps/rejected": -50.94234848022461, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.5899205207824707, "rewards/margins": 0.6751659512519836, "rewards/rejected": -1.2650864124298096, "step": 300 }, { "epoch": 2.27, "grad_norm": 34.894778793466045, "learning_rate": 1.334269662921348e-07, "logps/chosen": -50.00305938720703, "logps/rejected": -63.447113037109375, "loss": 0.4641, "losses/dpo": 0.38211292028427124, "losses/sft": 1.8071119785308838, "losses/total": 0.38211292028427124, "ref_logps/chosen": -44.68912887573242, "ref_logps/rejected": -50.19194793701172, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.5313928723335266, "rewards/margins": 0.7941237688064575, "rewards/rejected": -1.3255165815353394, "step": 301 }, { "epoch": 2.28, "grad_norm": 28.45023547091047, "learning_rate": 1.3202247191011234e-07, "logps/chosen": -50.29368591308594, "logps/rejected": -63.10490798950195, "loss": 0.4692, "losses/dpo": 0.4564237892627716, "losses/sft": 1.7674496173858643, "losses/total": 0.4564237892627716, "ref_logps/chosen": -43.98621368408203, "ref_logps/rejected": -49.43875503540039, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6307473182678223, "rewards/margins": 0.7358678579330444, "rewards/rejected": -1.3666152954101562, "step": 302 }, { "epoch": 2.29, "grad_norm": 81.7029890666012, "learning_rate": 1.306179775280899e-07, "logps/chosen": -48.68151092529297, "logps/rejected": -64.75912475585938, "loss": 0.4754, "losses/dpo": 0.41533470153808594, "losses/sft": 1.8685168027877808, "losses/total": 0.41533470153808594, "ref_logps/chosen": -43.27861404418945, "ref_logps/rejected": -51.79693603515625, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5402895212173462, "rewards/margins": 0.7559298276901245, "rewards/rejected": -1.2962194681167603, "step": 303 }, { "epoch": 2.3, "grad_norm": 30.68080590755349, "learning_rate": 1.2921348314606743e-07, "logps/chosen": -53.13454055786133, "logps/rejected": -67.93505859375, "loss": 0.4386, "losses/dpo": 0.3812296390533447, "losses/sft": 1.8960628509521484, "losses/total": 0.3812296390533447, "ref_logps/chosen": -48.34521484375, "ref_logps/rejected": -54.49592590332031, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.47893261909484863, "rewards/margins": 0.8649811744689941, "rewards/rejected": -1.3439137935638428, "step": 304 }, { "epoch": 2.3, "grad_norm": 37.97271017674442, "learning_rate": 1.2780898876404493e-07, "logps/chosen": -49.085914611816406, "logps/rejected": -66.31155395507812, "loss": 0.4367, "losses/dpo": 0.332202285528183, "losses/sft": 1.744154930114746, "losses/total": 0.332202285528183, "ref_logps/chosen": -43.31494140625, "ref_logps/rejected": -52.24314880371094, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5770969986915588, "rewards/margins": 0.8297438621520996, "rewards/rejected": -1.4068408012390137, "step": 305 }, { "epoch": 2.31, "grad_norm": 23.645326961744978, "learning_rate": 1.2640449438202246e-07, "logps/chosen": -47.66139602661133, "logps/rejected": -67.5027084350586, "loss": 0.4199, "losses/dpo": 0.31917473673820496, "losses/sft": 1.72993004322052, "losses/total": 0.31917473673820496, "ref_logps/chosen": -42.300148010253906, "ref_logps/rejected": -52.81003952026367, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5361245274543762, "rewards/margins": 0.9331421852111816, "rewards/rejected": -1.4692668914794922, "step": 306 }, { "epoch": 2.32, "grad_norm": 43.7604030256196, "learning_rate": 1.25e-07, "logps/chosen": -53.353294372558594, "logps/rejected": -66.92119598388672, "loss": 0.46, "losses/dpo": 0.258212685585022, "losses/sft": 1.5471559762954712, "losses/total": 0.258212685585022, "ref_logps/chosen": -47.464927673339844, "ref_logps/rejected": -53.276546478271484, "rewards/accuracies": 0.8125, "rewards/chosen": -0.588837206363678, "rewards/margins": 0.7756274938583374, "rewards/rejected": -1.3644648790359497, "step": 307 }, { "epoch": 2.33, "grad_norm": 119.18985047580789, "learning_rate": 1.2359550561797752e-07, "logps/chosen": -49.019073486328125, "logps/rejected": -66.93950653076172, "loss": 0.4519, "losses/dpo": 0.37537306547164917, "losses/sft": 1.880836009979248, "losses/total": 0.37537306547164917, "ref_logps/chosen": -42.4522705078125, "ref_logps/rejected": -52.390071868896484, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6566796898841858, "rewards/margins": 0.7982646822929382, "rewards/rejected": -1.454944372177124, "step": 308 }, { "epoch": 2.33, "grad_norm": 46.61171105526296, "learning_rate": 1.2219101123595506e-07, "logps/chosen": -48.54987716674805, "logps/rejected": -63.66078567504883, "loss": 0.4927, "losses/dpo": 0.41355597972869873, "losses/sft": 1.620566487312317, "losses/total": 0.41355597972869873, "ref_logps/chosen": -43.545345306396484, "ref_logps/rejected": -51.466426849365234, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5004531741142273, "rewards/margins": 0.7189834117889404, "rewards/rejected": -1.2194366455078125, "step": 309 }, { "epoch": 2.34, "grad_norm": 58.52394156974881, "learning_rate": 1.2078651685393259e-07, "logps/chosen": -46.990867614746094, "logps/rejected": -63.61461639404297, "loss": 0.4644, "losses/dpo": 0.3501368463039398, "losses/sft": 1.544814944267273, "losses/total": 0.3501368463039398, "ref_logps/chosen": -41.24350357055664, "ref_logps/rejected": -49.67095947265625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5747367739677429, "rewards/margins": 0.8196287155151367, "rewards/rejected": -1.3943655490875244, "step": 310 }, { "epoch": 2.35, "grad_norm": 54.39747581167795, "learning_rate": 1.1938202247191012e-07, "logps/chosen": -45.09647750854492, "logps/rejected": -63.75227355957031, "loss": 0.466, "losses/dpo": 0.42020341753959656, "losses/sft": 1.8535985946655273, "losses/total": 0.42020341753959656, "ref_logps/chosen": -39.841346740722656, "ref_logps/rejected": -50.577796936035156, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.5255131125450134, "rewards/margins": 0.7919342517852783, "rewards/rejected": -1.3174474239349365, "step": 311 }, { "epoch": 2.36, "grad_norm": 44.46739753895283, "learning_rate": 1.1797752808988763e-07, "logps/chosen": -50.22407531738281, "logps/rejected": -61.790164947509766, "loss": 0.4726, "losses/dpo": 0.4256379008293152, "losses/sft": 1.8494802713394165, "losses/total": 0.4256379008293152, "ref_logps/chosen": -44.54825973510742, "ref_logps/rejected": -48.405784606933594, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5675812363624573, "rewards/margins": 0.7708570957183838, "rewards/rejected": -1.3384383916854858, "step": 312 }, { "epoch": 2.36, "grad_norm": 55.75743148632008, "learning_rate": 1.1657303370786515e-07, "logps/chosen": -50.64889144897461, "logps/rejected": -60.807430267333984, "loss": 0.5731, "losses/dpo": 0.4281044900417328, "losses/sft": 1.8629206418991089, "losses/total": 0.4281044900417328, "ref_logps/chosen": -44.187469482421875, "ref_logps/rejected": -48.879905700683594, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6461426019668579, "rewards/margins": 0.5466097593307495, "rewards/rejected": -1.1927522420883179, "step": 313 }, { "epoch": 2.37, "grad_norm": 32.9016144878935, "learning_rate": 1.151685393258427e-07, "logps/chosen": -49.50126266479492, "logps/rejected": -62.49106979370117, "loss": 0.4863, "losses/dpo": 0.4087386727333069, "losses/sft": 1.9700853824615479, "losses/total": 0.4087386727333069, "ref_logps/chosen": -43.90349197387695, "ref_logps/rejected": -49.64462661743164, "rewards/accuracies": 0.8359375, "rewards/chosen": -0.559777557849884, "rewards/margins": 0.7248663902282715, "rewards/rejected": -1.2846438884735107, "step": 314 }, { "epoch": 2.38, "grad_norm": 44.99129530748152, "learning_rate": 1.1376404494382023e-07, "logps/chosen": -56.25061798095703, "logps/rejected": -66.5046157836914, "loss": 0.5117, "losses/dpo": 0.5078104138374329, "losses/sft": 1.6950089931488037, "losses/total": 0.5078104138374329, "ref_logps/chosen": -49.48601531982422, "ref_logps/rejected": -52.47853088378906, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6764602661132812, "rewards/margins": 0.7261484265327454, "rewards/rejected": -1.4026087522506714, "step": 315 }, { "epoch": 2.39, "grad_norm": 42.2868992833382, "learning_rate": 1.1235955056179774e-07, "logps/chosen": -50.650184631347656, "logps/rejected": -59.49814987182617, "loss": 0.5448, "losses/dpo": 0.5089865922927856, "losses/sft": 2.003211259841919, "losses/total": 0.5089865922927856, "ref_logps/chosen": -44.98645782470703, "ref_logps/rejected": -48.48113250732422, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5663729310035706, "rewards/margins": 0.5353288650512695, "rewards/rejected": -1.1017017364501953, "step": 316 }, { "epoch": 2.39, "grad_norm": 44.44979675867788, "learning_rate": 1.1095505617977527e-07, "logps/chosen": -51.693416595458984, "logps/rejected": -63.80548858642578, "loss": 0.4931, "losses/dpo": 0.5661917328834534, "losses/sft": 1.637414813041687, "losses/total": 0.5661917328834534, "ref_logps/chosen": -45.626705169677734, "ref_logps/rejected": -50.76273727416992, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6066715121269226, "rewards/margins": 0.6976041197776794, "rewards/rejected": -1.3042757511138916, "step": 317 }, { "epoch": 2.4, "grad_norm": 26.625947961027723, "learning_rate": 1.095505617977528e-07, "logps/chosen": -49.997684478759766, "logps/rejected": -65.61261749267578, "loss": 0.4576, "losses/dpo": 0.4372258484363556, "losses/sft": 1.924217700958252, "losses/total": 0.4372258484363556, "ref_logps/chosen": -44.4435920715332, "ref_logps/rejected": -52.19112014770508, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5554090142250061, "rewards/margins": 0.7867408394813538, "rewards/rejected": -1.3421498537063599, "step": 318 }, { "epoch": 2.41, "grad_norm": 39.10907991576822, "learning_rate": 1.0814606741573033e-07, "logps/chosen": -46.622100830078125, "logps/rejected": -66.21222686767578, "loss": 0.4423, "losses/dpo": 0.3607669174671173, "losses/sft": 1.6760084629058838, "losses/total": 0.3607669174671173, "ref_logps/chosen": -41.68397903442383, "ref_logps/rejected": -52.68547058105469, "rewards/accuracies": 0.859375, "rewards/chosen": -0.49381202459335327, "rewards/margins": 0.8588635921478271, "rewards/rejected": -1.3526755571365356, "step": 319 }, { "epoch": 2.42, "grad_norm": 33.084576306230616, "learning_rate": 1.0674157303370785e-07, "logps/chosen": -45.75358200073242, "logps/rejected": -59.3265266418457, "loss": 0.4803, "losses/dpo": 0.29809629917144775, "losses/sft": 1.7368574142456055, "losses/total": 0.29809629917144775, "ref_logps/chosen": -40.34146499633789, "ref_logps/rejected": -46.574745178222656, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5412120223045349, "rewards/margins": 0.7339664101600647, "rewards/rejected": -1.2751784324645996, "step": 320 }, { "epoch": 2.42, "grad_norm": 50.29216069920892, "learning_rate": 1.0533707865168538e-07, "logps/chosen": -52.55310821533203, "logps/rejected": -66.991455078125, "loss": 0.4871, "losses/dpo": 0.4431208670139313, "losses/sft": 2.1075568199157715, "losses/total": 0.4431208670139313, "ref_logps/chosen": -46.3017578125, "ref_logps/rejected": -53.20699691772461, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.6251351833343506, "rewards/margins": 0.75331050157547, "rewards/rejected": -1.3784456253051758, "step": 321 }, { "epoch": 2.43, "grad_norm": 31.176032208957253, "learning_rate": 1.0393258426966293e-07, "logps/chosen": -53.03559112548828, "logps/rejected": -68.98506927490234, "loss": 0.4752, "losses/dpo": 0.5410434007644653, "losses/sft": 1.8393595218658447, "losses/total": 0.5410434007644653, "ref_logps/chosen": -46.650081634521484, "ref_logps/rejected": -55.51156234741211, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.6385514140129089, "rewards/margins": 0.7087997794151306, "rewards/rejected": -1.34735107421875, "step": 322 }, { "epoch": 2.44, "grad_norm": 32.43451424088184, "learning_rate": 1.0252808988764044e-07, "logps/chosen": -47.250701904296875, "logps/rejected": -65.95178985595703, "loss": 0.4578, "losses/dpo": 0.3734424412250519, "losses/sft": 1.6751508712768555, "losses/total": 0.3734424412250519, "ref_logps/chosen": -41.42866897583008, "ref_logps/rejected": -52.412967681884766, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.5822036862373352, "rewards/margins": 0.7716787457466125, "rewards/rejected": -1.3538825511932373, "step": 323 }, { "epoch": 2.45, "grad_norm": 49.397078169062496, "learning_rate": 1.0112359550561797e-07, "logps/chosen": -47.313533782958984, "logps/rejected": -65.02396392822266, "loss": 0.4228, "losses/dpo": 0.34801167249679565, "losses/sft": 1.5732629299163818, "losses/total": 0.34801167249679565, "ref_logps/chosen": -42.42364501953125, "ref_logps/rejected": -51.05178451538086, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4889884293079376, "rewards/margins": 0.908229649066925, "rewards/rejected": -1.3972179889678955, "step": 324 }, { "epoch": 2.45, "grad_norm": 47.726495957001454, "learning_rate": 9.971910112359549e-08, "logps/chosen": -55.75758361816406, "logps/rejected": -62.832130432128906, "loss": 0.4888, "losses/dpo": 0.4860239624977112, "losses/sft": 2.04508638381958, "losses/total": 0.4860239624977112, "ref_logps/chosen": -49.81437301635742, "ref_logps/rejected": -49.849124908447266, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5943213701248169, "rewards/margins": 0.7039793729782104, "rewards/rejected": -1.2983007431030273, "step": 325 }, { "epoch": 2.46, "grad_norm": 36.74212851896501, "learning_rate": 9.831460674157303e-08, "logps/chosen": -46.65519332885742, "logps/rejected": -62.77692413330078, "loss": 0.4368, "losses/dpo": 0.3731769323348999, "losses/sft": 1.6393812894821167, "losses/total": 0.3731769323348999, "ref_logps/chosen": -41.64007568359375, "ref_logps/rejected": -49.11181640625, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.5015115141868591, "rewards/margins": 0.8649990558624268, "rewards/rejected": -1.3665106296539307, "step": 326 }, { "epoch": 2.47, "grad_norm": 40.47770234128339, "learning_rate": 9.691011235955055e-08, "logps/chosen": -51.47808837890625, "logps/rejected": -61.499794006347656, "loss": 0.4938, "losses/dpo": 0.5085561275482178, "losses/sft": 1.7307016849517822, "losses/total": 0.5085561275482178, "ref_logps/chosen": -45.46772384643555, "ref_logps/rejected": -48.600975036621094, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6010371446609497, "rewards/margins": 0.6888448596000671, "rewards/rejected": -1.2898820638656616, "step": 327 }, { "epoch": 2.48, "grad_norm": 34.48687115258181, "learning_rate": 9.550561797752808e-08, "logps/chosen": -52.35597229003906, "logps/rejected": -64.94651794433594, "loss": 0.487, "losses/dpo": 0.35471710562705994, "losses/sft": 1.7028522491455078, "losses/total": 0.35471710562705994, "ref_logps/chosen": -45.81639099121094, "ref_logps/rejected": -50.81908416748047, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6539584398269653, "rewards/margins": 0.7587845921516418, "rewards/rejected": -1.412743091583252, "step": 328 }, { "epoch": 2.48, "grad_norm": 46.297878699011456, "learning_rate": 9.410112359550561e-08, "logps/chosen": -49.620079040527344, "logps/rejected": -61.279991149902344, "loss": 0.5154, "losses/dpo": 0.3841705620288849, "losses/sft": 1.6648838520050049, "losses/total": 0.3841705620288849, "ref_logps/chosen": -43.54829406738281, "ref_logps/rejected": -48.85557174682617, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6071785688400269, "rewards/margins": 0.6352633833885193, "rewards/rejected": -1.2424417734146118, "step": 329 }, { "epoch": 2.49, "grad_norm": 31.77609611587154, "learning_rate": 9.269662921348314e-08, "logps/chosen": -46.87300491333008, "logps/rejected": -67.45767211914062, "loss": 0.4322, "losses/dpo": 0.2587304711341858, "losses/sft": 1.7239441871643066, "losses/total": 0.2587304711341858, "ref_logps/chosen": -41.466495513916016, "ref_logps/rejected": -53.13069152832031, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5406507253646851, "rewards/margins": 0.8920475244522095, "rewards/rejected": -1.4326982498168945, "step": 330 }, { "epoch": 2.5, "grad_norm": 64.68614799790986, "learning_rate": 9.129213483146067e-08, "logps/chosen": -48.50446319580078, "logps/rejected": -65.34441375732422, "loss": 0.4904, "losses/dpo": 0.47038501501083374, "losses/sft": 1.7417045831680298, "losses/total": 0.47038501501083374, "ref_logps/chosen": -42.83766174316406, "ref_logps/rejected": -52.59800720214844, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5666800141334534, "rewards/margins": 0.7079604268074036, "rewards/rejected": -1.274640440940857, "step": 331 }, { "epoch": 2.51, "grad_norm": 32.06271879641542, "learning_rate": 8.988764044943819e-08, "logps/chosen": -48.020870208740234, "logps/rejected": -63.20212936401367, "loss": 0.4812, "losses/dpo": 0.33493292331695557, "losses/sft": 1.7744500637054443, "losses/total": 0.33493292331695557, "ref_logps/chosen": -41.92070770263672, "ref_logps/rejected": -49.221893310546875, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.6100164651870728, "rewards/margins": 0.788007378578186, "rewards/rejected": -1.3980237245559692, "step": 332 }, { "epoch": 2.51, "grad_norm": 26.45308046102739, "learning_rate": 8.848314606741572e-08, "logps/chosen": -48.79193115234375, "logps/rejected": -65.9973373413086, "loss": 0.4501, "losses/dpo": 0.5545312166213989, "losses/sft": 1.8423538208007812, "losses/total": 0.5545312166213989, "ref_logps/chosen": -43.30110549926758, "ref_logps/rejected": -52.2086296081543, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5490830540657043, "rewards/margins": 0.82978755235672, "rewards/rejected": -1.3788707256317139, "step": 333 }, { "epoch": 2.52, "grad_norm": 37.24835030098744, "learning_rate": 8.707865168539325e-08, "logps/chosen": -55.10631561279297, "logps/rejected": -69.52106475830078, "loss": 0.4562, "losses/dpo": 0.35140344500541687, "losses/sft": 1.6956713199615479, "losses/total": 0.35140344500541687, "ref_logps/chosen": -48.564208984375, "ref_logps/rejected": -54.805908203125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6542105078697205, "rewards/margins": 0.8173050284385681, "rewards/rejected": -1.4715156555175781, "step": 334 }, { "epoch": 2.53, "grad_norm": 44.71227908892904, "learning_rate": 8.567415730337078e-08, "logps/chosen": -46.23660659790039, "logps/rejected": -66.13673400878906, "loss": 0.4455, "losses/dpo": 0.2548215985298157, "losses/sft": 1.7199440002441406, "losses/total": 0.2548215985298157, "ref_logps/chosen": -40.14839172363281, "ref_logps/rejected": -51.35820388793945, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.6088215708732605, "rewards/margins": 0.8690313100814819, "rewards/rejected": -1.4778528213500977, "step": 335 }, { "epoch": 2.54, "grad_norm": 31.408156008715395, "learning_rate": 8.426966292134831e-08, "logps/chosen": -46.77598571777344, "logps/rejected": -60.90141677856445, "loss": 0.4687, "losses/dpo": 0.4249792695045471, "losses/sft": 1.850572943687439, "losses/total": 0.4249792695045471, "ref_logps/chosen": -40.86663055419922, "ref_logps/rejected": -47.56753921508789, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.590936005115509, "rewards/margins": 0.7424519062042236, "rewards/rejected": -1.333387851715088, "step": 336 }, { "epoch": 2.54, "grad_norm": 46.45434475270436, "learning_rate": 8.286516853932583e-08, "logps/chosen": -53.33147430419922, "logps/rejected": -63.10407638549805, "loss": 0.5356, "losses/dpo": 0.5665757656097412, "losses/sft": 1.9680542945861816, "losses/total": 0.5665757656097412, "ref_logps/chosen": -46.464691162109375, "ref_logps/rejected": -49.922061920166016, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6866781711578369, "rewards/margins": 0.631523072719574, "rewards/rejected": -1.3182013034820557, "step": 337 }, { "epoch": 2.55, "grad_norm": 35.952800613048225, "learning_rate": 8.146067415730337e-08, "logps/chosen": -47.478485107421875, "logps/rejected": -62.79949951171875, "loss": 0.4845, "losses/dpo": 0.39381512999534607, "losses/sft": 1.7948269844055176, "losses/total": 0.39381512999534607, "ref_logps/chosen": -41.275901794433594, "ref_logps/rejected": -49.19529724121094, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.6202585697174072, "rewards/margins": 0.7401620745658875, "rewards/rejected": -1.36042058467865, "step": 338 }, { "epoch": 2.56, "grad_norm": 96.24151670858544, "learning_rate": 8.005617977528089e-08, "logps/chosen": -49.970359802246094, "logps/rejected": -62.65436553955078, "loss": 0.4763, "losses/dpo": 0.5293009281158447, "losses/sft": 1.9389904737472534, "losses/total": 0.5293009281158447, "ref_logps/chosen": -44.4184684753418, "ref_logps/rejected": -49.791622161865234, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5551893711090088, "rewards/margins": 0.7310845851898193, "rewards/rejected": -1.2862739562988281, "step": 339 }, { "epoch": 2.57, "grad_norm": 30.22245118555741, "learning_rate": 7.865168539325842e-08, "logps/chosen": -51.74867248535156, "logps/rejected": -65.66789245605469, "loss": 0.4554, "losses/dpo": 0.3766622245311737, "losses/sft": 1.8082342147827148, "losses/total": 0.3766622245311737, "ref_logps/chosen": -44.85212326049805, "ref_logps/rejected": -50.816123962402344, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.6896551847457886, "rewards/margins": 0.7955220341682434, "rewards/rejected": -1.4851771593093872, "step": 340 }, { "epoch": 2.57, "grad_norm": 37.15455327064764, "learning_rate": 7.724719101123594e-08, "logps/chosen": -50.156883239746094, "logps/rejected": -64.04041290283203, "loss": 0.4608, "losses/dpo": 0.5242490768432617, "losses/sft": 1.6807560920715332, "losses/total": 0.5242490768432617, "ref_logps/chosen": -44.362098693847656, "ref_logps/rejected": -49.7980842590332, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5794786214828491, "rewards/margins": 0.8447539210319519, "rewards/rejected": -1.4242324829101562, "step": 341 }, { "epoch": 2.58, "grad_norm": 48.143968184961885, "learning_rate": 7.584269662921348e-08, "logps/chosen": -50.02086639404297, "logps/rejected": -64.91596221923828, "loss": 0.4891, "losses/dpo": 0.397860586643219, "losses/sft": 1.852252721786499, "losses/total": 0.397860586643219, "ref_logps/chosen": -43.93117141723633, "ref_logps/rejected": -51.04300308227539, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.608969509601593, "rewards/margins": 0.7783259153366089, "rewards/rejected": -1.3872952461242676, "step": 342 }, { "epoch": 2.59, "grad_norm": 33.89319240138789, "learning_rate": 7.443820224719101e-08, "logps/chosen": -52.192691802978516, "logps/rejected": -63.581764221191406, "loss": 0.4545, "losses/dpo": 0.4276864230632782, "losses/sft": 1.9902665615081787, "losses/total": 0.4276864230632782, "ref_logps/chosen": -44.828529357910156, "ref_logps/rejected": -48.46653747558594, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.736416220664978, "rewards/margins": 0.7751063108444214, "rewards/rejected": -1.5115225315093994, "step": 343 }, { "epoch": 2.6, "grad_norm": 43.99405184376731, "learning_rate": 7.303370786516853e-08, "logps/chosen": -46.289390563964844, "logps/rejected": -59.206939697265625, "loss": 0.4685, "losses/dpo": 0.29001960158348083, "losses/sft": 1.6942520141601562, "losses/total": 0.29001960158348083, "ref_logps/chosen": -41.39132308959961, "ref_logps/rejected": -46.28327560424805, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.4898071885108948, "rewards/margins": 0.8025588989257812, "rewards/rejected": -1.2923660278320312, "step": 344 }, { "epoch": 2.61, "grad_norm": 24.542582646496427, "learning_rate": 7.162921348314606e-08, "logps/chosen": -51.44779968261719, "logps/rejected": -69.62947845458984, "loss": 0.4423, "losses/dpo": 0.2786746919155121, "losses/sft": 1.7554144859313965, "losses/total": 0.2786746919155121, "ref_logps/chosen": -44.62749099731445, "ref_logps/rejected": -54.2393913269043, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6820304989814758, "rewards/margins": 0.8569778203964233, "rewards/rejected": -1.5390081405639648, "step": 345 }, { "epoch": 2.61, "grad_norm": 64.52314127692446, "learning_rate": 7.022471910112359e-08, "logps/chosen": -52.3893928527832, "logps/rejected": -67.7752914428711, "loss": 0.4505, "losses/dpo": 0.4787806272506714, "losses/sft": 1.791824460029602, "losses/total": 0.4787806272506714, "ref_logps/chosen": -46.29042434692383, "ref_logps/rejected": -53.566009521484375, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.6098969578742981, "rewards/margins": 0.8110312819480896, "rewards/rejected": -1.4209282398223877, "step": 346 }, { "epoch": 2.62, "grad_norm": 69.4528370912726, "learning_rate": 6.882022471910112e-08, "logps/chosen": -44.024444580078125, "logps/rejected": -64.21692657470703, "loss": 0.4349, "losses/dpo": 0.4624437689781189, "losses/sft": 1.7515839338302612, "losses/total": 0.4624437689781189, "ref_logps/chosen": -38.87629318237305, "ref_logps/rejected": -50.59605407714844, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5148147940635681, "rewards/margins": 0.8472726345062256, "rewards/rejected": -1.3620874881744385, "step": 347 }, { "epoch": 2.63, "grad_norm": 50.33764152370375, "learning_rate": 6.741573033707864e-08, "logps/chosen": -50.645389556884766, "logps/rejected": -63.18524932861328, "loss": 0.4513, "losses/dpo": 0.46827107667922974, "losses/sft": 1.838287591934204, "losses/total": 0.46827107667922974, "ref_logps/chosen": -44.73358154296875, "ref_logps/rejected": -49.35152816772461, "rewards/accuracies": 0.828125, "rewards/chosen": -0.591181218624115, "rewards/margins": 0.7921910285949707, "rewards/rejected": -1.3833723068237305, "step": 348 }, { "epoch": 2.64, "grad_norm": 26.608460140756165, "learning_rate": 6.601123595505617e-08, "logps/chosen": -50.00164794921875, "logps/rejected": -61.464996337890625, "loss": 0.4761, "losses/dpo": 0.4410756230354309, "losses/sft": 1.812739610671997, "losses/total": 0.4410756230354309, "ref_logps/chosen": -44.06976318359375, "ref_logps/rejected": -47.535804748535156, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5931889414787292, "rewards/margins": 0.7997298240661621, "rewards/rejected": -1.3929188251495361, "step": 349 }, { "epoch": 2.64, "grad_norm": 58.64902180951105, "learning_rate": 6.460674157303371e-08, "logps/chosen": -51.31767272949219, "logps/rejected": -63.81333923339844, "loss": 0.5101, "losses/dpo": 0.4898483455181122, "losses/sft": 1.7123777866363525, "losses/total": 0.4898483455181122, "ref_logps/chosen": -44.28520965576172, "ref_logps/rejected": -50.04383850097656, "rewards/accuracies": 0.75, "rewards/chosen": -0.7032464742660522, "rewards/margins": 0.6737034320831299, "rewards/rejected": -1.3769499063491821, "step": 350 }, { "epoch": 2.65, "grad_norm": 50.51800904750087, "learning_rate": 6.320224719101123e-08, "logps/chosen": -53.721038818359375, "logps/rejected": -65.52702331542969, "loss": 0.5178, "losses/dpo": 0.3761449158191681, "losses/sft": 1.8855061531066895, "losses/total": 0.3761449158191681, "ref_logps/chosen": -47.294620513916016, "ref_logps/rejected": -52.42991256713867, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.6426420211791992, "rewards/margins": 0.6670692563056946, "rewards/rejected": -1.3097113370895386, "step": 351 }, { "epoch": 2.66, "grad_norm": 32.51611028866403, "learning_rate": 6.179775280898876e-08, "logps/chosen": -50.22157669067383, "logps/rejected": -67.7969741821289, "loss": 0.4182, "losses/dpo": 0.378769189119339, "losses/sft": 1.9291467666625977, "losses/total": 0.378769189119339, "ref_logps/chosen": -44.68474578857422, "ref_logps/rejected": -53.204341888427734, "rewards/accuracies": 0.8359375, "rewards/chosen": -0.5536828637123108, "rewards/margins": 0.9055807590484619, "rewards/rejected": -1.4592636823654175, "step": 352 }, { "epoch": 2.67, "grad_norm": 44.416508418781206, "learning_rate": 6.039325842696629e-08, "logps/chosen": -47.69599914550781, "logps/rejected": -63.027652740478516, "loss": 0.4622, "losses/dpo": 0.4440404176712036, "losses/sft": 1.5474292039871216, "losses/total": 0.4440404176712036, "ref_logps/chosen": -41.83900451660156, "ref_logps/rejected": -49.1169319152832, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5856993198394775, "rewards/margins": 0.8053735494613647, "rewards/rejected": -1.3910728693008423, "step": 353 }, { "epoch": 2.67, "grad_norm": 44.30837370795093, "learning_rate": 5.898876404494382e-08, "logps/chosen": -49.815650939941406, "logps/rejected": -64.16575622558594, "loss": 0.4929, "losses/dpo": 0.46184009313583374, "losses/sft": 1.791625738143921, "losses/total": 0.46184009313583374, "ref_logps/chosen": -43.65359878540039, "ref_logps/rejected": -50.70054244995117, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6162051558494568, "rewards/margins": 0.7303167581558228, "rewards/rejected": -1.3465218544006348, "step": 354 }, { "epoch": 2.68, "grad_norm": 26.83883847264626, "learning_rate": 5.758426966292135e-08, "logps/chosen": -50.534114837646484, "logps/rejected": -69.07585906982422, "loss": 0.4705, "losses/dpo": 0.4179081320762634, "losses/sft": 1.91859769821167, "losses/total": 0.4179081320762634, "ref_logps/chosen": -43.95571517944336, "ref_logps/rejected": -54.288936614990234, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6578400135040283, "rewards/margins": 0.8208526968955994, "rewards/rejected": -1.4786927700042725, "step": 355 }, { "epoch": 2.69, "grad_norm": 60.5287709010347, "learning_rate": 5.617977528089887e-08, "logps/chosen": -52.27357864379883, "logps/rejected": -68.36000061035156, "loss": 0.4323, "losses/dpo": 0.3623283803462982, "losses/sft": 1.7562377452850342, "losses/total": 0.3623283803462982, "ref_logps/chosen": -45.946617126464844, "ref_logps/rejected": -53.131263732910156, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6326963901519775, "rewards/margins": 0.890177845954895, "rewards/rejected": -1.5228742361068726, "step": 356 }, { "epoch": 2.7, "grad_norm": 34.438775651372794, "learning_rate": 5.47752808988764e-08, "logps/chosen": -50.114559173583984, "logps/rejected": -63.834102630615234, "loss": 0.4351, "losses/dpo": 0.3904572129249573, "losses/sft": 1.9670743942260742, "losses/total": 0.3904572129249573, "ref_logps/chosen": -43.60685729980469, "ref_logps/rejected": -49.31564712524414, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6507695913314819, "rewards/margins": 0.80107581615448, "rewards/rejected": -1.4518452882766724, "step": 357 }, { "epoch": 2.7, "grad_norm": 51.4060264804255, "learning_rate": 5.3370786516853926e-08, "logps/chosen": -50.31044006347656, "logps/rejected": -66.86388397216797, "loss": 0.4916, "losses/dpo": 0.39848509430885315, "losses/sft": 1.754665493965149, "losses/total": 0.39848509430885315, "ref_logps/chosen": -44.119384765625, "ref_logps/rejected": -53.55644226074219, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6191055178642273, "rewards/margins": 0.7116389274597168, "rewards/rejected": -1.3307443857192993, "step": 358 }, { "epoch": 2.71, "grad_norm": 43.99610997955244, "learning_rate": 5.196629213483146e-08, "logps/chosen": -54.09105682373047, "logps/rejected": -63.90288543701172, "loss": 0.538, "losses/dpo": 0.4680481255054474, "losses/sft": 1.9898825883865356, "losses/total": 0.4680481255054474, "ref_logps/chosen": -46.606971740722656, "ref_logps/rejected": -50.23580551147461, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.7484087347984314, "rewards/margins": 0.6182990074157715, "rewards/rejected": -1.3667078018188477, "step": 359 }, { "epoch": 2.72, "grad_norm": 34.47558938777199, "learning_rate": 5.056179775280899e-08, "logps/chosen": -52.71617126464844, "logps/rejected": -62.39817810058594, "loss": 0.5507, "losses/dpo": 0.6964555382728577, "losses/sft": 2.0210726261138916, "losses/total": 0.6964555382728577, "ref_logps/chosen": -45.996063232421875, "ref_logps/rejected": -49.64103698730469, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6720105409622192, "rewards/margins": 0.6037033200263977, "rewards/rejected": -1.2757140398025513, "step": 360 }, { "epoch": 2.73, "grad_norm": 43.61515969262683, "learning_rate": 4.915730337078652e-08, "logps/chosen": -52.628143310546875, "logps/rejected": -64.12786865234375, "loss": 0.4843, "losses/dpo": 0.3313373327255249, "losses/sft": 1.5305544137954712, "losses/total": 0.3313373327255249, "ref_logps/chosen": -46.06536865234375, "ref_logps/rejected": -50.14955139160156, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6562776565551758, "rewards/margins": 0.7415533065795898, "rewards/rejected": -1.3978309631347656, "step": 361 }, { "epoch": 2.73, "grad_norm": 25.704004544325855, "learning_rate": 4.775280898876404e-08, "logps/chosen": -48.099761962890625, "logps/rejected": -63.34098815917969, "loss": 0.4885, "losses/dpo": 0.2366529405117035, "losses/sft": 1.888703465461731, "losses/total": 0.2366529405117035, "ref_logps/chosen": -41.7584114074707, "ref_logps/rejected": -49.44350814819336, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6341351866722107, "rewards/margins": 0.7556129693984985, "rewards/rejected": -1.389748215675354, "step": 362 }, { "epoch": 2.74, "grad_norm": 35.962942116970424, "learning_rate": 4.634831460674157e-08, "logps/chosen": -48.865333557128906, "logps/rejected": -59.20051956176758, "loss": 0.4752, "losses/dpo": 0.4012467861175537, "losses/sft": 1.7801015377044678, "losses/total": 0.4012467861175537, "ref_logps/chosen": -43.125667572021484, "ref_logps/rejected": -45.5601806640625, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5739666819572449, "rewards/margins": 0.7900673747062683, "rewards/rejected": -1.3640341758728027, "step": 363 }, { "epoch": 2.75, "grad_norm": 54.55920815827558, "learning_rate": 4.4943820224719096e-08, "logps/chosen": -53.988914489746094, "logps/rejected": -64.75935363769531, "loss": 0.465, "losses/dpo": 0.4934675991535187, "losses/sft": 1.8313616514205933, "losses/total": 0.4934675991535187, "ref_logps/chosen": -48.053871154785156, "ref_logps/rejected": -50.50077819824219, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.593504786491394, "rewards/margins": 0.8323536515235901, "rewards/rejected": -1.425858497619629, "step": 364 }, { "epoch": 2.76, "grad_norm": 498.12301261000334, "learning_rate": 4.3539325842696626e-08, "logps/chosen": -51.51012420654297, "logps/rejected": -61.9508171081543, "loss": 0.4975, "losses/dpo": 0.5688412189483643, "losses/sft": 1.6108700037002563, "losses/total": 0.5688412189483643, "ref_logps/chosen": -44.91189193725586, "ref_logps/rejected": -48.141685485839844, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.6598237156867981, "rewards/margins": 0.7210899591445923, "rewards/rejected": -1.3809137344360352, "step": 365 }, { "epoch": 2.76, "grad_norm": 56.57870553404572, "learning_rate": 4.213483146067416e-08, "logps/chosen": -53.36735153198242, "logps/rejected": -65.3797607421875, "loss": 0.4971, "losses/dpo": 0.3451189398765564, "losses/sft": 1.7971303462982178, "losses/total": 0.3451189398765564, "ref_logps/chosen": -46.28878402709961, "ref_logps/rejected": -50.28190612792969, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.7078566551208496, "rewards/margins": 0.8019291758537292, "rewards/rejected": -1.509785771369934, "step": 366 }, { "epoch": 2.77, "grad_norm": 35.7157360789296, "learning_rate": 4.073033707865169e-08, "logps/chosen": -48.85197067260742, "logps/rejected": -61.11885070800781, "loss": 0.4684, "losses/dpo": 0.4802433252334595, "losses/sft": 1.8366670608520508, "losses/total": 0.4802433252334595, "ref_logps/chosen": -42.935726165771484, "ref_logps/rejected": -47.24919509887695, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5916240215301514, "rewards/margins": 0.7953420281410217, "rewards/rejected": -1.3869661092758179, "step": 367 }, { "epoch": 2.78, "grad_norm": 51.157782939757794, "learning_rate": 3.932584269662921e-08, "logps/chosen": -51.569740295410156, "logps/rejected": -68.38870239257812, "loss": 0.4961, "losses/dpo": 0.47486403584480286, "losses/sft": 1.7938294410705566, "losses/total": 0.47486403584480286, "ref_logps/chosen": -44.61973571777344, "ref_logps/rejected": -54.07467269897461, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6950002908706665, "rewards/margins": 0.7364022731781006, "rewards/rejected": -1.431402564048767, "step": 368 }, { "epoch": 2.79, "grad_norm": 36.507984471442086, "learning_rate": 3.792134831460674e-08, "logps/chosen": -46.7285041809082, "logps/rejected": -64.37630462646484, "loss": 0.4692, "losses/dpo": 0.4750906527042389, "losses/sft": 1.7684688568115234, "losses/total": 0.4750906527042389, "ref_logps/chosen": -40.8341178894043, "ref_logps/rejected": -50.90059280395508, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.5894384980201721, "rewards/margins": 0.758132815361023, "rewards/rejected": -1.3475713729858398, "step": 369 }, { "epoch": 2.79, "grad_norm": 37.56925209585322, "learning_rate": 3.6516853932584266e-08, "logps/chosen": -51.11574172973633, "logps/rejected": -66.00533294677734, "loss": 0.4859, "losses/dpo": 0.5161402821540833, "losses/sft": 1.9823883771896362, "losses/total": 0.5161402821540833, "ref_logps/chosen": -45.08394241333008, "ref_logps/rejected": -51.98117446899414, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6031798720359802, "rewards/margins": 0.7992358207702637, "rewards/rejected": -1.4024156332015991, "step": 370 }, { "epoch": 2.8, "grad_norm": 35.773750173647194, "learning_rate": 3.5112359550561796e-08, "logps/chosen": -50.73906707763672, "logps/rejected": -65.67009735107422, "loss": 0.5317, "losses/dpo": 0.4043237864971161, "losses/sft": 1.7889103889465332, "losses/total": 0.4043237864971161, "ref_logps/chosen": -44.70402526855469, "ref_logps/rejected": -52.93720626831055, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6035044193267822, "rewards/margins": 0.6697853207588196, "rewards/rejected": -1.2732897996902466, "step": 371 }, { "epoch": 2.81, "grad_norm": 45.33847715654187, "learning_rate": 3.370786516853932e-08, "logps/chosen": -54.437904357910156, "logps/rejected": -66.0970458984375, "loss": 0.5453, "losses/dpo": 0.6512450575828552, "losses/sft": 2.1795942783355713, "losses/total": 0.6512450575828552, "ref_logps/chosen": -47.7617073059082, "ref_logps/rejected": -52.72614288330078, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6676197648048401, "rewards/margins": 0.6694698929786682, "rewards/rejected": -1.3370895385742188, "step": 372 }, { "epoch": 2.82, "grad_norm": 36.49688748807524, "learning_rate": 3.230337078651686e-08, "logps/chosen": -50.890132904052734, "logps/rejected": -73.4024887084961, "loss": 0.4093, "losses/dpo": 0.17958246171474457, "losses/sft": 1.7895946502685547, "losses/total": 0.17958246171474457, "ref_logps/chosen": -45.62337112426758, "ref_logps/rejected": -57.74752426147461, "rewards/accuracies": 0.8359375, "rewards/chosen": -0.52667635679245, "rewards/margins": 1.0388202667236328, "rewards/rejected": -1.5654964447021484, "step": 373 }, { "epoch": 2.82, "grad_norm": 36.484710365490294, "learning_rate": 3.089887640449438e-08, "logps/chosen": -51.731353759765625, "logps/rejected": -68.29804229736328, "loss": 0.4708, "losses/dpo": 0.35531070828437805, "losses/sft": 1.9772918224334717, "losses/total": 0.35531070828437805, "ref_logps/chosen": -44.610931396484375, "ref_logps/rejected": -53.26063537597656, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.7120431661605835, "rewards/margins": 0.7916973829269409, "rewards/rejected": -1.5037405490875244, "step": 374 }, { "epoch": 2.83, "grad_norm": 39.305105234428076, "learning_rate": 2.949438202247191e-08, "logps/chosen": -50.83768081665039, "logps/rejected": -66.35530090332031, "loss": 0.4815, "losses/dpo": 0.31703487038612366, "losses/sft": 1.7421314716339111, "losses/total": 0.31703487038612366, "ref_logps/chosen": -45.02192306518555, "ref_logps/rejected": -52.580936431884766, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5815759897232056, "rewards/margins": 0.7958606481552124, "rewards/rejected": -1.3774365186691284, "step": 375 }, { "epoch": 2.84, "grad_norm": 38.666318562584785, "learning_rate": 2.8089887640449436e-08, "logps/chosen": -48.752010345458984, "logps/rejected": -67.23719787597656, "loss": 0.4409, "losses/dpo": 0.40991082787513733, "losses/sft": 1.9794155359268188, "losses/total": 0.40991082787513733, "ref_logps/chosen": -42.61652755737305, "ref_logps/rejected": -52.13258743286133, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6135481595993042, "rewards/margins": 0.8969129323959351, "rewards/rejected": -1.5104612112045288, "step": 376 }, { "epoch": 2.85, "grad_norm": 31.535770583598257, "learning_rate": 2.6685393258426963e-08, "logps/chosen": -51.15449142456055, "logps/rejected": -67.33303833007812, "loss": 0.4349, "losses/dpo": 0.2747466564178467, "losses/sft": 1.676353931427002, "losses/total": 0.2747466564178467, "ref_logps/chosen": -45.100257873535156, "ref_logps/rejected": -52.70699691772461, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6054239273071289, "rewards/margins": 0.8571805953979492, "rewards/rejected": -1.4626045227050781, "step": 377 }, { "epoch": 2.85, "grad_norm": 32.412403328044626, "learning_rate": 2.5280898876404493e-08, "logps/chosen": -44.84325408935547, "logps/rejected": -64.21622467041016, "loss": 0.4617, "losses/dpo": 0.5246663093566895, "losses/sft": 1.594354510307312, "losses/total": 0.5246663093566895, "ref_logps/chosen": -39.90529251098633, "ref_logps/rejected": -50.83759307861328, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.4937964379787445, "rewards/margins": 0.8440677523612976, "rewards/rejected": -1.3378642797470093, "step": 378 }, { "epoch": 2.86, "grad_norm": 54.232267084634124, "learning_rate": 2.387640449438202e-08, "logps/chosen": -55.683292388916016, "logps/rejected": -67.46990203857422, "loss": 0.4478, "losses/dpo": 0.3696019649505615, "losses/sft": 1.8699058294296265, "losses/total": 0.3696019649505615, "ref_logps/chosen": -49.01001739501953, "ref_logps/rejected": -52.054466247558594, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.6673274636268616, "rewards/margins": 0.8742159008979797, "rewards/rejected": -1.5415434837341309, "step": 379 }, { "epoch": 2.87, "grad_norm": 27.144290066670596, "learning_rate": 2.2471910112359548e-08, "logps/chosen": -45.49407958984375, "logps/rejected": -61.40216827392578, "loss": 0.4403, "losses/dpo": 0.49002838134765625, "losses/sft": 1.7484146356582642, "losses/total": 0.49002838134765625, "ref_logps/chosen": -41.38385009765625, "ref_logps/rejected": -49.091552734375, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.41102296113967896, "rewards/margins": 0.8200392127037048, "rewards/rejected": -1.2310621738433838, "step": 380 }, { "epoch": 2.88, "grad_norm": 40.238604240086836, "learning_rate": 2.106741573033708e-08, "logps/chosen": -49.64447784423828, "logps/rejected": -62.87236022949219, "loss": 0.4942, "losses/dpo": 0.42432701587677, "losses/sft": 1.961887240409851, "losses/total": 0.42432701587677, "ref_logps/chosen": -43.57355499267578, "ref_logps/rejected": -49.93211364746094, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6070915460586548, "rewards/margins": 0.686933159828186, "rewards/rejected": -1.2940247058868408, "step": 381 }, { "epoch": 2.88, "grad_norm": 54.840747205713996, "learning_rate": 1.9662921348314606e-08, "logps/chosen": -46.10314178466797, "logps/rejected": -61.747676849365234, "loss": 0.4322, "losses/dpo": 0.44490480422973633, "losses/sft": 1.522992491722107, "losses/total": 0.44490480422973633, "ref_logps/chosen": -40.64675521850586, "ref_logps/rejected": -47.95560836791992, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5456385016441345, "rewards/margins": 0.8335686326026917, "rewards/rejected": -1.3792071342468262, "step": 382 }, { "epoch": 2.89, "grad_norm": 29.570285453159077, "learning_rate": 1.8258426966292133e-08, "logps/chosen": -52.12841796875, "logps/rejected": -61.87736129760742, "loss": 0.4963, "losses/dpo": 0.590352475643158, "losses/sft": 1.8217315673828125, "losses/total": 0.590352475643158, "ref_logps/chosen": -46.294654846191406, "ref_logps/rejected": -49.08687973022461, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.5833764672279358, "rewards/margins": 0.6956723928451538, "rewards/rejected": -1.2790489196777344, "step": 383 }, { "epoch": 2.9, "grad_norm": 58.50144713719891, "learning_rate": 1.685393258426966e-08, "logps/chosen": -50.13311004638672, "logps/rejected": -66.59416961669922, "loss": 0.4762, "losses/dpo": 0.3490986227989197, "losses/sft": 1.648133397102356, "losses/total": 0.3490986227989197, "ref_logps/chosen": -43.85565948486328, "ref_logps/rejected": -52.042564392089844, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.6277450919151306, "rewards/margins": 0.8274149894714355, "rewards/rejected": -1.455160140991211, "step": 384 }, { "epoch": 2.91, "grad_norm": 41.383851631785475, "learning_rate": 1.544943820224719e-08, "logps/chosen": -51.102684020996094, "logps/rejected": -64.78075408935547, "loss": 0.4622, "losses/dpo": 0.45986640453338623, "losses/sft": 1.7652826309204102, "losses/total": 0.45986640453338623, "ref_logps/chosen": -45.024803161621094, "ref_logps/rejected": -51.074867248535156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6077881455421448, "rewards/margins": 0.7628008723258972, "rewards/rejected": -1.370589017868042, "step": 385 }, { "epoch": 2.91, "grad_norm": 37.49654140282014, "learning_rate": 1.4044943820224718e-08, "logps/chosen": -43.61898422241211, "logps/rejected": -57.51544952392578, "loss": 0.4482, "losses/dpo": 0.35744839906692505, "losses/sft": 1.6450403928756714, "losses/total": 0.35744839906692505, "ref_logps/chosen": -38.685218811035156, "ref_logps/rejected": -44.75885772705078, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4933765232563019, "rewards/margins": 0.7822825312614441, "rewards/rejected": -1.2756590843200684, "step": 386 }, { "epoch": 2.92, "grad_norm": 31.703867259636723, "learning_rate": 1.2640449438202247e-08, "logps/chosen": -48.789649963378906, "logps/rejected": -60.23027801513672, "loss": 0.5171, "losses/dpo": 0.5088573694229126, "losses/sft": 1.8967269659042358, "losses/total": 0.5088573694229126, "ref_logps/chosen": -42.69738006591797, "ref_logps/rejected": -47.479244232177734, "rewards/accuracies": 0.75, "rewards/chosen": -0.609227180480957, "rewards/margins": 0.6658761501312256, "rewards/rejected": -1.2751033306121826, "step": 387 }, { "epoch": 2.93, "grad_norm": 107.78566750167148, "learning_rate": 1.1235955056179774e-08, "logps/chosen": -48.16749954223633, "logps/rejected": -60.30778503417969, "loss": 0.5331, "losses/dpo": 0.4899829626083374, "losses/sft": 1.963571548461914, "losses/total": 0.4899829626083374, "ref_logps/chosen": -41.387779235839844, "ref_logps/rejected": -47.53449630737305, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6779724359512329, "rewards/margins": 0.5993563532829285, "rewards/rejected": -1.2773289680480957, "step": 388 }, { "epoch": 2.94, "grad_norm": 35.551684867961754, "learning_rate": 9.831460674157303e-09, "logps/chosen": -49.80058288574219, "logps/rejected": -67.81304931640625, "loss": 0.4536, "losses/dpo": 0.356619656085968, "losses/sft": 1.649460792541504, "losses/total": 0.356619656085968, "ref_logps/chosen": -44.282352447509766, "ref_logps/rejected": -53.48949432373047, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.5518239140510559, "rewards/margins": 0.8805313110351562, "rewards/rejected": -1.4323550462722778, "step": 389 }, { "epoch": 2.94, "grad_norm": 34.94248228455842, "learning_rate": 8.42696629213483e-09, "logps/chosen": -50.04351806640625, "logps/rejected": -62.3614387512207, "loss": 0.4962, "losses/dpo": 0.4704954922199249, "losses/sft": 1.8061951398849487, "losses/total": 0.4704954922199249, "ref_logps/chosen": -43.138275146484375, "ref_logps/rejected": -47.9213981628418, "rewards/accuracies": 0.734375, "rewards/chosen": -0.690524160861969, "rewards/margins": 0.7534796595573425, "rewards/rejected": -1.4440038204193115, "step": 390 }, { "epoch": 2.95, "grad_norm": 30.55473751387296, "learning_rate": 7.022471910112359e-09, "logps/chosen": -48.262489318847656, "logps/rejected": -68.67484283447266, "loss": 0.4187, "losses/dpo": 0.24351564049720764, "losses/sft": 1.8308738470077515, "losses/total": 0.24351564049720764, "ref_logps/chosen": -43.282691955566406, "ref_logps/rejected": -54.3927001953125, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4979794919490814, "rewards/margins": 0.9302344918251038, "rewards/rejected": -1.428214192390442, "step": 391 }, { "epoch": 2.96, "grad_norm": 38.23111934380986, "learning_rate": 5.617977528089887e-09, "logps/chosen": -49.23915100097656, "logps/rejected": -62.18607711791992, "loss": 0.4863, "losses/dpo": 0.43632444739341736, "losses/sft": 1.9880850315093994, "losses/total": 0.43632444739341736, "ref_logps/chosen": -43.663368225097656, "ref_logps/rejected": -48.774288177490234, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.5575782060623169, "rewards/margins": 0.7836005091667175, "rewards/rejected": -1.3411787748336792, "step": 392 }, { "epoch": 2.97, "grad_norm": 37.97152202471527, "learning_rate": 4.213483146067415e-09, "logps/chosen": -50.27070617675781, "logps/rejected": -61.61231994628906, "loss": 0.5257, "losses/dpo": 0.38129669427871704, "losses/sft": 1.688190221786499, "losses/total": 0.38129669427871704, "ref_logps/chosen": -44.26463317871094, "ref_logps/rejected": -48.561580657958984, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6006075143814087, "rewards/margins": 0.7044662237167358, "rewards/rejected": -1.305073857307434, "step": 393 }, { "epoch": 2.97, "grad_norm": 49.13135082442501, "learning_rate": 2.8089887640449435e-09, "logps/chosen": -50.83762741088867, "logps/rejected": -63.6076545715332, "loss": 0.4686, "losses/dpo": 0.4519101083278656, "losses/sft": 1.6160304546356201, "losses/total": 0.4519101083278656, "ref_logps/chosen": -44.982017517089844, "ref_logps/rejected": -49.98162841796875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5855613946914673, "rewards/margins": 0.7770410776138306, "rewards/rejected": -1.3626024723052979, "step": 394 }, { "epoch": 2.98, "grad_norm": 46.78471874836804, "learning_rate": 1.4044943820224717e-09, "logps/chosen": -53.5521354675293, "logps/rejected": -66.68860626220703, "loss": 0.4627, "losses/dpo": 0.42306211590766907, "losses/sft": 2.1937084197998047, "losses/total": 0.42306211590766907, "ref_logps/chosen": -47.2824821472168, "ref_logps/rejected": -52.29182815551758, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.6269652247428894, "rewards/margins": 0.8127122521400452, "rewards/rejected": -1.4396774768829346, "step": 395 }, { "epoch": 2.99, "grad_norm": 43.94685006929935, "learning_rate": 0.0, "logps/chosen": -52.710567474365234, "logps/rejected": -63.17184066772461, "loss": 0.5041, "losses/dpo": 0.3837814927101135, "losses/sft": 1.647486925125122, "losses/total": 0.3837814927101135, "ref_logps/chosen": -46.09189224243164, "ref_logps/rejected": -49.62699508666992, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.661867618560791, "rewards/margins": 0.6926167011260986, "rewards/rejected": -1.3544843196868896, "step": 396 }, { "epoch": 2.99, "step": 396, "total_flos": 0.0, "train_loss": 0.5606525407897102, "train_runtime": 13857.9394, "train_samples_per_second": 3.671, "train_steps_per_second": 0.029 } ], "logging_steps": 1.0, "max_steps": 396, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }