{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987085665088248, "eval_steps": 500, "global_step": 290, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0034438226431338786, "grad_norm": 52.674065050138765, "learning_rate": 2.7586206896551723e-08, "logits/chosen": -5.615417003631592, "logits/rejected": -5.667238712310791, "logps/chosen": -0.4943414330482483, "logps/rejected": -0.6143913865089417, "loss": 5.1079, "rewards/accuracies": 0.5, "rewards/chosen": -4.943414688110352, "rewards/margins": 1.200499415397644, "rewards/rejected": -6.143914222717285, "step": 1 }, { "epoch": 0.006887645286267757, "grad_norm": 60.60351785955272, "learning_rate": 5.517241379310345e-08, "logits/chosen": -5.4836554527282715, "logits/rejected": -5.671990871429443, "logps/chosen": -0.5493791103363037, "logps/rejected": -0.6132436990737915, "loss": 5.0887, "rewards/accuracies": 0.6875, "rewards/chosen": -5.493791103363037, "rewards/margins": 0.638646125793457, "rewards/rejected": -6.132437705993652, "step": 2 }, { "epoch": 0.010331467929401636, "grad_norm": 59.59005871499247, "learning_rate": 8.275862068965517e-08, "logits/chosen": -5.246090412139893, "logits/rejected": -5.381677627563477, "logps/chosen": -0.45825690031051636, "logps/rejected": -0.531363308429718, "loss": 5.0369, "rewards/accuracies": 0.75, "rewards/chosen": -4.582569122314453, "rewards/margins": 0.7310642004013062, "rewards/rejected": -5.313632965087891, "step": 3 }, { "epoch": 0.013775290572535515, "grad_norm": 58.937392298416974, "learning_rate": 1.103448275862069e-07, "logits/chosen": -5.279723167419434, "logits/rejected": -5.392016410827637, "logps/chosen": -0.5628327131271362, "logps/rejected": -0.5180907249450684, "loss": 5.3846, "rewards/accuracies": 0.3125, "rewards/chosen": -5.628327369689941, "rewards/margins": -0.4474201798439026, "rewards/rejected": -5.180907249450684, "step": 4 }, { "epoch": 0.017219113215669393, "grad_norm": 51.34542849508367, "learning_rate": 1.3793103448275863e-07, "logits/chosen": -5.257406234741211, "logits/rejected": -5.315362453460693, "logps/chosen": -0.525193989276886, "logps/rejected": -0.572905421257019, "loss": 5.1008, "rewards/accuracies": 0.5625, "rewards/chosen": -5.25193977355957, "rewards/margins": 0.47711431980133057, "rewards/rejected": -5.729053974151611, "step": 5 }, { "epoch": 0.020662935858803272, "grad_norm": 62.72154095060763, "learning_rate": 1.6551724137931034e-07, "logits/chosen": -5.621905326843262, "logits/rejected": -5.849234104156494, "logps/chosen": -0.5514707565307617, "logps/rejected": -0.5989887118339539, "loss": 5.0616, "rewards/accuracies": 0.4375, "rewards/chosen": -5.514707565307617, "rewards/margins": 0.47517985105514526, "rewards/rejected": -5.989887237548828, "step": 6 }, { "epoch": 0.02410675850193715, "grad_norm": 58.76872623763212, "learning_rate": 1.9310344827586208e-07, "logits/chosen": -5.585313320159912, "logits/rejected": -5.405150413513184, "logps/chosen": -0.6232742667198181, "logps/rejected": -0.5102998614311218, "loss": 5.2864, "rewards/accuracies": 0.6875, "rewards/chosen": -6.232743263244629, "rewards/margins": -1.129744529724121, "rewards/rejected": -5.102999210357666, "step": 7 }, { "epoch": 0.02755058114507103, "grad_norm": 49.99034574764161, "learning_rate": 2.206896551724138e-07, "logits/chosen": -5.209949016571045, "logits/rejected": -5.195341110229492, "logps/chosen": -0.47157737612724304, "logps/rejected": -0.47744542360305786, "loss": 5.1666, "rewards/accuracies": 0.6875, "rewards/chosen": -4.715773582458496, "rewards/margins": 0.05868096649646759, "rewards/rejected": -4.7744550704956055, "step": 8 }, { "epoch": 0.030994403788204908, "grad_norm": 58.66744740284535, "learning_rate": 2.482758620689655e-07, "logits/chosen": -5.36690616607666, "logits/rejected": -5.485147953033447, "logps/chosen": -0.5631701946258545, "logps/rejected": -0.5826945304870605, "loss": 5.1211, "rewards/accuracies": 0.625, "rewards/chosen": -5.631701946258545, "rewards/margins": 0.1952425241470337, "rewards/rejected": -5.826944828033447, "step": 9 }, { "epoch": 0.034438226431338786, "grad_norm": 46.491425410700735, "learning_rate": 2.7586206896551726e-07, "logits/chosen": -5.605222225189209, "logits/rejected": -5.74063777923584, "logps/chosen": -0.501873791217804, "logps/rejected": -0.5232819318771362, "loss": 5.1411, "rewards/accuracies": 0.5, "rewards/chosen": -5.018738269805908, "rewards/margins": 0.21408089995384216, "rewards/rejected": -5.232819080352783, "step": 10 }, { "epoch": 0.037882049074472665, "grad_norm": 83.27704023131166, "learning_rate": 3.034482758620689e-07, "logits/chosen": -5.455163955688477, "logits/rejected": -5.336368560791016, "logps/chosen": -0.7997897267341614, "logps/rejected": -0.5844107270240784, "loss": 5.6532, "rewards/accuracies": 0.5, "rewards/chosen": -7.997897624969482, "rewards/margins": -2.1537904739379883, "rewards/rejected": -5.844107151031494, "step": 11 }, { "epoch": 0.041325871717606544, "grad_norm": 48.93901907364537, "learning_rate": 3.310344827586207e-07, "logits/chosen": -5.099009990692139, "logits/rejected": -5.078139781951904, "logps/chosen": -0.4147089719772339, "logps/rejected": -0.4047776460647583, "loss": 5.1796, "rewards/accuracies": 0.6875, "rewards/chosen": -4.147089958190918, "rewards/margins": -0.09931322932243347, "rewards/rejected": -4.047776699066162, "step": 12 }, { "epoch": 0.04476969436074042, "grad_norm": 50.65745487462828, "learning_rate": 3.586206896551724e-07, "logits/chosen": -4.989038467407227, "logits/rejected": -4.911070346832275, "logps/chosen": -0.5246780514717102, "logps/rejected": -0.47285518050193787, "loss": 5.0677, "rewards/accuracies": 0.5, "rewards/chosen": -5.2467803955078125, "rewards/margins": -0.5182281732559204, "rewards/rejected": -4.728552341461182, "step": 13 }, { "epoch": 0.0482135170038743, "grad_norm": 57.66601162283009, "learning_rate": 3.8620689655172415e-07, "logits/chosen": -5.70846700668335, "logits/rejected": -5.75710916519165, "logps/chosen": -0.44634702801704407, "logps/rejected": -0.4534481465816498, "loss": 5.1503, "rewards/accuracies": 0.4375, "rewards/chosen": -4.463469982147217, "rewards/margins": 0.07101157307624817, "rewards/rejected": -4.534482002258301, "step": 14 }, { "epoch": 0.05165733964700818, "grad_norm": 64.39971082160424, "learning_rate": 4.1379310344827586e-07, "logits/chosen": -5.157317161560059, "logits/rejected": -5.051291465759277, "logps/chosen": -0.5117197632789612, "logps/rejected": -0.6016834378242493, "loss": 5.1127, "rewards/accuracies": 0.625, "rewards/chosen": -5.117197036743164, "rewards/margins": 0.8996371626853943, "rewards/rejected": -6.016834735870361, "step": 15 }, { "epoch": 0.05510116229014206, "grad_norm": 43.28657458222468, "learning_rate": 4.413793103448276e-07, "logits/chosen": -5.220727920532227, "logits/rejected": -5.298764228820801, "logps/chosen": -0.4690421521663666, "logps/rejected": -0.4100668132305145, "loss": 4.9565, "rewards/accuracies": 0.375, "rewards/chosen": -4.6904215812683105, "rewards/margins": -0.5897536873817444, "rewards/rejected": -4.100667953491211, "step": 16 }, { "epoch": 0.05854498493327594, "grad_norm": 55.940864947672644, "learning_rate": 4.6896551724137923e-07, "logits/chosen": -5.025401592254639, "logits/rejected": -5.0607781410217285, "logps/chosen": -0.6796688437461853, "logps/rejected": -0.6925962567329407, "loss": 5.0153, "rewards/accuracies": 0.5, "rewards/chosen": -6.796688079833984, "rewards/margins": 0.12927353382110596, "rewards/rejected": -6.925961971282959, "step": 17 }, { "epoch": 0.061988807576409816, "grad_norm": 60.73894736133069, "learning_rate": 4.96551724137931e-07, "logits/chosen": -5.200845718383789, "logits/rejected": -5.258395195007324, "logps/chosen": -0.38808485865592957, "logps/rejected": -0.38210412859916687, "loss": 5.0964, "rewards/accuracies": 0.375, "rewards/chosen": -3.8808484077453613, "rewards/margins": -0.059806786477565765, "rewards/rejected": -3.8210418224334717, "step": 18 }, { "epoch": 0.0654326302195437, "grad_norm": 50.35855568678846, "learning_rate": 5.241379310344828e-07, "logits/chosen": -5.284404277801514, "logits/rejected": -5.353991508483887, "logps/chosen": -0.4545897841453552, "logps/rejected": -0.4399701952934265, "loss": 5.0991, "rewards/accuracies": 0.3125, "rewards/chosen": -4.545897483825684, "rewards/margins": -0.1461959034204483, "rewards/rejected": -4.399702548980713, "step": 19 }, { "epoch": 0.06887645286267757, "grad_norm": 103.40309567567355, "learning_rate": 5.517241379310345e-07, "logits/chosen": -5.317500114440918, "logits/rejected": -5.262624740600586, "logps/chosen": -0.4840647280216217, "logps/rejected": -0.6067878007888794, "loss": 5.1123, "rewards/accuracies": 0.5625, "rewards/chosen": -4.8406476974487305, "rewards/margins": 1.2272305488586426, "rewards/rejected": -6.067877769470215, "step": 20 }, { "epoch": 0.07232027550581145, "grad_norm": 46.89431893563974, "learning_rate": 5.793103448275862e-07, "logits/chosen": -5.2541890144348145, "logits/rejected": -5.130758285522461, "logps/chosen": -0.4697263538837433, "logps/rejected": -0.5909743905067444, "loss": 4.937, "rewards/accuracies": 0.5625, "rewards/chosen": -4.697263240814209, "rewards/margins": 1.2124805450439453, "rewards/rejected": -5.9097442626953125, "step": 21 }, { "epoch": 0.07576409814894533, "grad_norm": 67.56806825458494, "learning_rate": 6.068965517241378e-07, "logits/chosen": -4.811267375946045, "logits/rejected": -4.888763904571533, "logps/chosen": -0.4097307324409485, "logps/rejected": -0.40606409311294556, "loss": 5.1578, "rewards/accuracies": 0.375, "rewards/chosen": -4.097307205200195, "rewards/margins": -0.03666616231203079, "rewards/rejected": -4.060640811920166, "step": 22 }, { "epoch": 0.07920792079207921, "grad_norm": 61.29631937996971, "learning_rate": 6.344827586206897e-07, "logits/chosen": -4.587668418884277, "logits/rejected": -4.847754001617432, "logps/chosen": -0.5758047103881836, "logps/rejected": -0.5185554027557373, "loss": 4.8982, "rewards/accuracies": 0.5, "rewards/chosen": -5.758047580718994, "rewards/margins": -0.5724934339523315, "rewards/rejected": -5.185554027557373, "step": 23 }, { "epoch": 0.08265174343521309, "grad_norm": 93.2152404636039, "learning_rate": 6.620689655172414e-07, "logits/chosen": -4.800999641418457, "logits/rejected": -5.004166603088379, "logps/chosen": -0.3709278702735901, "logps/rejected": -0.4135555624961853, "loss": 4.8412, "rewards/accuracies": 0.5625, "rewards/chosen": -3.7092788219451904, "rewards/margins": 0.42627638578414917, "rewards/rejected": -4.135554790496826, "step": 24 }, { "epoch": 0.08609556607834697, "grad_norm": 64.42035341096593, "learning_rate": 6.89655172413793e-07, "logits/chosen": -4.344979286193848, "logits/rejected": -4.520305633544922, "logps/chosen": -0.46768859028816223, "logps/rejected": -0.4844363033771515, "loss": 4.9329, "rewards/accuracies": 0.4375, "rewards/chosen": -4.676885604858398, "rewards/margins": 0.16747775673866272, "rewards/rejected": -4.844363212585449, "step": 25 }, { "epoch": 0.08953938872148084, "grad_norm": 56.68263958973131, "learning_rate": 7.172413793103448e-07, "logits/chosen": -4.4868011474609375, "logits/rejected": -4.451172828674316, "logps/chosen": -0.3706013262271881, "logps/rejected": -0.44255292415618896, "loss": 4.8804, "rewards/accuracies": 0.75, "rewards/chosen": -3.7060132026672363, "rewards/margins": 0.7195163369178772, "rewards/rejected": -4.425529479980469, "step": 26 }, { "epoch": 0.09298321136461472, "grad_norm": 69.68637190790966, "learning_rate": 7.448275862068965e-07, "logits/chosen": -4.358417510986328, "logits/rejected": -4.435417652130127, "logps/chosen": -0.3981940746307373, "logps/rejected": -0.4577309489250183, "loss": 4.8387, "rewards/accuracies": 0.5625, "rewards/chosen": -3.981940984725952, "rewards/margins": 0.5953686237335205, "rewards/rejected": -4.5773091316223145, "step": 27 }, { "epoch": 0.0964270340077486, "grad_norm": 58.3412204024468, "learning_rate": 7.724137931034483e-07, "logits/chosen": -4.649521350860596, "logits/rejected": -4.8989362716674805, "logps/chosen": -0.33310776948928833, "logps/rejected": -0.37322184443473816, "loss": 4.6174, "rewards/accuracies": 0.625, "rewards/chosen": -3.331077814102173, "rewards/margins": 0.4011409282684326, "rewards/rejected": -3.7322187423706055, "step": 28 }, { "epoch": 0.09987085665088248, "grad_norm": 56.195572626431165, "learning_rate": 8e-07, "logits/chosen": -4.673098087310791, "logits/rejected": -5.0018768310546875, "logps/chosen": -0.4161800742149353, "logps/rejected": -0.3885071277618408, "loss": 4.65, "rewards/accuracies": 0.4375, "rewards/chosen": -4.161801338195801, "rewards/margins": -0.27672961354255676, "rewards/rejected": -3.885071277618408, "step": 29 }, { "epoch": 0.10331467929401636, "grad_norm": 67.80702382842614, "learning_rate": 7.999710236630706e-07, "logits/chosen": -4.643288612365723, "logits/rejected": -4.589477062225342, "logps/chosen": -0.4303164482116699, "logps/rejected": -0.506043016910553, "loss": 4.8244, "rewards/accuracies": 0.625, "rewards/chosen": -4.303164482116699, "rewards/margins": 0.7572658658027649, "rewards/rejected": -5.06043004989624, "step": 30 }, { "epoch": 0.10675850193715024, "grad_norm": 50.50628925100566, "learning_rate": 7.998840988504232e-07, "logits/chosen": -4.767556190490723, "logits/rejected": -4.7690935134887695, "logps/chosen": -0.403850257396698, "logps/rejected": -0.44447407126426697, "loss": 4.7897, "rewards/accuracies": 0.5625, "rewards/chosen": -4.0385026931762695, "rewards/margins": 0.4062381088733673, "rewards/rejected": -4.4447407722473145, "step": 31 }, { "epoch": 0.11020232458028412, "grad_norm": 60.05314418873607, "learning_rate": 7.997392381558708e-07, "logits/chosen": -3.7635271549224854, "logits/rejected": -3.760200262069702, "logps/chosen": -0.5402004718780518, "logps/rejected": -0.5654389262199402, "loss": 4.7483, "rewards/accuracies": 0.5, "rewards/chosen": -5.402004718780518, "rewards/margins": 0.25238436460494995, "rewards/rejected": -5.654389381408691, "step": 32 }, { "epoch": 0.113646147223418, "grad_norm": 50.090026374394135, "learning_rate": 7.99536462567075e-07, "logits/chosen": -5.203555583953857, "logits/rejected": -5.3314290046691895, "logps/chosen": -0.4754854440689087, "logps/rejected": -0.4819332957267761, "loss": 4.8249, "rewards/accuracies": 0.375, "rewards/chosen": -4.754855155944824, "rewards/margins": 0.06447845697402954, "rewards/rejected": -4.819333076477051, "step": 33 }, { "epoch": 0.11708996986655187, "grad_norm": 44.62210441192165, "learning_rate": 7.992758014625048e-07, "logits/chosen": -4.730749607086182, "logits/rejected": -4.70894718170166, "logps/chosen": -0.3653126657009125, "logps/rejected": -0.47924527525901794, "loss": 4.681, "rewards/accuracies": 0.6875, "rewards/chosen": -3.6531267166137695, "rewards/margins": 1.1393256187438965, "rewards/rejected": -4.792451858520508, "step": 34 }, { "epoch": 0.12053379250968575, "grad_norm": 51.36875532303172, "learning_rate": 7.989572926071799e-07, "logits/chosen": -4.721662521362305, "logits/rejected": -4.724156856536865, "logps/chosen": -0.4223301410675049, "logps/rejected": -0.4952259063720703, "loss": 4.5665, "rewards/accuracies": 0.625, "rewards/chosen": -4.223300933837891, "rewards/margins": 0.7289580702781677, "rewards/rejected": -4.952259063720703, "step": 35 }, { "epoch": 0.12397761515281963, "grad_norm": 57.9863349338661, "learning_rate": 7.985809821472e-07, "logits/chosen": -4.691116809844971, "logits/rejected": -4.813366413116455, "logps/chosen": -0.4277626872062683, "logps/rejected": -0.4881032705307007, "loss": 4.6191, "rewards/accuracies": 0.625, "rewards/chosen": -4.277626991271973, "rewards/margins": 0.6034059524536133, "rewards/rejected": -4.881032943725586, "step": 36 }, { "epoch": 0.1274214377959535, "grad_norm": 63.01073545994464, "learning_rate": 7.981469246030587e-07, "logits/chosen": -4.308718204498291, "logits/rejected": -4.413212776184082, "logps/chosen": -0.4789758026599884, "logps/rejected": -0.58740234375, "loss": 4.7889, "rewards/accuracies": 0.6875, "rewards/chosen": -4.789758205413818, "rewards/margins": 1.0842654705047607, "rewards/rejected": -5.8740234375, "step": 37 }, { "epoch": 0.1308652604390874, "grad_norm": 83.60559787534415, "learning_rate": 7.976551828617438e-07, "logits/chosen": -4.922616481781006, "logits/rejected": -4.967951774597168, "logps/chosen": -0.4330342710018158, "logps/rejected": -0.4283411204814911, "loss": 4.7034, "rewards/accuracies": 0.4375, "rewards/chosen": -4.330342769622803, "rewards/margins": -0.0469314381480217, "rewards/rejected": -4.283411026000977, "step": 38 }, { "epoch": 0.13430908308222125, "grad_norm": 57.08486563674846, "learning_rate": 7.971058281676275e-07, "logits/chosen": -5.094006061553955, "logits/rejected": -5.191053867340088, "logps/chosen": -0.4875888228416443, "logps/rejected": -0.6387084722518921, "loss": 4.6644, "rewards/accuracies": 0.75, "rewards/chosen": -4.875887870788574, "rewards/margins": 1.5111969709396362, "rewards/rejected": -6.3870849609375, "step": 39 }, { "epoch": 0.13775290572535515, "grad_norm": 60.77964329495937, "learning_rate": 7.964989401121432e-07, "logits/chosen": -4.993417739868164, "logits/rejected": -4.969631195068359, "logps/chosen": -0.33883655071258545, "logps/rejected": -0.3545013964176178, "loss": 4.5686, "rewards/accuracies": 0.5625, "rewards/chosen": -3.3883657455444336, "rewards/margins": 0.1566484272480011, "rewards/rejected": -3.5450141429901123, "step": 40 }, { "epoch": 0.141196728368489, "grad_norm": 40.340190041907185, "learning_rate": 7.958346066222549e-07, "logits/chosen": -4.525943756103516, "logits/rejected": -4.55746603012085, "logps/chosen": -0.45404478907585144, "logps/rejected": -0.4469107389450073, "loss": 4.5937, "rewards/accuracies": 0.5625, "rewards/chosen": -4.54044771194458, "rewards/margins": -0.07134075462818146, "rewards/rejected": -4.469107151031494, "step": 41 }, { "epoch": 0.1446405510116229, "grad_norm": 57.514362166929665, "learning_rate": 7.951129239477177e-07, "logits/chosen": -5.132482528686523, "logits/rejected": -5.1113786697387695, "logps/chosen": -0.401109516620636, "logps/rejected": -0.44438689947128296, "loss": 4.6176, "rewards/accuracies": 0.4375, "rewards/chosen": -4.01109504699707, "rewards/margins": 0.43277424573898315, "rewards/rejected": -4.443869113922119, "step": 42 }, { "epoch": 0.14808437365475677, "grad_norm": 60.34883135001456, "learning_rate": 7.943339966471333e-07, "logits/chosen": -4.517858982086182, "logits/rejected": -4.421618461608887, "logps/chosen": -0.6845810413360596, "logps/rejected": -0.6314383745193481, "loss": 4.6897, "rewards/accuracies": 0.5, "rewards/chosen": -6.8458099365234375, "rewards/margins": -0.531426191329956, "rewards/rejected": -6.3143839836120605, "step": 43 }, { "epoch": 0.15152819629789066, "grad_norm": 65.95554033423765, "learning_rate": 7.93497937572801e-07, "logits/chosen": -5.128730297088623, "logits/rejected": -5.0609660148620605, "logps/chosen": -0.5347275137901306, "logps/rejected": -0.5926434397697449, "loss": 4.7481, "rewards/accuracies": 0.75, "rewards/chosen": -5.3472747802734375, "rewards/margins": 0.5791594386100769, "rewards/rejected": -5.926434516906738, "step": 44 }, { "epoch": 0.15497201894102453, "grad_norm": 65.36255184426133, "learning_rate": 7.926048678543684e-07, "logits/chosen": -4.324880599975586, "logits/rejected": -4.221179485321045, "logps/chosen": -0.5375354290008545, "logps/rejected": -0.7417870163917542, "loss": 4.4532, "rewards/accuracies": 0.75, "rewards/chosen": -5.375354766845703, "rewards/margins": 2.042515277862549, "rewards/rejected": -7.417870044708252, "step": 45 }, { "epoch": 0.15841584158415842, "grad_norm": 56.19140828662666, "learning_rate": 7.916549168812805e-07, "logits/chosen": -4.412731647491455, "logits/rejected": -4.406851768493652, "logps/chosen": -0.43062710762023926, "logps/rejected": -0.5201414227485657, "loss": 4.4945, "rewards/accuracies": 0.6875, "rewards/chosen": -4.306270599365234, "rewards/margins": 0.8951433897018433, "rewards/rejected": -5.201414585113525, "step": 46 }, { "epoch": 0.16185966422729228, "grad_norm": 61.47712790834795, "learning_rate": 7.906482222840346e-07, "logits/chosen": -3.994800329208374, "logits/rejected": -3.9059207439422607, "logps/chosen": -0.4685822129249573, "logps/rejected": -0.6291300058364868, "loss": 4.5688, "rewards/accuracies": 0.6875, "rewards/chosen": -4.685822010040283, "rewards/margins": 1.6054778099060059, "rewards/rejected": -6.291299819946289, "step": 47 }, { "epoch": 0.16530348687042618, "grad_norm": 71.97293938631553, "learning_rate": 7.8958492991424e-07, "logits/chosen": -4.644060134887695, "logits/rejected": -4.552207946777344, "logps/chosen": -0.49269717931747437, "logps/rejected": -0.48811206221580505, "loss": 4.3686, "rewards/accuracies": 0.5, "rewards/chosen": -4.926971435546875, "rewards/margins": -0.04585088789463043, "rewards/rejected": -4.881120204925537, "step": 48 }, { "epoch": 0.16874730951356004, "grad_norm": 50.91272883112091, "learning_rate": 7.884651938234865e-07, "logits/chosen": -4.6048712730407715, "logits/rejected": -4.637516975402832, "logps/chosen": -0.454245924949646, "logps/rejected": -0.5149778723716736, "loss": 4.4144, "rewards/accuracies": 0.6875, "rewards/chosen": -4.542459011077881, "rewards/margins": 0.6073201894760132, "rewards/rejected": -5.149779796600342, "step": 49 }, { "epoch": 0.17219113215669393, "grad_norm": 70.84729258897266, "learning_rate": 7.872891762410253e-07, "logits/chosen": -4.788956642150879, "logits/rejected": -4.830476760864258, "logps/chosen": -0.5271515846252441, "logps/rejected": -0.5558156967163086, "loss": 4.398, "rewards/accuracies": 0.625, "rewards/chosen": -5.271515846252441, "rewards/margins": 0.2866411805152893, "rewards/rejected": -5.558156967163086, "step": 50 }, { "epoch": 0.1756349547998278, "grad_norm": 72.43110951103814, "learning_rate": 7.860570475502648e-07, "logits/chosen": -4.508288860321045, "logits/rejected": -4.559998035430908, "logps/chosen": -0.4371810257434845, "logps/rejected": -0.5790078639984131, "loss": 4.4228, "rewards/accuracies": 0.625, "rewards/chosen": -4.371809959411621, "rewards/margins": 1.4182684421539307, "rewards/rejected": -5.790079116821289, "step": 51 }, { "epoch": 0.1790787774429617, "grad_norm": 55.410292700087545, "learning_rate": 7.847689862640855e-07, "logits/chosen": -4.518070697784424, "logits/rejected": -4.57796049118042, "logps/chosen": -0.4647026062011719, "logps/rejected": -0.5196883082389832, "loss": 4.7694, "rewards/accuracies": 0.5625, "rewards/chosen": -4.647026062011719, "rewards/margins": 0.5498570203781128, "rewards/rejected": -5.196883201599121, "step": 52 }, { "epoch": 0.18252260008609555, "grad_norm": 69.37009413960385, "learning_rate": 7.834251789989765e-07, "logits/chosen": -4.978256702423096, "logits/rejected": -4.886575698852539, "logps/chosen": -0.5333456993103027, "logps/rejected": -0.7749611139297485, "loss": 4.6385, "rewards/accuracies": 0.5625, "rewards/chosen": -5.333456516265869, "rewards/margins": 2.4161548614501953, "rewards/rejected": -7.7496113777160645, "step": 53 }, { "epoch": 0.18596642272922945, "grad_norm": 72.61257423821304, "learning_rate": 7.820258204479982e-07, "logits/chosen": -4.223357677459717, "logits/rejected": -4.151899337768555, "logps/chosen": -0.5688156485557556, "logps/rejected": -0.6057307124137878, "loss": 4.811, "rewards/accuracies": 0.5, "rewards/chosen": -5.688156604766846, "rewards/margins": 0.3691507577896118, "rewards/rejected": -6.05730676651001, "step": 54 }, { "epoch": 0.1894102453723633, "grad_norm": 56.657706476039074, "learning_rate": 7.805711133525747e-07, "logits/chosen": -4.470883846282959, "logits/rejected": -4.288090705871582, "logps/chosen": -0.6821640729904175, "logps/rejected": -0.6564118266105652, "loss": 4.6001, "rewards/accuracies": 0.625, "rewards/chosen": -6.8216400146484375, "rewards/margins": -0.25752171874046326, "rewards/rejected": -6.564118385314941, "step": 55 }, { "epoch": 0.1928540680154972, "grad_norm": 78.23314078065798, "learning_rate": 7.790612684731209e-07, "logits/chosen": -4.282840728759766, "logits/rejected": -4.223234176635742, "logps/chosen": -0.6843351721763611, "logps/rejected": -0.8569565415382385, "loss": 4.456, "rewards/accuracies": 0.625, "rewards/chosen": -6.843351364135742, "rewards/margins": 1.7262136936187744, "rewards/rejected": -8.569564819335938, "step": 56 }, { "epoch": 0.19629789065863107, "grad_norm": 64.08169300905095, "learning_rate": 7.774965045585064e-07, "logits/chosen": -5.029541015625, "logits/rejected": -5.061357498168945, "logps/chosen": -0.5916852951049805, "logps/rejected": -0.6251527667045593, "loss": 4.3484, "rewards/accuracies": 0.625, "rewards/chosen": -5.916852951049805, "rewards/margins": 0.334674596786499, "rewards/rejected": -6.251528263092041, "step": 57 }, { "epoch": 0.19974171330176496, "grad_norm": 62.02026429799981, "learning_rate": 7.758770483143634e-07, "logits/chosen": -3.820904016494751, "logits/rejected": -3.8994665145874023, "logps/chosen": -0.6398332118988037, "logps/rejected": -0.662560760974884, "loss": 4.3431, "rewards/accuracies": 0.5625, "rewards/chosen": -6.398331642150879, "rewards/margins": 0.22727595269680023, "rewards/rejected": -6.625607967376709, "step": 58 }, { "epoch": 0.20318553594489883, "grad_norm": 57.111582977153155, "learning_rate": 7.742031343702404e-07, "logits/chosen": -4.509333610534668, "logits/rejected": -4.401131629943848, "logps/chosen": -0.5554917454719543, "logps/rejected": -0.6513252854347229, "loss": 4.1657, "rewards/accuracies": 0.8125, "rewards/chosen": -5.554917335510254, "rewards/margins": 0.9583351016044617, "rewards/rejected": -6.5132527351379395, "step": 59 }, { "epoch": 0.20662935858803272, "grad_norm": 68.28922293268536, "learning_rate": 7.724750052456098e-07, "logits/chosen": -4.062650680541992, "logits/rejected": -3.9956672191619873, "logps/chosen": -0.5649631023406982, "logps/rejected": -0.7722354531288147, "loss": 4.3439, "rewards/accuracies": 0.75, "rewards/chosen": -5.649630546569824, "rewards/margins": 2.072723627090454, "rewards/rejected": -7.722353935241699, "step": 60 }, { "epoch": 0.21007318123116658, "grad_norm": 65.73305092854736, "learning_rate": 7.706929113147304e-07, "logits/chosen": -4.709454536437988, "logits/rejected": -4.698660850524902, "logps/chosen": -0.6076084971427917, "logps/rejected": -0.6684498190879822, "loss": 4.2227, "rewards/accuracies": 0.6875, "rewards/chosen": -6.076085090637207, "rewards/margins": 0.6084132790565491, "rewards/rejected": -6.684497833251953, "step": 61 }, { "epoch": 0.21351700387430048, "grad_norm": 67.46265604716064, "learning_rate": 7.688571107703732e-07, "logits/chosen": -3.963956832885742, "logits/rejected": -3.938755512237549, "logps/chosen": -0.5723408460617065, "logps/rejected": -0.5089117288589478, "loss": 4.4227, "rewards/accuracies": 0.4375, "rewards/chosen": -5.7234086990356445, "rewards/margins": -0.6342912316322327, "rewards/rejected": -5.089117527008057, "step": 62 }, { "epoch": 0.21696082651743434, "grad_norm": 60.086542404476894, "learning_rate": 7.669678695864137e-07, "logits/chosen": -4.414982795715332, "logits/rejected": -4.424773693084717, "logps/chosen": -0.7808203101158142, "logps/rejected": -0.9502580761909485, "loss": 4.1876, "rewards/accuracies": 0.625, "rewards/chosen": -7.80820369720459, "rewards/margins": 1.6943775415420532, "rewards/rejected": -9.502581596374512, "step": 63 }, { "epoch": 0.22040464916056823, "grad_norm": 62.22132599200314, "learning_rate": 7.650254614792972e-07, "logits/chosen": -5.100131511688232, "logits/rejected": -4.888442039489746, "logps/chosen": -0.7664632797241211, "logps/rejected": -0.7097909450531006, "loss": 4.0675, "rewards/accuracies": 0.5625, "rewards/chosen": -7.664633274078369, "rewards/margins": -0.5667227506637573, "rewards/rejected": -7.097909927368164, "step": 64 }, { "epoch": 0.2238484718037021, "grad_norm": 64.6980516756494, "learning_rate": 7.630301678683828e-07, "logits/chosen": -4.501206398010254, "logits/rejected": -4.3760600090026855, "logps/chosen": -0.582870602607727, "logps/rejected": -0.7515184283256531, "loss": 3.8879, "rewards/accuracies": 0.6875, "rewards/chosen": -5.82870626449585, "rewards/margins": 1.686477780342102, "rewards/rejected": -7.51518440246582, "step": 65 }, { "epoch": 0.227292294446836, "grad_norm": 58.49975168550397, "learning_rate": 7.6098227783517e-07, "logits/chosen": -4.590901851654053, "logits/rejected": -4.614831447601318, "logps/chosen": -0.6885466575622559, "logps/rejected": -0.6569955945014954, "loss": 4.2328, "rewards/accuracies": 0.4375, "rewards/chosen": -6.885467052459717, "rewards/margins": -0.31551113724708557, "rewards/rejected": -6.569955825805664, "step": 66 }, { "epoch": 0.23073611708996986, "grad_norm": 71.25690755989906, "learning_rate": 7.588820880814168e-07, "logits/chosen": -4.404972553253174, "logits/rejected": -4.322005271911621, "logps/chosen": -0.7880414128303528, "logps/rejected": -0.8913244605064392, "loss": 4.4685, "rewards/accuracies": 0.625, "rewards/chosen": -7.8804144859313965, "rewards/margins": 1.032829999923706, "rewards/rejected": -8.913244247436523, "step": 67 }, { "epoch": 0.23417993973310375, "grad_norm": 74.45643519661347, "learning_rate": 7.567299028861528e-07, "logits/chosen": -5.07747220993042, "logits/rejected": -4.910668849945068, "logps/chosen": -0.8106582760810852, "logps/rejected": -0.8454375863075256, "loss": 4.2067, "rewards/accuracies": 0.8125, "rewards/chosen": -8.106582641601562, "rewards/margins": 0.3477928638458252, "rewards/rejected": -8.454376220703125, "step": 68 }, { "epoch": 0.2376237623762376, "grad_norm": 61.17870132728996, "learning_rate": 7.54526034061595e-07, "logits/chosen": -4.368528842926025, "logits/rejected": -4.182857513427734, "logps/chosen": -0.7671667337417603, "logps/rejected": -0.8781678676605225, "loss": 4.0971, "rewards/accuracies": 0.625, "rewards/chosen": -7.67166805267334, "rewards/margins": 1.110011100769043, "rewards/rejected": -8.781679153442383, "step": 69 }, { "epoch": 0.2410675850193715, "grad_norm": 80.03873573932812, "learning_rate": 7.522708009079711e-07, "logits/chosen": -3.757272720336914, "logits/rejected": -3.6177382469177246, "logps/chosen": -0.7591115832328796, "logps/rejected": -1.0031275749206543, "loss": 4.4004, "rewards/accuracies": 0.6875, "rewards/chosen": -7.591116905212402, "rewards/margins": 2.4401588439941406, "rewards/rejected": -10.031274795532227, "step": 70 }, { "epoch": 0.24451140766250537, "grad_norm": 54.78863193631618, "learning_rate": 7.499645301672599e-07, "logits/chosen": -4.391002655029297, "logits/rejected": -4.642823696136475, "logps/chosen": -0.8277568817138672, "logps/rejected": -0.8874188661575317, "loss": 4.0832, "rewards/accuracies": 0.6875, "rewards/chosen": -8.277568817138672, "rewards/margins": 0.5966211557388306, "rewards/rejected": -8.874189376831055, "step": 71 }, { "epoch": 0.24795523030563926, "grad_norm": 68.33693738907787, "learning_rate": 7.476075559758513e-07, "logits/chosen": -4.277254581451416, "logits/rejected": -4.10576057434082, "logps/chosen": -0.6340219378471375, "logps/rejected": -0.8107688426971436, "loss": 4.3541, "rewards/accuracies": 0.6875, "rewards/chosen": -6.340219974517822, "rewards/margins": 1.767467975616455, "rewards/rejected": -8.107687950134277, "step": 72 }, { "epoch": 0.2513990529487731, "grad_norm": 56.379646600357916, "learning_rate": 7.452002198161371e-07, "logits/chosen": -4.682867050170898, "logits/rejected": -4.608969211578369, "logps/chosen": -0.7252380847930908, "logps/rejected": -0.8175498247146606, "loss": 3.8474, "rewards/accuracies": 0.5625, "rewards/chosen": -7.25238037109375, "rewards/margins": 0.9231181144714355, "rewards/rejected": -8.175498962402344, "step": 73 }, { "epoch": 0.254842875591907, "grad_norm": 100.90328367426899, "learning_rate": 7.427428704670356e-07, "logits/chosen": -4.861872673034668, "logits/rejected": -4.656722545623779, "logps/chosen": -0.7617427706718445, "logps/rejected": -0.9613173007965088, "loss": 4.4928, "rewards/accuracies": 0.625, "rewards/chosen": -7.617427825927734, "rewards/margins": 1.9957445859909058, "rewards/rejected": -9.61317253112793, "step": 74 }, { "epoch": 0.2582866982350409, "grad_norm": 65.6279612427127, "learning_rate": 7.402358639534602e-07, "logits/chosen": -5.1001877784729, "logits/rejected": -5.059464454650879, "logps/chosen": -0.6768380403518677, "logps/rejected": -0.8699934482574463, "loss": 4.1233, "rewards/accuracies": 0.625, "rewards/chosen": -6.768380641937256, "rewards/margins": 1.931553840637207, "rewards/rejected": -8.699934005737305, "step": 75 }, { "epoch": 0.2617305208781748, "grad_norm": 69.40821628799613, "learning_rate": 7.376795634947379e-07, "logits/chosen": -4.4171576499938965, "logits/rejected": -4.2465434074401855, "logps/chosen": -0.7788955569267273, "logps/rejected": -0.8167555332183838, "loss": 4.309, "rewards/accuracies": 0.75, "rewards/chosen": -7.788956165313721, "rewards/margins": 0.37859874963760376, "rewards/rejected": -8.16755485534668, "step": 76 }, { "epoch": 0.26517434352130864, "grad_norm": 69.16350786587172, "learning_rate": 7.350743394519858e-07, "logits/chosen": -4.930624485015869, "logits/rejected": -4.70862340927124, "logps/chosen": -0.8845140933990479, "logps/rejected": -0.9442533850669861, "loss": 4.1944, "rewards/accuracies": 0.625, "rewards/chosen": -8.845142364501953, "rewards/margins": 0.5973912477493286, "rewards/rejected": -9.442534446716309, "step": 77 }, { "epoch": 0.2686181661644425, "grad_norm": 67.06499369198696, "learning_rate": 7.324205692744521e-07, "logits/chosen": -5.08651065826416, "logits/rejected": -5.048566818237305, "logps/chosen": -0.672334611415863, "logps/rejected": -0.7581319808959961, "loss": 4.2669, "rewards/accuracies": 0.4375, "rewards/chosen": -6.723345756530762, "rewards/margins": 0.8579738140106201, "rewards/rejected": -7.581319808959961, "step": 78 }, { "epoch": 0.2720619888075764, "grad_norm": 85.80640569218886, "learning_rate": 7.297186374448307e-07, "logits/chosen": -5.137825012207031, "logits/rejected": -5.172469139099121, "logps/chosen": -0.9155557155609131, "logps/rejected": -1.0751527547836304, "loss": 4.1234, "rewards/accuracies": 0.75, "rewards/chosen": -9.155557632446289, "rewards/margins": 1.5959699153900146, "rewards/rejected": -10.751527786254883, "step": 79 }, { "epoch": 0.2755058114507103, "grad_norm": 63.74795493043287, "learning_rate": 7.269689354235567e-07, "logits/chosen": -5.289166450500488, "logits/rejected": -4.827259540557861, "logps/chosen": -0.7461143136024475, "logps/rejected": -1.00174081325531, "loss": 3.6397, "rewards/accuracies": 0.8125, "rewards/chosen": -7.461143493652344, "rewards/margins": 2.5562655925750732, "rewards/rejected": -10.01740837097168, "step": 80 }, { "epoch": 0.27894963409384416, "grad_norm": 63.4036066323074, "learning_rate": 7.241718615920916e-07, "logits/chosen": -5.0095415115356445, "logits/rejected": -4.8333353996276855, "logps/chosen": -0.8599931597709656, "logps/rejected": -1.064732551574707, "loss": 3.9659, "rewards/accuracies": 0.8125, "rewards/chosen": -8.599931716918945, "rewards/margins": 2.0473945140838623, "rewards/rejected": -10.647326469421387, "step": 81 }, { "epoch": 0.282393456736978, "grad_norm": 62.58097683888751, "learning_rate": 7.213278211952038e-07, "logits/chosen": -4.466184139251709, "logits/rejected": -4.107361793518066, "logps/chosen": -0.7377562522888184, "logps/rejected": -0.9077808856964111, "loss": 3.8192, "rewards/accuracies": 0.8125, "rewards/chosen": -7.377562046051025, "rewards/margins": 1.7002463340759277, "rewards/rejected": -9.07780933380127, "step": 82 }, { "epoch": 0.28583727938011194, "grad_norm": 71.46678574585364, "learning_rate": 7.184372262822574e-07, "logits/chosen": -4.615472793579102, "logits/rejected": -4.519737243652344, "logps/chosen": -0.8602911233901978, "logps/rejected": -0.9075096845626831, "loss": 4.0224, "rewards/accuracies": 0.5625, "rewards/chosen": -8.602910995483398, "rewards/margins": 0.47218504548072815, "rewards/rejected": -9.07509708404541, "step": 83 }, { "epoch": 0.2892811020232458, "grad_norm": 76.77633086094144, "learning_rate": 7.155004956475131e-07, "logits/chosen": -5.291561126708984, "logits/rejected": -4.816816329956055, "logps/chosen": -0.7795137166976929, "logps/rejected": -0.9014157056808472, "loss": 3.9316, "rewards/accuracies": 0.8125, "rewards/chosen": -7.79513692855835, "rewards/margins": 1.2190203666687012, "rewards/rejected": -9.01415729522705, "step": 84 }, { "epoch": 0.29272492466637967, "grad_norm": 64.07924704882033, "learning_rate": 7.125180547694526e-07, "logits/chosen": -5.0156683921813965, "logits/rejected": -4.72418737411499, "logps/chosen": -0.8232897520065308, "logps/rejected": -1.23880136013031, "loss": 3.6287, "rewards/accuracies": 0.6875, "rewards/chosen": -8.23289680480957, "rewards/margins": 4.155117034912109, "rewards/rejected": -12.388014793395996, "step": 85 }, { "epoch": 0.29616874730951354, "grad_norm": 60.650149825273516, "learning_rate": 7.094903357491345e-07, "logits/chosen": -4.864440441131592, "logits/rejected": -4.457652568817139, "logps/chosen": -0.8692309260368347, "logps/rejected": -1.0946143865585327, "loss": 3.9978, "rewards/accuracies": 0.8125, "rewards/chosen": -8.69230842590332, "rewards/margins": 2.2538340091705322, "rewards/rejected": -10.94614315032959, "step": 86 }, { "epoch": 0.29961256995264746, "grad_norm": 57.554296714602465, "learning_rate": 7.064177772475911e-07, "logits/chosen": -5.011836528778076, "logits/rejected": -5.0129570960998535, "logps/chosen": -0.9953622221946716, "logps/rejected": -1.1730215549468994, "loss": 3.945, "rewards/accuracies": 0.625, "rewards/chosen": -9.953622817993164, "rewards/margins": 1.7765934467315674, "rewards/rejected": -11.730216026306152, "step": 87 }, { "epoch": 0.3030563925957813, "grad_norm": 76.32625861331243, "learning_rate": 7.033008244222745e-07, "logits/chosen": -5.204478740692139, "logits/rejected": -4.811039924621582, "logps/chosen": -1.0616154670715332, "logps/rejected": -1.0835515260696411, "loss": 3.803, "rewards/accuracies": 0.6875, "rewards/chosen": -10.616154670715332, "rewards/margins": 0.2193598747253418, "rewards/rejected": -10.835514068603516, "step": 88 }, { "epoch": 0.3065002152389152, "grad_norm": 74.9886645079608, "learning_rate": 7.001399288625609e-07, "logits/chosen": -5.231860637664795, "logits/rejected": -4.674942970275879, "logps/chosen": -0.9697386026382446, "logps/rejected": -1.13163423538208, "loss": 3.7676, "rewards/accuracies": 0.6875, "rewards/chosen": -9.697385787963867, "rewards/margins": 1.6189574003219604, "rewards/rejected": -11.316343307495117, "step": 89 }, { "epoch": 0.30994403788204905, "grad_norm": 103.40467284986448, "learning_rate": 6.969355485243239e-07, "logits/chosen": -5.283835411071777, "logits/rejected": -5.210239410400391, "logps/chosen": -0.9744136929512024, "logps/rejected": -1.04610276222229, "loss": 4.0352, "rewards/accuracies": 0.4375, "rewards/chosen": -9.744136810302734, "rewards/margins": 0.7168899774551392, "rewards/rejected": -10.461027145385742, "step": 90 }, { "epoch": 0.31338786052518297, "grad_norm": 75.93291524225512, "learning_rate": 6.936881476635852e-07, "logits/chosen": -6.081892013549805, "logits/rejected": -5.807435989379883, "logps/chosen": -1.129875898361206, "logps/rejected": -1.387671709060669, "loss": 4.1486, "rewards/accuracies": 0.6875, "rewards/chosen": -11.298759460449219, "rewards/margins": 2.5779573917388916, "rewards/rejected": -13.876716613769531, "step": 91 }, { "epoch": 0.31683168316831684, "grad_norm": 81.2673779492731, "learning_rate": 6.903981967692524e-07, "logits/chosen": -5.27292013168335, "logits/rejected": -4.817174911499023, "logps/chosen": -0.9680742621421814, "logps/rejected": -1.3815770149230957, "loss": 3.5383, "rewards/accuracies": 0.875, "rewards/chosen": -9.680743217468262, "rewards/margins": 4.135027885437012, "rewards/rejected": -13.815771102905273, "step": 92 }, { "epoch": 0.3202755058114507, "grad_norm": 74.43824451243964, "learning_rate": 6.870661724949532e-07, "logits/chosen": -5.829610824584961, "logits/rejected": -5.776001453399658, "logps/chosen": -0.9559181928634644, "logps/rejected": -1.1338019371032715, "loss": 3.772, "rewards/accuracies": 0.6875, "rewards/chosen": -9.559182167053223, "rewards/margins": 1.778836965560913, "rewards/rejected": -11.338018417358398, "step": 93 }, { "epoch": 0.32371932845458457, "grad_norm": 80.85750752674423, "learning_rate": 6.836925575899777e-07, "logits/chosen": -5.458807468414307, "logits/rejected": -5.102845668792725, "logps/chosen": -1.3218635320663452, "logps/rejected": -1.4797770977020264, "loss": 3.6527, "rewards/accuracies": 0.75, "rewards/chosen": -13.218635559082031, "rewards/margins": 1.5791367292404175, "rewards/rejected": -14.797771453857422, "step": 94 }, { "epoch": 0.3271631510977185, "grad_norm": 91.67035563126446, "learning_rate": 6.802778408293369e-07, "logits/chosen": -6.600034713745117, "logits/rejected": -5.972718238830566, "logps/chosen": -1.1001228094100952, "logps/rejected": -1.3483161926269531, "loss": 3.2161, "rewards/accuracies": 0.8125, "rewards/chosen": -11.001227378845215, "rewards/margins": 2.4819343090057373, "rewards/rejected": -13.483161926269531, "step": 95 }, { "epoch": 0.33060697374085235, "grad_norm": 82.93501550536419, "learning_rate": 6.768225169429477e-07, "logits/chosen": -5.710722923278809, "logits/rejected": -5.354726314544678, "logps/chosen": -1.1976306438446045, "logps/rejected": -1.6553398370742798, "loss": 3.474, "rewards/accuracies": 0.75, "rewards/chosen": -11.976305961608887, "rewards/margins": 4.577092170715332, "rewards/rejected": -16.55339813232422, "step": 96 }, { "epoch": 0.3340507963839862, "grad_norm": 83.33902780022994, "learning_rate": 6.733270865439557e-07, "logits/chosen": -6.448300361633301, "logits/rejected": -6.226202487945557, "logps/chosen": -1.595934510231018, "logps/rejected": -1.6069419384002686, "loss": 3.8929, "rewards/accuracies": 0.5625, "rewards/chosen": -15.959344863891602, "rewards/margins": 0.11007285118103027, "rewards/rejected": -16.069419860839844, "step": 97 }, { "epoch": 0.3374946190271201, "grad_norm": 117.67802338372276, "learning_rate": 6.697920560562055e-07, "logits/chosen": -6.556612968444824, "logits/rejected": -6.181111812591553, "logps/chosen": -1.4487080574035645, "logps/rejected": -1.8549811840057373, "loss": 3.6244, "rewards/accuracies": 0.6875, "rewards/chosen": -14.487081527709961, "rewards/margins": 4.062728404998779, "rewards/rejected": -18.549808502197266, "step": 98 }, { "epoch": 0.340938441670254, "grad_norm": 93.0037276638684, "learning_rate": 6.662179376408698e-07, "logits/chosen": -7.180575370788574, "logits/rejected": -6.442221641540527, "logps/chosen": -1.184888243675232, "logps/rejected": -1.4313077926635742, "loss": 2.8886, "rewards/accuracies": 0.5625, "rewards/chosen": -11.848883628845215, "rewards/margins": 2.4641964435577393, "rewards/rejected": -14.313077926635742, "step": 99 }, { "epoch": 0.34438226431338786, "grad_norm": 103.70431976296841, "learning_rate": 6.626052491222453e-07, "logits/chosen": -7.366156101226807, "logits/rejected": -6.646521091461182, "logps/chosen": -1.50858736038208, "logps/rejected": -1.6617248058319092, "loss": 3.8069, "rewards/accuracies": 0.6875, "rewards/chosen": -15.0858736038208, "rewards/margins": 1.5313715934753418, "rewards/rejected": -16.617244720458984, "step": 100 }, { "epoch": 0.34782608695652173, "grad_norm": 97.56464703755402, "learning_rate": 6.589545139127311e-07, "logits/chosen": -6.810091972351074, "logits/rejected": -6.6775031089782715, "logps/chosen": -1.1999897956848145, "logps/rejected": -1.4419368505477905, "loss": 3.0443, "rewards/accuracies": 0.625, "rewards/chosen": -11.999898910522461, "rewards/margins": 2.4194700717926025, "rewards/rejected": -14.419368743896484, "step": 101 }, { "epoch": 0.3512699095996556, "grad_norm": 109.40818246650933, "learning_rate": 6.552662609369942e-07, "logits/chosen": -9.70158576965332, "logits/rejected": -9.41241455078125, "logps/chosen": -1.6020368337631226, "logps/rejected": -1.8466899394989014, "loss": 3.8531, "rewards/accuracies": 0.8125, "rewards/chosen": -16.020368576049805, "rewards/margins": 2.446530818939209, "rewards/rejected": -18.466899871826172, "step": 102 }, { "epoch": 0.3547137322427895, "grad_norm": 120.68703068267112, "learning_rate": 6.515410245553393e-07, "logits/chosen": -9.626636505126953, "logits/rejected": -8.800621032714844, "logps/chosen": -1.5177563428878784, "logps/rejected": -2.0786399841308594, "loss": 3.3957, "rewards/accuracies": 0.875, "rewards/chosen": -15.177563667297363, "rewards/margins": 5.608834266662598, "rewards/rejected": -20.78639793395996, "step": 103 }, { "epoch": 0.3581575548859234, "grad_norm": 115.8693954281069, "learning_rate": 6.477793444862892e-07, "logits/chosen": -8.715924263000488, "logits/rejected": -8.530646324157715, "logps/chosen": -1.4800488948822021, "logps/rejected": -1.7589524984359741, "loss": 3.1903, "rewards/accuracies": 0.75, "rewards/chosen": -14.800487518310547, "rewards/margins": 2.7890357971191406, "rewards/rejected": -17.589523315429688, "step": 104 }, { "epoch": 0.36160137752905724, "grad_norm": 152.94221871551687, "learning_rate": 6.439817657283891e-07, "logits/chosen": -9.968289375305176, "logits/rejected": -9.650674819946289, "logps/chosen": -1.2602545022964478, "logps/rejected": -1.6873594522476196, "loss": 3.4884, "rewards/accuracies": 0.75, "rewards/chosen": -12.602544784545898, "rewards/margins": 4.271048545837402, "rewards/rejected": -16.873594284057617, "step": 105 }, { "epoch": 0.3650452001721911, "grad_norm": 105.58797890414657, "learning_rate": 6.401488384812473e-07, "logits/chosen": -9.584343910217285, "logits/rejected": -9.589265823364258, "logps/chosen": -1.5995938777923584, "logps/rejected": -1.774956226348877, "loss": 3.6783, "rewards/accuracies": 0.75, "rewards/chosen": -15.995938301086426, "rewards/margins": 1.7536234855651855, "rewards/rejected": -17.749563217163086, "step": 106 }, { "epoch": 0.36848902281532503, "grad_norm": 136.93293579796645, "learning_rate": 6.362811180658203e-07, "logits/chosen": -10.062201499938965, "logits/rejected": -9.910536766052246, "logps/chosen": -1.5935949087142944, "logps/rejected": -1.9505418539047241, "loss": 3.243, "rewards/accuracies": 0.75, "rewards/chosen": -15.93595027923584, "rewards/margins": 3.5694689750671387, "rewards/rejected": -19.505420684814453, "step": 107 }, { "epoch": 0.3719328454584589, "grad_norm": 116.29861634629408, "learning_rate": 6.323791648439579e-07, "logits/chosen": -9.214845657348633, "logits/rejected": -8.844350814819336, "logps/chosen": -1.5258371829986572, "logps/rejected": -1.9718682765960693, "loss": 3.2289, "rewards/accuracies": 0.8125, "rewards/chosen": -15.258371353149414, "rewards/margins": 4.460310935974121, "rewards/rejected": -19.71868324279785, "step": 108 }, { "epoch": 0.37537666810159276, "grad_norm": 155.48156040186888, "learning_rate": 6.284435441372161e-07, "logits/chosen": -11.504440307617188, "logits/rejected": -10.832094192504883, "logps/chosen": -1.9326715469360352, "logps/rejected": -2.6018829345703125, "loss": 3.0293, "rewards/accuracies": 0.75, "rewards/chosen": -19.32671546936035, "rewards/margins": 6.692113876342773, "rewards/rejected": -26.018831253051758, "step": 109 }, { "epoch": 0.3788204907447266, "grad_norm": 136.87645547171363, "learning_rate": 6.244748261449529e-07, "logits/chosen": -11.773118019104004, "logits/rejected": -11.45300006866455, "logps/chosen": -1.7673592567443848, "logps/rejected": -2.0853307247161865, "loss": 2.9492, "rewards/accuracies": 0.875, "rewards/chosen": -17.67359161376953, "rewards/margins": 3.179716110229492, "rewards/rejected": -20.853307723999023, "step": 110 }, { "epoch": 0.38226431338786054, "grad_norm": 118.1199403678312, "learning_rate": 6.204735858617171e-07, "logits/chosen": -11.518077850341797, "logits/rejected": -10.808693885803223, "logps/chosen": -1.9174708127975464, "logps/rejected": -2.1425564289093018, "loss": 3.0723, "rewards/accuracies": 0.8125, "rewards/chosen": -19.174705505371094, "rewards/margins": 2.250857353210449, "rewards/rejected": -21.425565719604492, "step": 111 }, { "epoch": 0.3857081360309944, "grad_norm": 119.80139407969132, "learning_rate": 6.164404029939416e-07, "logits/chosen": -11.800997734069824, "logits/rejected": -11.651061058044434, "logps/chosen": -1.7692383527755737, "logps/rejected": -2.131488800048828, "loss": 3.1498, "rewards/accuracies": 0.8125, "rewards/chosen": -17.692384719848633, "rewards/margins": 3.6225037574768066, "rewards/rejected": -21.31488800048828, "step": 112 }, { "epoch": 0.3891519586741283, "grad_norm": 163.89733948610942, "learning_rate": 6.123758618759547e-07, "logits/chosen": -11.592788696289062, "logits/rejected": -11.94422435760498, "logps/chosen": -2.020418405532837, "logps/rejected": -2.4360499382019043, "loss": 3.0169, "rewards/accuracies": 0.75, "rewards/chosen": -20.204185485839844, "rewards/margins": 4.156314849853516, "rewards/rejected": -24.36050033569336, "step": 113 }, { "epoch": 0.39259578131726214, "grad_norm": 143.8629447599346, "learning_rate": 6.082805513853209e-07, "logits/chosen": -12.300226211547852, "logits/rejected": -11.13952350616455, "logps/chosen": -1.6513843536376953, "logps/rejected": -2.2150726318359375, "loss": 2.7393, "rewards/accuracies": 0.875, "rewards/chosen": -16.513843536376953, "rewards/margins": 5.636881351470947, "rewards/rejected": -22.150726318359375, "step": 114 }, { "epoch": 0.39603960396039606, "grad_norm": 173.63128250872595, "learning_rate": 6.041550648575234e-07, "logits/chosen": -11.796028137207031, "logits/rejected": -11.548486709594727, "logps/chosen": -2.350860595703125, "logps/rejected": -2.701869010925293, "loss": 3.1346, "rewards/accuracies": 0.625, "rewards/chosen": -23.50860595703125, "rewards/margins": 3.510082721710205, "rewards/rejected": -27.018688201904297, "step": 115 }, { "epoch": 0.3994834266035299, "grad_norm": 138.3119351547436, "learning_rate": 6e-07, "logits/chosen": -12.655649185180664, "logits/rejected": -12.318216323852539, "logps/chosen": -1.7546292543411255, "logps/rejected": -2.545380115509033, "loss": 3.5043, "rewards/accuracies": 0.8125, "rewards/chosen": -17.54629135131836, "rewards/margins": 7.907507419586182, "rewards/rejected": -25.453800201416016, "step": 116 }, { "epoch": 0.4029272492466638, "grad_norm": 145.98056778857588, "learning_rate": 5.958159588055472e-07, "logits/chosen": -13.69811725616455, "logits/rejected": -13.62729549407959, "logps/chosen": -1.6991206407546997, "logps/rejected": -2.0454368591308594, "loss": 3.1443, "rewards/accuracies": 0.875, "rewards/chosen": -16.9912052154541, "rewards/margins": 3.4631614685058594, "rewards/rejected": -20.454364776611328, "step": 117 }, { "epoch": 0.40637107188979765, "grad_norm": 169.50788721509392, "learning_rate": 5.916035474651021e-07, "logits/chosen": -13.291184425354004, "logits/rejected": -13.162979125976562, "logps/chosen": -1.7755848169326782, "logps/rejected": -2.445830821990967, "loss": 3.0084, "rewards/accuracies": 0.875, "rewards/chosen": -17.755847930908203, "rewards/margins": 6.702462196350098, "rewards/rejected": -24.458311080932617, "step": 118 }, { "epoch": 0.4098148945329316, "grad_norm": 119.7129501335904, "learning_rate": 5.87363376279916e-07, "logits/chosen": -12.720142364501953, "logits/rejected": -12.485799789428711, "logps/chosen": -1.9099136590957642, "logps/rejected": -2.9828622341156006, "loss": 2.6101, "rewards/accuracies": 0.8125, "rewards/chosen": -19.099138259887695, "rewards/margins": 10.729487419128418, "rewards/rejected": -29.828622817993164, "step": 119 }, { "epoch": 0.41325871717606544, "grad_norm": 175.87119643186315, "learning_rate": 5.830960595731334e-07, "logits/chosen": -11.896202087402344, "logits/rejected": -12.151188850402832, "logps/chosen": -1.8780018091201782, "logps/rejected": -2.5524849891662598, "loss": 2.5416, "rewards/accuracies": 0.75, "rewards/chosen": -18.780019760131836, "rewards/margins": 6.744830131530762, "rewards/rejected": -25.52484893798828, "step": 120 }, { "epoch": 0.4167025398191993, "grad_norm": 182.67604950430695, "learning_rate": 5.788022156007876e-07, "logits/chosen": -13.834617614746094, "logits/rejected": -13.90433406829834, "logps/chosen": -2.329464912414551, "logps/rejected": -2.853301763534546, "loss": 3.5812, "rewards/accuracies": 0.75, "rewards/chosen": -23.294649124145508, "rewards/margins": 5.238368034362793, "rewards/rejected": -28.533016204833984, "step": 121 }, { "epoch": 0.42014636246233317, "grad_norm": 162.94046075177278, "learning_rate": 5.744824664622269e-07, "logits/chosen": -13.56065559387207, "logits/rejected": -13.065262794494629, "logps/chosen": -2.3431150913238525, "logps/rejected": -2.9342470169067383, "loss": 2.9716, "rewards/accuracies": 0.8125, "rewards/chosen": -23.431150436401367, "rewards/margins": 5.911318778991699, "rewards/rejected": -29.342470169067383, "step": 122 }, { "epoch": 0.4235901851054671, "grad_norm": 140.20048424042622, "learning_rate": 5.70137438009984e-07, "logits/chosen": -14.668344497680664, "logits/rejected": -13.541962623596191, "logps/chosen": -2.4308154582977295, "logps/rejected": -2.9639768600463867, "loss": 3.1536, "rewards/accuracies": 0.6875, "rewards/chosen": -24.308155059814453, "rewards/margins": 5.331615447998047, "rewards/rejected": -29.639768600463867, "step": 123 }, { "epoch": 0.42703400774860095, "grad_norm": 271.24268423314203, "learning_rate": 5.657677597591007e-07, "logits/chosen": -14.41106128692627, "logits/rejected": -14.61843204498291, "logps/chosen": -2.38899564743042, "logps/rejected": -2.7550418376922607, "loss": 3.5358, "rewards/accuracies": 0.625, "rewards/chosen": -23.889955520629883, "rewards/margins": 3.660465717315674, "rewards/rejected": -27.550418853759766, "step": 124 }, { "epoch": 0.4304778303917348, "grad_norm": 165.17472401407244, "learning_rate": 5.613740647959235e-07, "logits/chosen": -12.676807403564453, "logits/rejected": -12.471404075622559, "logps/chosen": -1.8505709171295166, "logps/rejected": -2.3470511436462402, "loss": 2.8249, "rewards/accuracies": 0.875, "rewards/chosen": -18.505708694458008, "rewards/margins": 4.964802265167236, "rewards/rejected": -23.47051239013672, "step": 125 }, { "epoch": 0.4339216530348687, "grad_norm": 143.69505743500713, "learning_rate": 5.569569896863801e-07, "logits/chosen": -13.985774993896484, "logits/rejected": -13.654581069946289, "logps/chosen": -1.782606601715088, "logps/rejected": -2.051421642303467, "loss": 3.256, "rewards/accuracies": 0.8125, "rewards/chosen": -17.826065063476562, "rewards/margins": 2.688152551651001, "rewards/rejected": -20.514219284057617, "step": 126 }, { "epoch": 0.4373654756780026, "grad_norm": 198.98779014050083, "learning_rate": 5.52517174383754e-07, "logits/chosen": -13.829938888549805, "logits/rejected": -13.86230182647705, "logps/chosen": -2.4392776489257812, "logps/rejected": -3.190002679824829, "loss": 2.8292, "rewards/accuracies": 0.8125, "rewards/chosen": -24.39277458190918, "rewards/margins": 7.50724983215332, "rewards/rejected": -31.900026321411133, "step": 127 }, { "epoch": 0.44080929832113647, "grad_norm": 189.97902844787896, "learning_rate": 5.480552621359659e-07, "logits/chosen": -14.226242065429688, "logits/rejected": -14.341365814208984, "logps/chosen": -2.10856294631958, "logps/rejected": -2.4918906688690186, "loss": 3.2012, "rewards/accuracies": 0.625, "rewards/chosen": -21.085630416870117, "rewards/margins": 3.8332767486572266, "rewards/rejected": -24.918907165527344, "step": 128 }, { "epoch": 0.44425312096427033, "grad_norm": 136.19908022128857, "learning_rate": 5.435718993923784e-07, "logits/chosen": -13.451090812683105, "logits/rejected": -12.785599708557129, "logps/chosen": -1.6525287628173828, "logps/rejected": -2.3770031929016113, "loss": 2.4949, "rewards/accuracies": 0.875, "rewards/chosen": -16.52528953552246, "rewards/margins": 7.244744300842285, "rewards/rejected": -23.77003288269043, "step": 129 }, { "epoch": 0.4476969436074042, "grad_norm": 135.82081198678648, "learning_rate": 5.39067735710139e-07, "logits/chosen": -14.511775016784668, "logits/rejected": -13.7813138961792, "logps/chosen": -2.127079963684082, "logps/rejected": -2.742191791534424, "loss": 2.8888, "rewards/accuracies": 0.5625, "rewards/chosen": -21.27079963684082, "rewards/margins": 6.151117324829102, "rewards/rejected": -27.42191505432129, "step": 130 }, { "epoch": 0.4511407662505381, "grad_norm": 183.19436612857393, "learning_rate": 5.3454342366007e-07, "logits/chosen": -14.230147361755371, "logits/rejected": -13.983600616455078, "logps/chosen": -2.1134510040283203, "logps/rejected": -2.4505257606506348, "loss": 3.9961, "rewards/accuracies": 0.75, "rewards/chosen": -21.134510040283203, "rewards/margins": 3.3707499504089355, "rewards/rejected": -24.505260467529297, "step": 131 }, { "epoch": 0.454584588893672, "grad_norm": 207.54586277796528, "learning_rate": 5.299996187321231e-07, "logits/chosen": -15.675312042236328, "logits/rejected": -15.437052726745605, "logps/chosen": -1.9110677242279053, "logps/rejected": -2.109048366546631, "loss": 3.5684, "rewards/accuracies": 0.5625, "rewards/chosen": -19.110675811767578, "rewards/margins": 1.9798049926757812, "rewards/rejected": -21.090482711791992, "step": 132 }, { "epoch": 0.45802841153680585, "grad_norm": 126.18602958529125, "learning_rate": 5.254369792404108e-07, "logits/chosen": -14.874656677246094, "logits/rejected": -14.413407325744629, "logps/chosen": -2.353257417678833, "logps/rejected": -3.3936214447021484, "loss": 2.0221, "rewards/accuracies": 1.0, "rewards/chosen": -23.532573699951172, "rewards/margins": 10.403639793395996, "rewards/rejected": -33.936214447021484, "step": 133 }, { "epoch": 0.4614722341799397, "grad_norm": 143.02615992144584, "learning_rate": 5.20856166227829e-07, "logits/chosen": -15.980953216552734, "logits/rejected": -15.589332580566406, "logps/chosen": -2.683424472808838, "logps/rejected": -3.0904102325439453, "loss": 2.7952, "rewards/accuracies": 0.75, "rewards/chosen": -26.834243774414062, "rewards/margins": 4.069858551025391, "rewards/rejected": -30.904102325439453, "step": 134 }, { "epoch": 0.46491605682307363, "grad_norm": 160.67163719835557, "learning_rate": 5.162578433702844e-07, "logits/chosen": -15.208805084228516, "logits/rejected": -15.462963104248047, "logps/chosen": -1.8320481777191162, "logps/rejected": -2.23215651512146, "loss": 2.9135, "rewards/accuracies": 0.75, "rewards/chosen": -18.320480346679688, "rewards/margins": 4.001082420349121, "rewards/rejected": -22.321565628051758, "step": 135 }, { "epoch": 0.4683598794662075, "grad_norm": 145.99038790062517, "learning_rate": 5.116426768805387e-07, "logits/chosen": -14.624232292175293, "logits/rejected": -14.728387832641602, "logps/chosen": -2.1977291107177734, "logps/rejected": -2.507412910461426, "loss": 3.0741, "rewards/accuracies": 0.8125, "rewards/chosen": -21.977291107177734, "rewards/margins": 3.096836566925049, "rewards/rejected": -25.07413101196289, "step": 136 }, { "epoch": 0.47180370210934136, "grad_norm": 148.12763051461735, "learning_rate": 5.070113354116884e-07, "logits/chosen": -15.4700927734375, "logits/rejected": -15.196715354919434, "logps/chosen": -1.5759204626083374, "logps/rejected": -2.449571132659912, "loss": 2.46, "rewards/accuracies": 0.9375, "rewards/chosen": -15.759203910827637, "rewards/margins": 8.736505508422852, "rewards/rejected": -24.495710372924805, "step": 137 }, { "epoch": 0.4752475247524752, "grad_norm": 170.6076112934297, "learning_rate": 5.023644899602871e-07, "logits/chosen": -15.85372257232666, "logits/rejected": -15.770110130310059, "logps/chosen": -2.3490283489227295, "logps/rejected": -2.6556615829467773, "loss": 2.5731, "rewards/accuracies": 0.625, "rewards/chosen": -23.490280151367188, "rewards/margins": 3.066333293914795, "rewards/rejected": -26.556615829467773, "step": 138 }, { "epoch": 0.47869134739560915, "grad_norm": 160.9701483874417, "learning_rate": 4.977028137691324e-07, "logits/chosen": -14.690975189208984, "logits/rejected": -14.133597373962402, "logps/chosen": -2.249864101409912, "logps/rejected": -2.9582080841064453, "loss": 2.6498, "rewards/accuracies": 0.9375, "rewards/chosen": -22.498640060424805, "rewards/margins": 7.083439826965332, "rewards/rejected": -29.582080841064453, "step": 139 }, { "epoch": 0.482135170038743, "grad_norm": 140.63079154740615, "learning_rate": 4.930269822297241e-07, "logits/chosen": -15.633464813232422, "logits/rejected": -15.003985404968262, "logps/chosen": -1.8792824745178223, "logps/rejected": -2.4007444381713867, "loss": 3.0883, "rewards/accuracies": 0.8125, "rewards/chosen": -18.79282569885254, "rewards/margins": 5.214618682861328, "rewards/rejected": -24.0074462890625, "step": 140 }, { "epoch": 0.4855789926818769, "grad_norm": 164.90245104113825, "learning_rate": 4.883376727844129e-07, "logits/chosen": -17.06644058227539, "logits/rejected": -16.71479034423828, "logps/chosen": -1.984204888343811, "logps/rejected": -2.5203452110290527, "loss": 3.2745, "rewards/accuracies": 0.6875, "rewards/chosen": -19.842050552368164, "rewards/margins": 5.361400127410889, "rewards/rejected": -25.203449249267578, "step": 141 }, { "epoch": 0.48902281532501074, "grad_norm": 158.4727771377469, "learning_rate": 4.836355648282509e-07, "logits/chosen": -15.427898406982422, "logits/rejected": -15.480656623840332, "logps/chosen": -1.6626648902893066, "logps/rejected": -2.4167871475219727, "loss": 2.6222, "rewards/accuracies": 0.875, "rewards/chosen": -16.62664794921875, "rewards/margins": 7.54122257232666, "rewards/rejected": -24.16787338256836, "step": 142 }, { "epoch": 0.49246663796814466, "grad_norm": 164.75096025896175, "learning_rate": 4.7892133961056e-07, "logits/chosen": -17.340797424316406, "logits/rejected": -16.83738899230957, "logps/chosen": -2.716911554336548, "logps/rejected": -3.8089609146118164, "loss": 3.2104, "rewards/accuracies": 0.9375, "rewards/chosen": -27.16911506652832, "rewards/margins": 10.920495986938477, "rewards/rejected": -38.08961486816406, "step": 143 }, { "epoch": 0.4959104606112785, "grad_norm": 165.51865689779507, "learning_rate": 4.7419568013623185e-07, "logits/chosen": -17.844758987426758, "logits/rejected": -17.37071418762207, "logps/chosen": -2.3916749954223633, "logps/rejected": -2.984015703201294, "loss": 3.5064, "rewards/accuracies": 0.75, "rewards/chosen": -23.916751861572266, "rewards/margins": 5.923404693603516, "rewards/rejected": -29.84015655517578, "step": 144 }, { "epoch": 0.4993542832544124, "grad_norm": 170.81988641182852, "learning_rate": 4.694592710667722e-07, "logits/chosen": -16.56422996520996, "logits/rejected": -16.64710235595703, "logps/chosen": -1.9375088214874268, "logps/rejected": -2.7873284816741943, "loss": 2.599, "rewards/accuracies": 0.875, "rewards/chosen": -19.375089645385742, "rewards/margins": 8.498197555541992, "rewards/rejected": -27.873287200927734, "step": 145 }, { "epoch": 0.5027981058975463, "grad_norm": 152.83376641387204, "learning_rate": 4.6471279862110594e-07, "logits/chosen": -16.366130828857422, "logits/rejected": -16.218612670898438, "logps/chosen": -2.085256814956665, "logps/rejected": -2.532078742980957, "loss": 2.5765, "rewards/accuracies": 0.75, "rewards/chosen": -20.852569580078125, "rewards/margins": 4.468219757080078, "rewards/rejected": -25.320789337158203, "step": 146 }, { "epoch": 0.5062419285406802, "grad_norm": 167.03745018894716, "learning_rate": 4.5995695047615724e-07, "logits/chosen": -16.575876235961914, "logits/rejected": -16.29088020324707, "logps/chosen": -1.7308956384658813, "logps/rejected": -2.0768227577209473, "loss": 3.2235, "rewards/accuracies": 0.75, "rewards/chosen": -17.308956146240234, "rewards/margins": 3.459270477294922, "rewards/rejected": -20.768226623535156, "step": 147 }, { "epoch": 0.509685751183814, "grad_norm": 178.30073567921698, "learning_rate": 4.5519241566721724e-07, "logits/chosen": -15.774458885192871, "logits/rejected": -15.673702239990234, "logps/chosen": -2.2769925594329834, "logps/rejected": -2.522907257080078, "loss": 3.6175, "rewards/accuracies": 0.75, "rewards/chosen": -22.769929885864258, "rewards/margins": 2.45914363861084, "rewards/rejected": -25.229076385498047, "step": 148 }, { "epoch": 0.5131295738269479, "grad_norm": 137.31263512738002, "learning_rate": 4.5041988448811574e-07, "logits/chosen": -15.081258773803711, "logits/rejected": -15.273090362548828, "logps/chosen": -1.8598787784576416, "logps/rejected": -2.1195287704467773, "loss": 2.687, "rewards/accuracies": 0.6875, "rewards/chosen": -18.598787307739258, "rewards/margins": 2.596500873565674, "rewards/rejected": -21.195289611816406, "step": 149 }, { "epoch": 0.5165733964700818, "grad_norm": 125.93526672957817, "learning_rate": 4.456400483912099e-07, "logits/chosen": -16.464996337890625, "logits/rejected": -16.59218978881836, "logps/chosen": -2.1750199794769287, "logps/rejected": -2.6353604793548584, "loss": 2.8572, "rewards/accuracies": 0.875, "rewards/chosen": -21.750200271606445, "rewards/margins": 4.603403568267822, "rewards/rejected": -26.35360336303711, "step": 150 }, { "epoch": 0.5200172191132156, "grad_norm": 142.20133174481876, "learning_rate": 4.4085359988720583e-07, "logits/chosen": -15.427270889282227, "logits/rejected": -15.429370880126953, "logps/chosen": -1.9908243417739868, "logps/rejected": -2.447384834289551, "loss": 2.1271, "rewards/accuracies": 0.875, "rewards/chosen": -19.908245086669922, "rewards/margins": 4.565605163574219, "rewards/rejected": -24.473848342895508, "step": 151 }, { "epoch": 0.5234610417563496, "grad_norm": 149.43131783531857, "learning_rate": 4.3606123244482615e-07, "logits/chosen": -16.817100524902344, "logits/rejected": -16.41914176940918, "logps/chosen": -2.2302825450897217, "logps/rejected": -3.1961381435394287, "loss": 2.7684, "rewards/accuracies": 0.8125, "rewards/chosen": -22.302825927734375, "rewards/margins": 9.658554077148438, "rewards/rejected": -31.961380004882812, "step": 152 }, { "epoch": 0.5269048643994835, "grad_norm": 175.08007654534825, "learning_rate": 4.3126364039033934e-07, "logits/chosen": -16.285236358642578, "logits/rejected": -16.360095977783203, "logps/chosen": -1.9334690570831299, "logps/rejected": -2.5686261653900146, "loss": 2.9525, "rewards/accuracies": 0.6875, "rewards/chosen": -19.33469009399414, "rewards/margins": 6.3515706062316895, "rewards/rejected": -25.686262130737305, "step": 153 }, { "epoch": 0.5303486870426173, "grad_norm": 147.77300608746174, "learning_rate": 4.2646151880696466e-07, "logits/chosen": -15.203396797180176, "logits/rejected": -15.251398086547852, "logps/chosen": -2.082613945007324, "logps/rejected": -2.380194664001465, "loss": 3.1857, "rewards/accuracies": 0.8125, "rewards/chosen": -20.826141357421875, "rewards/margins": 2.9758081436157227, "rewards/rejected": -23.80194854736328, "step": 154 }, { "epoch": 0.5337925096857512, "grad_norm": 191.413155487678, "learning_rate": 4.21655563434167e-07, "logits/chosen": -16.2647762298584, "logits/rejected": -16.06024742126465, "logps/chosen": -1.8659639358520508, "logps/rejected": -2.6368792057037354, "loss": 2.8876, "rewards/accuracies": 0.875, "rewards/chosen": -18.659639358520508, "rewards/margins": 7.7091522216796875, "rewards/rejected": -26.368793487548828, "step": 155 }, { "epoch": 0.537236332328885, "grad_norm": 124.88412207047179, "learning_rate": 4.16846470566857e-07, "logits/chosen": -16.624813079833984, "logits/rejected": -16.500511169433594, "logps/chosen": -1.741891622543335, "logps/rejected": -2.405266284942627, "loss": 1.9418, "rewards/accuracies": 0.8125, "rewards/chosen": -17.418916702270508, "rewards/margins": 6.633745193481445, "rewards/rejected": -24.052661895751953, "step": 156 }, { "epoch": 0.5406801549720189, "grad_norm": 141.77331157520445, "learning_rate": 4.120349369545109e-07, "logits/chosen": -15.149438858032227, "logits/rejected": -15.287237167358398, "logps/chosen": -2.173785448074341, "logps/rejected": -3.180941104888916, "loss": 2.7363, "rewards/accuracies": 0.625, "rewards/chosen": -21.73785400390625, "rewards/margins": 10.071558952331543, "rewards/rejected": -31.809410095214844, "step": 157 }, { "epoch": 0.5441239776151529, "grad_norm": 155.44419089941894, "learning_rate": 4.0722165970022414e-07, "logits/chosen": -16.01889419555664, "logits/rejected": -16.09550666809082, "logps/chosen": -2.3958230018615723, "logps/rejected": -2.5509209632873535, "loss": 3.3508, "rewards/accuracies": 0.5, "rewards/chosen": -23.95823097229004, "rewards/margins": 1.5509822368621826, "rewards/rejected": -25.509214401245117, "step": 158 }, { "epoch": 0.5475678002582867, "grad_norm": 129.29743296707403, "learning_rate": 4.024073361597142e-07, "logits/chosen": -17.30500030517578, "logits/rejected": -16.847618103027344, "logps/chosen": -2.4113364219665527, "logps/rejected": -3.3326172828674316, "loss": 2.5569, "rewards/accuracies": 0.8125, "rewards/chosen": -24.113361358642578, "rewards/margins": 9.212811470031738, "rewards/rejected": -33.326175689697266, "step": 159 }, { "epoch": 0.5510116229014206, "grad_norm": 161.70130038708717, "learning_rate": 3.9759266384028583e-07, "logits/chosen": -15.621679306030273, "logits/rejected": -15.098061561584473, "logps/chosen": -2.271921157836914, "logps/rejected": -2.7090096473693848, "loss": 2.7771, "rewards/accuracies": 0.625, "rewards/chosen": -22.71921157836914, "rewards/margins": 4.370884895324707, "rewards/rejected": -27.090097427368164, "step": 160 }, { "epoch": 0.5544554455445545, "grad_norm": 157.66640514865557, "learning_rate": 3.927783402997757e-07, "logits/chosen": -15.658122062683105, "logits/rejected": -15.553414344787598, "logps/chosen": -2.2297635078430176, "logps/rejected": -2.9377989768981934, "loss": 2.6828, "rewards/accuracies": 0.8125, "rewards/chosen": -22.29763412475586, "rewards/margins": 7.080355644226074, "rewards/rejected": -29.37799072265625, "step": 161 }, { "epoch": 0.5578992681876883, "grad_norm": 135.88851128774647, "learning_rate": 3.879650630454892e-07, "logits/chosen": -16.659839630126953, "logits/rejected": -16.298494338989258, "logps/chosen": -2.3507156372070312, "logps/rejected": -2.968364715576172, "loss": 2.8013, "rewards/accuracies": 0.875, "rewards/chosen": -23.50715446472168, "rewards/margins": 6.176491737365723, "rewards/rejected": -29.683645248413086, "step": 162 }, { "epoch": 0.5613430908308222, "grad_norm": 113.40296909004411, "learning_rate": 3.83153529433143e-07, "logits/chosen": -14.723609924316406, "logits/rejected": -14.707221984863281, "logps/chosen": -2.070491313934326, "logps/rejected": -2.8369667530059814, "loss": 2.4633, "rewards/accuracies": 0.8125, "rewards/chosen": -20.704910278320312, "rewards/margins": 7.664756774902344, "rewards/rejected": -28.369670867919922, "step": 163 }, { "epoch": 0.564786913473956, "grad_norm": 158.36069390792653, "learning_rate": 3.78344436565833e-07, "logits/chosen": -16.133251190185547, "logits/rejected": -15.6480712890625, "logps/chosen": -2.447453498840332, "logps/rejected": -3.0984373092651367, "loss": 2.7807, "rewards/accuracies": 0.75, "rewards/chosen": -24.47453498840332, "rewards/margins": 6.50984001159668, "rewards/rejected": -30.984373092651367, "step": 164 }, { "epoch": 0.56823073611709, "grad_norm": 144.30140846289407, "learning_rate": 3.7353848119303536e-07, "logits/chosen": -14.615021705627441, "logits/rejected": -14.4873685836792, "logps/chosen": -2.1710031032562256, "logps/rejected": -3.150233030319214, "loss": 2.3923, "rewards/accuracies": 0.9375, "rewards/chosen": -21.710033416748047, "rewards/margins": 9.79229736328125, "rewards/rejected": -31.502328872680664, "step": 165 }, { "epoch": 0.5716745587602239, "grad_norm": 156.39504517806822, "learning_rate": 3.687363596096607e-07, "logits/chosen": -13.180891036987305, "logits/rejected": -13.603325843811035, "logps/chosen": -2.1853692531585693, "logps/rejected": -2.7947943210601807, "loss": 2.5098, "rewards/accuracies": 0.8125, "rewards/chosen": -21.85369300842285, "rewards/margins": 6.09425163269043, "rewards/rejected": -27.94794464111328, "step": 166 }, { "epoch": 0.5751183814033577, "grad_norm": 189.2555316232527, "learning_rate": 3.639387675551739e-07, "logits/chosen": -16.535764694213867, "logits/rejected": -16.17355728149414, "logps/chosen": -2.0116500854492188, "logps/rejected": -2.5565733909606934, "loss": 2.8736, "rewards/accuracies": 0.625, "rewards/chosen": -20.11650276184082, "rewards/margins": 5.449231147766113, "rewards/rejected": -25.56573486328125, "step": 167 }, { "epoch": 0.5785622040464916, "grad_norm": 168.96863729883424, "learning_rate": 3.5914640011279424e-07, "logits/chosen": -17.7143497467041, "logits/rejected": -17.613689422607422, "logps/chosen": -2.375505208969116, "logps/rejected": -3.2843146324157715, "loss": 1.8044, "rewards/accuracies": 0.9375, "rewards/chosen": -23.75505256652832, "rewards/margins": 9.088095664978027, "rewards/rejected": -32.84314727783203, "step": 168 }, { "epoch": 0.5820060266896255, "grad_norm": 154.96333991867238, "learning_rate": 3.543599516087901e-07, "logits/chosen": -16.464033126831055, "logits/rejected": -16.16067886352539, "logps/chosen": -2.4366371631622314, "logps/rejected": -2.9191558361053467, "loss": 2.8393, "rewards/accuracies": 0.75, "rewards/chosen": -24.366371154785156, "rewards/margins": 4.825188636779785, "rewards/rejected": -29.191558837890625, "step": 169 }, { "epoch": 0.5854498493327593, "grad_norm": 133.1585711778442, "learning_rate": 3.495801155118843e-07, "logits/chosen": -17.312694549560547, "logits/rejected": -17.00650978088379, "logps/chosen": -2.205974817276001, "logps/rejected": -2.8471574783325195, "loss": 2.1623, "rewards/accuracies": 1.0, "rewards/chosen": -22.059749603271484, "rewards/margins": 6.411825656890869, "rewards/rejected": -28.471574783325195, "step": 170 }, { "epoch": 0.5888936719758933, "grad_norm": 161.54792608348458, "learning_rate": 3.448075843327827e-07, "logits/chosen": -16.923572540283203, "logits/rejected": -16.892681121826172, "logps/chosen": -2.1601150035858154, "logps/rejected": -2.8166677951812744, "loss": 2.3029, "rewards/accuracies": 0.75, "rewards/chosen": -21.601150512695312, "rewards/margins": 6.56552791595459, "rewards/rejected": -28.16668128967285, "step": 171 }, { "epoch": 0.5923374946190271, "grad_norm": 147.52184027924474, "learning_rate": 3.4004304952384283e-07, "logits/chosen": -17.819061279296875, "logits/rejected": -17.36820411682129, "logps/chosen": -2.7700634002685547, "logps/rejected": -3.871890068054199, "loss": 2.3046, "rewards/accuracies": 0.9375, "rewards/chosen": -27.700634002685547, "rewards/margins": 11.018264770507812, "rewards/rejected": -38.71889877319336, "step": 172 }, { "epoch": 0.595781317262161, "grad_norm": 133.0174741810263, "learning_rate": 3.352872013788941e-07, "logits/chosen": -15.862306594848633, "logits/rejected": -15.657508850097656, "logps/chosen": -1.8171809911727905, "logps/rejected": -2.790759325027466, "loss": 1.9991, "rewards/accuracies": 0.875, "rewards/chosen": -18.171810150146484, "rewards/margins": 9.735782623291016, "rewards/rejected": -27.907590866088867, "step": 173 }, { "epoch": 0.5992251399052949, "grad_norm": 140.03974045337986, "learning_rate": 3.3054072893322785e-07, "logits/chosen": -18.625810623168945, "logits/rejected": -18.38481330871582, "logps/chosen": -2.596587896347046, "logps/rejected": -2.94968843460083, "loss": 2.901, "rewards/accuracies": 0.6875, "rewards/chosen": -25.965877532958984, "rewards/margins": 3.5310049057006836, "rewards/rejected": -29.496883392333984, "step": 174 }, { "epoch": 0.6026689625484287, "grad_norm": 183.98457603492201, "learning_rate": 3.258043198637682e-07, "logits/chosen": -14.411111831665039, "logits/rejected": -14.562616348266602, "logps/chosen": -2.5059261322021484, "logps/rejected": -3.6475634574890137, "loss": 2.2702, "rewards/accuracies": 0.875, "rewards/chosen": -25.059261322021484, "rewards/margins": 11.41637134552002, "rewards/rejected": -36.47563171386719, "step": 175 }, { "epoch": 0.6061127851915626, "grad_norm": 213.58061636651394, "learning_rate": 3.2107866038944004e-07, "logits/chosen": -18.129159927368164, "logits/rejected": -18.01100730895996, "logps/chosen": -3.2717809677124023, "logps/rejected": -3.870783805847168, "loss": 2.6386, "rewards/accuracies": 0.875, "rewards/chosen": -32.71781539916992, "rewards/margins": 5.990023612976074, "rewards/rejected": -38.70783615112305, "step": 176 }, { "epoch": 0.6095566078346966, "grad_norm": 179.1745395557081, "learning_rate": 3.163644351717492e-07, "logits/chosen": -17.755756378173828, "logits/rejected": -17.911243438720703, "logps/chosen": -2.3970510959625244, "logps/rejected": -3.1361923217773438, "loss": 2.4945, "rewards/accuracies": 0.75, "rewards/chosen": -23.970510482788086, "rewards/margins": 7.391412734985352, "rewards/rejected": -31.36192512512207, "step": 177 }, { "epoch": 0.6130004304778304, "grad_norm": 167.61011422532053, "learning_rate": 3.1166232721558714e-07, "logits/chosen": -17.961496353149414, "logits/rejected": -18.155773162841797, "logps/chosen": -2.3098530769348145, "logps/rejected": -3.018498182296753, "loss": 2.8513, "rewards/accuracies": 0.75, "rewards/chosen": -23.098527908325195, "rewards/margins": 7.086452484130859, "rewards/rejected": -30.184980392456055, "step": 178 }, { "epoch": 0.6164442531209643, "grad_norm": 166.57075495491398, "learning_rate": 3.069730177702759e-07, "logits/chosen": -16.301759719848633, "logits/rejected": -16.399494171142578, "logps/chosen": -1.7774579524993896, "logps/rejected": -3.1513190269470215, "loss": 2.6985, "rewards/accuracies": 0.875, "rewards/chosen": -17.774578094482422, "rewards/margins": 13.73861312866211, "rewards/rejected": -31.51319122314453, "step": 179 }, { "epoch": 0.6198880757640981, "grad_norm": 176.06415844976797, "learning_rate": 3.022971862308676e-07, "logits/chosen": -18.093582153320312, "logits/rejected": -18.556367874145508, "logps/chosen": -2.9991700649261475, "logps/rejected": -3.220672845840454, "loss": 3.4972, "rewards/accuracies": 0.6875, "rewards/chosen": -29.991701126098633, "rewards/margins": 2.215024709701538, "rewards/rejected": -32.20672607421875, "step": 180 }, { "epoch": 0.623331898407232, "grad_norm": 165.59558701094664, "learning_rate": 2.9763551003971285e-07, "logits/chosen": -17.380640029907227, "logits/rejected": -17.27768325805664, "logps/chosen": -2.5434718132019043, "logps/rejected": -3.3419389724731445, "loss": 2.149, "rewards/accuracies": 0.6875, "rewards/chosen": -25.434715270996094, "rewards/margins": 7.984673976898193, "rewards/rejected": -33.41939163208008, "step": 181 }, { "epoch": 0.6267757210503659, "grad_norm": 202.61030946129074, "learning_rate": 2.929886645883117e-07, "logits/chosen": -18.67276954650879, "logits/rejected": -18.613061904907227, "logps/chosen": -2.4041404724121094, "logps/rejected": -3.169337749481201, "loss": 3.206, "rewards/accuracies": 0.875, "rewards/chosen": -24.041404724121094, "rewards/margins": 7.651971340179443, "rewards/rejected": -31.693378448486328, "step": 182 }, { "epoch": 0.6302195436934998, "grad_norm": 187.65675738898932, "learning_rate": 2.883573231194613e-07, "logits/chosen": -17.532236099243164, "logits/rejected": -17.939956665039062, "logps/chosen": -2.346766233444214, "logps/rejected": -3.403451681137085, "loss": 2.2531, "rewards/accuracies": 0.875, "rewards/chosen": -23.46766471862793, "rewards/margins": 10.566852569580078, "rewards/rejected": -34.034515380859375, "step": 183 }, { "epoch": 0.6336633663366337, "grad_norm": 195.90592847716476, "learning_rate": 2.837421566297156e-07, "logits/chosen": -17.615713119506836, "logits/rejected": -17.56720542907715, "logps/chosen": -2.333320140838623, "logps/rejected": -2.91764760017395, "loss": 3.0132, "rewards/accuracies": 0.625, "rewards/chosen": -23.333202362060547, "rewards/margins": 5.8432722091674805, "rewards/rejected": -29.176475524902344, "step": 184 }, { "epoch": 0.6371071889797676, "grad_norm": 183.34560091532057, "learning_rate": 2.7914383377217083e-07, "logits/chosen": -18.328622817993164, "logits/rejected": -18.28988265991211, "logps/chosen": -2.4505515098571777, "logps/rejected": -3.1104087829589844, "loss": 2.7728, "rewards/accuracies": 0.6875, "rewards/chosen": -24.50551414489746, "rewards/margins": 6.598570823669434, "rewards/rejected": -31.104084014892578, "step": 185 }, { "epoch": 0.6405510116229014, "grad_norm": 143.6928758781207, "learning_rate": 2.745630207595893e-07, "logits/chosen": -17.590606689453125, "logits/rejected": -17.842931747436523, "logps/chosen": -2.3018007278442383, "logps/rejected": -3.050802707672119, "loss": 2.3707, "rewards/accuracies": 0.875, "rewards/chosen": -23.018009185791016, "rewards/margins": 7.490016937255859, "rewards/rejected": -30.508028030395508, "step": 186 }, { "epoch": 0.6439948342660353, "grad_norm": 184.0549446666674, "learning_rate": 2.70000381267877e-07, "logits/chosen": -17.26044273376465, "logits/rejected": -16.797651290893555, "logps/chosen": -2.5893783569335938, "logps/rejected": -3.3172554969787598, "loss": 2.5419, "rewards/accuracies": 0.8125, "rewards/chosen": -25.893783569335938, "rewards/margins": 7.278769016265869, "rewards/rejected": -33.17255401611328, "step": 187 }, { "epoch": 0.6474386569091691, "grad_norm": 162.70714561559674, "learning_rate": 2.654565763399299e-07, "logits/chosen": -17.615966796875, "logits/rejected": -17.202425003051758, "logps/chosen": -2.06203556060791, "logps/rejected": -2.5958409309387207, "loss": 2.3888, "rewards/accuracies": 0.8125, "rewards/chosen": -20.62035369873047, "rewards/margins": 5.338054180145264, "rewards/rejected": -25.958410263061523, "step": 188 }, { "epoch": 0.650882479552303, "grad_norm": 211.4855934809745, "learning_rate": 2.6093226428986103e-07, "logits/chosen": -17.890888214111328, "logits/rejected": -18.181949615478516, "logps/chosen": -2.5792300701141357, "logps/rejected": -2.9869372844696045, "loss": 3.2394, "rewards/accuracies": 0.625, "rewards/chosen": -25.792301177978516, "rewards/margins": 4.077073574066162, "rewards/rejected": -29.869373321533203, "step": 189 }, { "epoch": 0.654326302195437, "grad_norm": 241.71964487552518, "learning_rate": 2.564281006076217e-07, "logits/chosen": -17.281450271606445, "logits/rejected": -17.177658081054688, "logps/chosen": -2.0290184020996094, "logps/rejected": -2.531289577484131, "loss": 2.9749, "rewards/accuracies": 0.625, "rewards/chosen": -20.29018211364746, "rewards/margins": 5.022716999053955, "rewards/rejected": -25.31290054321289, "step": 190 }, { "epoch": 0.6577701248385708, "grad_norm": 140.2272151723766, "learning_rate": 2.519447378640342e-07, "logits/chosen": -18.04941749572754, "logits/rejected": -17.695268630981445, "logps/chosen": -2.407273292541504, "logps/rejected": -3.1553428173065186, "loss": 2.5175, "rewards/accuracies": 0.8125, "rewards/chosen": -24.07273292541504, "rewards/margins": 7.480693340301514, "rewards/rejected": -31.553424835205078, "step": 191 }, { "epoch": 0.6612139474817047, "grad_norm": 122.61240060144814, "learning_rate": 2.4748282561624587e-07, "logits/chosen": -18.940771102905273, "logits/rejected": -19.209871292114258, "logps/chosen": -2.9477639198303223, "logps/rejected": -3.3782527446746826, "loss": 2.231, "rewards/accuracies": 0.8125, "rewards/chosen": -29.47764015197754, "rewards/margins": 4.304885387420654, "rewards/rejected": -33.78252410888672, "step": 192 }, { "epoch": 0.6646577701248386, "grad_norm": 154.6658255017087, "learning_rate": 2.4304301031361993e-07, "logits/chosen": -17.39430046081543, "logits/rejected": -17.464357376098633, "logps/chosen": -1.6274338960647583, "logps/rejected": -2.39015531539917, "loss": 2.0168, "rewards/accuracies": 0.8125, "rewards/chosen": -16.274337768554688, "rewards/margins": 7.627217769622803, "rewards/rejected": -23.901554107666016, "step": 193 }, { "epoch": 0.6681015927679724, "grad_norm": 183.06073957842517, "learning_rate": 2.386259352040766e-07, "logits/chosen": -17.165082931518555, "logits/rejected": -17.03466796875, "logps/chosen": -2.3020198345184326, "logps/rejected": -3.141963481903076, "loss": 2.5949, "rewards/accuracies": 0.9375, "rewards/chosen": -23.020198822021484, "rewards/margins": 8.399435997009277, "rewards/rejected": -31.419633865356445, "step": 194 }, { "epoch": 0.6715454154111064, "grad_norm": 195.7328576465893, "learning_rate": 2.3423224024089924e-07, "logits/chosen": -16.67756462097168, "logits/rejected": -16.06248664855957, "logps/chosen": -1.9835630655288696, "logps/rejected": -2.165980815887451, "loss": 2.8892, "rewards/accuracies": 0.6875, "rewards/chosen": -19.83563232421875, "rewards/margins": 1.8241767883300781, "rewards/rejected": -21.659809112548828, "step": 195 }, { "epoch": 0.6749892380542402, "grad_norm": 185.40100121088884, "learning_rate": 2.2986256199001607e-07, "logits/chosen": -16.720062255859375, "logits/rejected": -17.210546493530273, "logps/chosen": -2.1794826984405518, "logps/rejected": -3.011854887008667, "loss": 3.2752, "rewards/accuracies": 0.9375, "rewards/chosen": -21.79482650756836, "rewards/margins": 8.323722839355469, "rewards/rejected": -30.118549346923828, "step": 196 }, { "epoch": 0.6784330606973741, "grad_norm": 169.01081696151877, "learning_rate": 2.2551753353777298e-07, "logits/chosen": -16.483049392700195, "logits/rejected": -15.803775787353516, "logps/chosen": -1.923852562904358, "logps/rejected": -2.4508516788482666, "loss": 2.4423, "rewards/accuracies": 0.8125, "rewards/chosen": -19.238523483276367, "rewards/margins": 5.269989967346191, "rewards/rejected": -24.508514404296875, "step": 197 }, { "epoch": 0.681876883340508, "grad_norm": 184.28493562958423, "learning_rate": 2.2119778439921243e-07, "logits/chosen": -17.207120895385742, "logits/rejected": -17.228199005126953, "logps/chosen": -2.1767778396606445, "logps/rejected": -2.8498196601867676, "loss": 2.6463, "rewards/accuracies": 0.875, "rewards/chosen": -21.767776489257812, "rewards/margins": 6.730417251586914, "rewards/rejected": -28.49819564819336, "step": 198 }, { "epoch": 0.6853207059836418, "grad_norm": 185.34805729694065, "learning_rate": 2.169039404268666e-07, "logits/chosen": -15.101792335510254, "logits/rejected": -15.015120506286621, "logps/chosen": -1.9498858451843262, "logps/rejected": -2.9745125770568848, "loss": 2.4206, "rewards/accuracies": 0.875, "rewards/chosen": -19.498859405517578, "rewards/margins": 10.246267318725586, "rewards/rejected": -29.745128631591797, "step": 199 }, { "epoch": 0.6887645286267757, "grad_norm": 171.9250040897193, "learning_rate": 2.1263662372008397e-07, "logits/chosen": -17.20707893371582, "logits/rejected": -17.392990112304688, "logps/chosen": -2.209491491317749, "logps/rejected": -3.219839334487915, "loss": 2.3084, "rewards/accuracies": 0.875, "rewards/chosen": -22.094913482666016, "rewards/margins": 10.103475570678711, "rewards/rejected": -32.19839096069336, "step": 200 }, { "epoch": 0.6922083512699096, "grad_norm": 134.1599441897618, "learning_rate": 2.0839645253489785e-07, "logits/chosen": -17.15797233581543, "logits/rejected": -17.089859008789062, "logps/chosen": -2.460911512374878, "logps/rejected": -3.360152244567871, "loss": 2.7262, "rewards/accuracies": 0.875, "rewards/chosen": -24.60911750793457, "rewards/margins": 8.992408752441406, "rewards/rejected": -33.601524353027344, "step": 201 }, { "epoch": 0.6956521739130435, "grad_norm": 225.89869134193762, "learning_rate": 2.0418404119445257e-07, "logits/chosen": -18.76146697998047, "logits/rejected": -18.73739242553711, "logps/chosen": -2.1693127155303955, "logps/rejected": -2.5993404388427734, "loss": 2.7532, "rewards/accuracies": 0.75, "rewards/chosen": -21.693126678466797, "rewards/margins": 4.3002777099609375, "rewards/rejected": -25.993404388427734, "step": 202 }, { "epoch": 0.6990959965561774, "grad_norm": 168.18364451173605, "learning_rate": 2.0000000000000007e-07, "logits/chosen": -16.125919342041016, "logits/rejected": -16.450841903686523, "logps/chosen": -2.2542104721069336, "logps/rejected": -2.954272985458374, "loss": 2.3868, "rewards/accuracies": 0.8125, "rewards/chosen": -22.542104721069336, "rewards/margins": 7.000626087188721, "rewards/rejected": -29.542734146118164, "step": 203 }, { "epoch": 0.7025398191993112, "grad_norm": 169.8912048470285, "learning_rate": 1.9584493514247673e-07, "logits/chosen": -15.31773567199707, "logits/rejected": -15.480231285095215, "logps/chosen": -2.3769216537475586, "logps/rejected": -3.0659618377685547, "loss": 2.829, "rewards/accuracies": 0.75, "rewards/chosen": -23.769216537475586, "rewards/margins": 6.890398979187012, "rewards/rejected": -30.65961456298828, "step": 204 }, { "epoch": 0.7059836418424451, "grad_norm": 164.25344840367887, "learning_rate": 1.91719448614679e-07, "logits/chosen": -18.408157348632812, "logits/rejected": -18.19486427307129, "logps/chosen": -2.329998016357422, "logps/rejected": -2.998572587966919, "loss": 2.6702, "rewards/accuracies": 0.75, "rewards/chosen": -23.29998016357422, "rewards/margins": 6.685744285583496, "rewards/rejected": -29.9857234954834, "step": 205 }, { "epoch": 0.709427464485579, "grad_norm": 160.2297179052625, "learning_rate": 1.8762413812404537e-07, "logits/chosen": -15.564806938171387, "logits/rejected": -15.336395263671875, "logps/chosen": -2.414278507232666, "logps/rejected": -3.099297523498535, "loss": 2.3344, "rewards/accuracies": 0.875, "rewards/chosen": -24.142784118652344, "rewards/margins": 6.850188255310059, "rewards/rejected": -30.992971420288086, "step": 206 }, { "epoch": 0.7128712871287128, "grad_norm": 159.59891745599342, "learning_rate": 1.8355959700605835e-07, "logits/chosen": -16.28492546081543, "logits/rejected": -15.95881462097168, "logps/chosen": -2.8605947494506836, "logps/rejected": -3.867330551147461, "loss": 2.8207, "rewards/accuracies": 0.9375, "rewards/chosen": -28.60594367980957, "rewards/margins": 10.067360877990723, "rewards/rejected": -38.67330551147461, "step": 207 }, { "epoch": 0.7163151097718468, "grad_norm": 138.3874486270341, "learning_rate": 1.7952641413828285e-07, "logits/chosen": -14.021824836730957, "logits/rejected": -14.172553062438965, "logps/chosen": -1.9960724115371704, "logps/rejected": -2.703728199005127, "loss": 2.7886, "rewards/accuracies": 0.8125, "rewards/chosen": -19.960723876953125, "rewards/margins": 7.076559066772461, "rewards/rejected": -27.037281036376953, "step": 208 }, { "epoch": 0.7197589324149807, "grad_norm": 139.88886492945034, "learning_rate": 1.755251738550471e-07, "logits/chosen": -17.956972122192383, "logits/rejected": -17.393238067626953, "logps/chosen": -2.526200294494629, "logps/rejected": -3.3326597213745117, "loss": 2.5612, "rewards/accuracies": 0.8125, "rewards/chosen": -25.262001037597656, "rewards/margins": 8.064594268798828, "rewards/rejected": -33.32659149169922, "step": 209 }, { "epoch": 0.7232027550581145, "grad_norm": 121.87663059806626, "learning_rate": 1.7155645586278396e-07, "logits/chosen": -16.801706314086914, "logits/rejected": -17.29227638244629, "logps/chosen": -2.4827024936676025, "logps/rejected": -3.251868724822998, "loss": 2.3086, "rewards/accuracies": 1.0, "rewards/chosen": -24.827022552490234, "rewards/margins": 7.691664218902588, "rewards/rejected": -32.5186882019043, "step": 210 }, { "epoch": 0.7266465777012484, "grad_norm": 183.5598825145353, "learning_rate": 1.6762083515604205e-07, "logits/chosen": -16.376623153686523, "logits/rejected": -16.853988647460938, "logps/chosen": -2.1823861598968506, "logps/rejected": -2.469951629638672, "loss": 3.0611, "rewards/accuracies": 0.625, "rewards/chosen": -21.82386016845703, "rewards/margins": 2.8756532669067383, "rewards/rejected": -24.69951629638672, "step": 211 }, { "epoch": 0.7300904003443822, "grad_norm": 140.6766155274216, "learning_rate": 1.6371888193417962e-07, "logits/chosen": -16.161523818969727, "logits/rejected": -15.664430618286133, "logps/chosen": -2.5164270401000977, "logps/rejected": -3.5702481269836426, "loss": 1.5852, "rewards/accuracies": 0.8125, "rewards/chosen": -25.16427230834961, "rewards/margins": 10.538207054138184, "rewards/rejected": -35.702476501464844, "step": 212 }, { "epoch": 0.7335342229875161, "grad_norm": 145.77478214483807, "learning_rate": 1.598511615187527e-07, "logits/chosen": -16.284515380859375, "logits/rejected": -15.916769027709961, "logps/chosen": -1.8510792255401611, "logps/rejected": -2.855922222137451, "loss": 2.4349, "rewards/accuracies": 0.8125, "rewards/chosen": -18.51078987121582, "rewards/margins": 10.048429489135742, "rewards/rejected": -28.559221267700195, "step": 213 }, { "epoch": 0.7369780456306501, "grad_norm": 165.3902758912234, "learning_rate": 1.560182342716109e-07, "logits/chosen": -16.97911834716797, "logits/rejected": -17.0489444732666, "logps/chosen": -2.855278253555298, "logps/rejected": -3.1526296138763428, "loss": 2.8624, "rewards/accuracies": 0.75, "rewards/chosen": -28.552780151367188, "rewards/margins": 2.9735140800476074, "rewards/rejected": -31.526294708251953, "step": 214 }, { "epoch": 0.7404218682737839, "grad_norm": 169.1752973988815, "learning_rate": 1.5222065551371078e-07, "logits/chosen": -16.086669921875, "logits/rejected": -15.614689826965332, "logps/chosen": -2.2419023513793945, "logps/rejected": -2.8294296264648438, "loss": 2.1771, "rewards/accuracies": 0.875, "rewards/chosen": -22.41902732849121, "rewards/margins": 5.875270843505859, "rewards/rejected": -28.294296264648438, "step": 215 }, { "epoch": 0.7438656909169178, "grad_norm": 175.5809330835783, "learning_rate": 1.4845897544466062e-07, "logits/chosen": -15.881332397460938, "logits/rejected": -16.038084030151367, "logps/chosen": -1.938501238822937, "logps/rejected": -2.6783177852630615, "loss": 2.5021, "rewards/accuracies": 0.875, "rewards/chosen": -19.385011672973633, "rewards/margins": 7.398165702819824, "rewards/rejected": -26.783178329467773, "step": 216 }, { "epoch": 0.7473095135600517, "grad_norm": 138.7906401630708, "learning_rate": 1.4473373906300576e-07, "logits/chosen": -14.417771339416504, "logits/rejected": -14.370369911193848, "logps/chosen": -1.7395800352096558, "logps/rejected": -2.279273271560669, "loss": 2.329, "rewards/accuracies": 0.875, "rewards/chosen": -17.39579963684082, "rewards/margins": 5.396934986114502, "rewards/rejected": -22.79273223876953, "step": 217 }, { "epoch": 0.7507533362031855, "grad_norm": 177.5157013301922, "learning_rate": 1.4104548608726895e-07, "logits/chosen": -17.694894790649414, "logits/rejected": -17.320873260498047, "logps/chosen": -2.5375585556030273, "logps/rejected": -3.9845151901245117, "loss": 2.2671, "rewards/accuracies": 0.9375, "rewards/chosen": -25.375581741333008, "rewards/margins": 14.46957015991211, "rewards/rejected": -39.84515380859375, "step": 218 }, { "epoch": 0.7541971588463194, "grad_norm": 134.9327740603631, "learning_rate": 1.3739475087775466e-07, "logits/chosen": -15.686095237731934, "logits/rejected": -15.889823913574219, "logps/chosen": -2.468545913696289, "logps/rejected": -2.932441234588623, "loss": 2.5443, "rewards/accuracies": 0.75, "rewards/chosen": -24.685457229614258, "rewards/margins": 4.638955116271973, "rewards/rejected": -29.324413299560547, "step": 219 }, { "epoch": 0.7576409814894532, "grad_norm": 135.47400869262708, "learning_rate": 1.3378206235913028e-07, "logits/chosen": -15.72805404663086, "logits/rejected": -15.899078369140625, "logps/chosen": -1.9772777557373047, "logps/rejected": -2.4077036380767822, "loss": 2.3468, "rewards/accuracies": 0.6875, "rewards/chosen": -19.77277946472168, "rewards/margins": 4.304256916046143, "rewards/rejected": -24.07703399658203, "step": 220 }, { "epoch": 0.7610848041325872, "grad_norm": 162.24671967985978, "learning_rate": 1.3020794394379447e-07, "logits/chosen": -16.417884826660156, "logits/rejected": -15.766003608703613, "logps/chosen": -3.0476391315460205, "logps/rejected": -4.174098491668701, "loss": 2.7984, "rewards/accuracies": 0.9375, "rewards/chosen": -30.476388931274414, "rewards/margins": 11.264592170715332, "rewards/rejected": -41.74098205566406, "step": 221 }, { "epoch": 0.7645286267757211, "grad_norm": 153.11579053938232, "learning_rate": 1.2667291345604433e-07, "logits/chosen": -16.348756790161133, "logits/rejected": -16.798192977905273, "logps/chosen": -2.2396018505096436, "logps/rejected": -2.7667808532714844, "loss": 2.4575, "rewards/accuracies": 0.6875, "rewards/chosen": -22.39601707458496, "rewards/margins": 5.271792411804199, "rewards/rejected": -27.66781234741211, "step": 222 }, { "epoch": 0.7679724494188549, "grad_norm": 125.46348058673289, "learning_rate": 1.2317748305705217e-07, "logits/chosen": -17.994367599487305, "logits/rejected": -18.30419921875, "logps/chosen": -2.42231822013855, "logps/rejected": -2.860621690750122, "loss": 2.578, "rewards/accuracies": 0.75, "rewards/chosen": -24.223180770874023, "rewards/margins": 4.383035659790039, "rewards/rejected": -28.606216430664062, "step": 223 }, { "epoch": 0.7714162720619888, "grad_norm": 130.3454994188522, "learning_rate": 1.1972215917066307e-07, "logits/chosen": -17.247783660888672, "logits/rejected": -16.855878829956055, "logps/chosen": -2.419665813446045, "logps/rejected": -3.496854782104492, "loss": 2.3658, "rewards/accuracies": 0.875, "rewards/chosen": -24.196657180786133, "rewards/margins": 10.771889686584473, "rewards/rejected": -34.968544006347656, "step": 224 }, { "epoch": 0.7748600947051227, "grad_norm": 172.91319963893733, "learning_rate": 1.1630744241002223e-07, "logits/chosen": -17.969970703125, "logits/rejected": -18.13035011291504, "logps/chosen": -2.1760947704315186, "logps/rejected": -2.7468371391296387, "loss": 2.4968, "rewards/accuracies": 0.8125, "rewards/chosen": -21.760944366455078, "rewards/margins": 5.707423210144043, "rewards/rejected": -27.46837043762207, "step": 225 }, { "epoch": 0.7783039173482565, "grad_norm": 113.76103569407015, "learning_rate": 1.1293382750504688e-07, "logits/chosen": -18.275222778320312, "logits/rejected": -17.712696075439453, "logps/chosen": -2.3341901302337646, "logps/rejected": -3.101623773574829, "loss": 2.1953, "rewards/accuracies": 0.875, "rewards/chosen": -23.341899871826172, "rewards/margins": 7.674335479736328, "rewards/rejected": -31.016237258911133, "step": 226 }, { "epoch": 0.7817477399913905, "grad_norm": 132.91068429588344, "learning_rate": 1.0960180323074774e-07, "logits/chosen": -18.44281578063965, "logits/rejected": -18.48580551147461, "logps/chosen": -2.326096296310425, "logps/rejected": -3.35933518409729, "loss": 1.9983, "rewards/accuracies": 0.875, "rewards/chosen": -23.260963439941406, "rewards/margins": 10.332388877868652, "rewards/rejected": -33.593353271484375, "step": 227 }, { "epoch": 0.7851915626345243, "grad_norm": 148.59038179673675, "learning_rate": 1.0631185233641474e-07, "logits/chosen": -18.724830627441406, "logits/rejected": -18.60055923461914, "logps/chosen": -2.2051844596862793, "logps/rejected": -3.1807661056518555, "loss": 2.2086, "rewards/accuracies": 0.8125, "rewards/chosen": -22.05184555053711, "rewards/margins": 9.755819320678711, "rewards/rejected": -31.80766487121582, "step": 228 }, { "epoch": 0.7886353852776582, "grad_norm": 168.3363506588075, "learning_rate": 1.0306445147567604e-07, "logits/chosen": -16.910625457763672, "logits/rejected": -17.089317321777344, "logps/chosen": -2.1940903663635254, "logps/rejected": -2.7529542446136475, "loss": 3.2585, "rewards/accuracies": 0.75, "rewards/chosen": -21.940902709960938, "rewards/margins": 5.588636875152588, "rewards/rejected": -27.529541015625, "step": 229 }, { "epoch": 0.7920792079207921, "grad_norm": 140.0112579178903, "learning_rate": 9.986007113743906e-08, "logits/chosen": -17.158119201660156, "logits/rejected": -17.688983917236328, "logps/chosen": -2.0375654697418213, "logps/rejected": -2.7925596237182617, "loss": 2.1608, "rewards/accuracies": 0.8125, "rewards/chosen": -20.375656127929688, "rewards/margins": 7.549938201904297, "rewards/rejected": -27.925594329833984, "step": 230 }, { "epoch": 0.7955230305639259, "grad_norm": 147.83687034828287, "learning_rate": 9.669917557772542e-08, "logits/chosen": -17.468488693237305, "logits/rejected": -17.5322208404541, "logps/chosen": -2.3444223403930664, "logps/rejected": -2.8957479000091553, "loss": 2.1804, "rewards/accuracies": 0.8125, "rewards/chosen": -23.44422149658203, "rewards/margins": 5.513256549835205, "rewards/rejected": -28.95747947692871, "step": 231 }, { "epoch": 0.7989668532070598, "grad_norm": 182.36869206020535, "learning_rate": 9.358222275240884e-08, "logits/chosen": -17.69605255126953, "logits/rejected": -17.339937210083008, "logps/chosen": -2.6390442848205566, "logps/rejected": -3.474766254425049, "loss": 2.8717, "rewards/accuracies": 0.8125, "rewards/chosen": -26.39044189453125, "rewards/margins": 8.357217788696289, "rewards/rejected": -34.74766159057617, "step": 232 }, { "epoch": 0.8024106758501938, "grad_norm": 253.44665519838688, "learning_rate": 9.050966425086546e-08, "logits/chosen": -17.90776252746582, "logits/rejected": -18.13881492614746, "logps/chosen": -2.3884530067443848, "logps/rejected": -3.7100043296813965, "loss": 3.4066, "rewards/accuracies": 0.875, "rewards/chosen": -23.884531021118164, "rewards/margins": 13.215514183044434, "rewards/rejected": -37.10004425048828, "step": 233 }, { "epoch": 0.8058544984933276, "grad_norm": 133.61246579427012, "learning_rate": 8.748194523054748e-08, "logits/chosen": -17.281970977783203, "logits/rejected": -17.455997467041016, "logps/chosen": -2.43415904045105, "logps/rejected": -3.0494344234466553, "loss": 1.9898, "rewards/accuracies": 0.8125, "rewards/chosen": -24.341590881347656, "rewards/margins": 6.152754783630371, "rewards/rejected": -30.49434471130371, "step": 234 }, { "epoch": 0.8092983211364615, "grad_norm": 147.12423003196992, "learning_rate": 8.449950435248676e-08, "logits/chosen": -17.537288665771484, "logits/rejected": -17.6503963470459, "logps/chosen": -2.411252737045288, "logps/rejected": -2.5560195446014404, "loss": 2.959, "rewards/accuracies": 0.625, "rewards/chosen": -24.11252784729004, "rewards/margins": 1.447667121887207, "rewards/rejected": -25.56019401550293, "step": 235 }, { "epoch": 0.8127421437795953, "grad_norm": 173.4351632725469, "learning_rate": 8.15627737177425e-08, "logits/chosen": -15.300152778625488, "logits/rejected": -14.813128471374512, "logps/chosen": -2.2649199962615967, "logps/rejected": -3.06396222114563, "loss": 3.1114, "rewards/accuracies": 0.8125, "rewards/chosen": -22.649198532104492, "rewards/margins": 7.99042272567749, "rewards/rejected": -30.63962173461914, "step": 236 }, { "epoch": 0.8161859664227292, "grad_norm": 148.7657685969793, "learning_rate": 7.867217880479629e-08, "logits/chosen": -15.767210006713867, "logits/rejected": -15.650933265686035, "logps/chosen": -2.087116003036499, "logps/rejected": -3.3104443550109863, "loss": 2.4406, "rewards/accuracies": 0.8125, "rewards/chosen": -20.87116241455078, "rewards/margins": 12.23327922821045, "rewards/rejected": -33.10444259643555, "step": 237 }, { "epoch": 0.8196297890658631, "grad_norm": 151.07749718687427, "learning_rate": 7.582813840790847e-08, "logits/chosen": -15.618885040283203, "logits/rejected": -15.997750282287598, "logps/chosen": -2.0786385536193848, "logps/rejected": -2.7961952686309814, "loss": 2.7965, "rewards/accuracies": 0.6875, "rewards/chosen": -20.786386489868164, "rewards/margins": 7.175565719604492, "rewards/rejected": -27.96195411682129, "step": 238 }, { "epoch": 0.823073611708997, "grad_norm": 144.2174329571498, "learning_rate": 7.303106457644328e-08, "logits/chosen": -16.712963104248047, "logits/rejected": -16.825284957885742, "logps/chosen": -2.238386392593384, "logps/rejected": -3.5630311965942383, "loss": 2.8129, "rewards/accuracies": 1.0, "rewards/chosen": -22.383865356445312, "rewards/margins": 13.246443748474121, "rewards/rejected": -35.63031005859375, "step": 239 }, { "epoch": 0.8265174343521309, "grad_norm": 169.73215342488882, "learning_rate": 7.028136255516938e-08, "logits/chosen": -18.084627151489258, "logits/rejected": -18.091758728027344, "logps/chosen": -3.0889220237731934, "logps/rejected": -3.6106934547424316, "loss": 2.5796, "rewards/accuracies": 0.875, "rewards/chosen": -30.88922119140625, "rewards/margins": 5.217716693878174, "rewards/rejected": -36.106937408447266, "step": 240 }, { "epoch": 0.8299612569952648, "grad_norm": 149.39660539357982, "learning_rate": 6.75794307255479e-08, "logits/chosen": -17.41098976135254, "logits/rejected": -17.147449493408203, "logps/chosen": -2.440295457839966, "logps/rejected": -3.6117165088653564, "loss": 2.1983, "rewards/accuracies": 0.9375, "rewards/chosen": -24.4029541015625, "rewards/margins": 11.714213371276855, "rewards/rejected": -36.11716842651367, "step": 241 }, { "epoch": 0.8334050796383986, "grad_norm": 154.41052222889476, "learning_rate": 6.492566054801414e-08, "logits/chosen": -17.25594139099121, "logits/rejected": -17.346635818481445, "logps/chosen": -2.8043160438537598, "logps/rejected": -3.589071750640869, "loss": 2.6675, "rewards/accuracies": 0.9375, "rewards/chosen": -28.04315948486328, "rewards/margins": 7.847556114196777, "rewards/rejected": -35.890716552734375, "step": 242 }, { "epoch": 0.8368489022815325, "grad_norm": 144.1864772937289, "learning_rate": 6.232043650526195e-08, "logits/chosen": -19.001102447509766, "logits/rejected": -19.054353713989258, "logps/chosen": -2.5245139598846436, "logps/rejected": -2.9145162105560303, "loss": 2.6573, "rewards/accuracies": 0.75, "rewards/chosen": -25.245140075683594, "rewards/margins": 3.9000210762023926, "rewards/rejected": -29.145160675048828, "step": 243 }, { "epoch": 0.8402927249246663, "grad_norm": 159.15922893834966, "learning_rate": 5.976413604653978e-08, "logits/chosen": -16.62332534790039, "logits/rejected": -17.053430557250977, "logps/chosen": -2.617964267730713, "logps/rejected": -3.058418035507202, "loss": 2.3253, "rewards/accuracies": 0.875, "rewards/chosen": -26.179641723632812, "rewards/margins": 4.404535293579102, "rewards/rejected": -30.58417510986328, "step": 244 }, { "epoch": 0.8437365475678003, "grad_norm": 140.0984307995632, "learning_rate": 5.725712953296438e-08, "logits/chosen": -15.821372985839844, "logits/rejected": -15.742339134216309, "logps/chosen": -1.8620578050613403, "logps/rejected": -2.808138847351074, "loss": 2.4966, "rewards/accuracies": 0.9375, "rewards/chosen": -18.62057876586914, "rewards/margins": 9.460811614990234, "rewards/rejected": -28.081388473510742, "step": 245 }, { "epoch": 0.8471803702109342, "grad_norm": 147.6831357797836, "learning_rate": 5.479978018386275e-08, "logits/chosen": -17.423532485961914, "logits/rejected": -17.499662399291992, "logps/chosen": -2.267676591873169, "logps/rejected": -2.954185962677002, "loss": 2.4517, "rewards/accuracies": 0.8125, "rewards/chosen": -22.676767349243164, "rewards/margins": 6.8650922775268555, "rewards/rejected": -29.541858673095703, "step": 246 }, { "epoch": 0.850624192854068, "grad_norm": 146.93865641824823, "learning_rate": 5.23924440241486e-08, "logits/chosen": -17.467391967773438, "logits/rejected": -17.293798446655273, "logps/chosen": -2.765316963195801, "logps/rejected": -3.1737022399902344, "loss": 3.1905, "rewards/accuracies": 0.6875, "rewards/chosen": -27.65317153930664, "rewards/margins": 4.083853244781494, "rewards/rejected": -31.737022399902344, "step": 247 }, { "epoch": 0.8540680154972019, "grad_norm": 128.25480467364565, "learning_rate": 5.003546983274014e-08, "logits/chosen": -17.457887649536133, "logits/rejected": -17.364229202270508, "logps/chosen": -2.2277069091796875, "logps/rejected": -3.1269993782043457, "loss": 2.1543, "rewards/accuracies": 0.8125, "rewards/chosen": -22.277069091796875, "rewards/margins": 8.992926597595215, "rewards/rejected": -31.26999282836914, "step": 248 }, { "epoch": 0.8575118381403358, "grad_norm": 126.61876835603796, "learning_rate": 4.77291990920289e-08, "logits/chosen": -15.818652153015137, "logits/rejected": -15.359832763671875, "logps/chosen": -1.8357523679733276, "logps/rejected": -2.8696041107177734, "loss": 2.5736, "rewards/accuracies": 0.875, "rewards/chosen": -18.357524871826172, "rewards/margins": 10.33851432800293, "rewards/rejected": -28.69603729248047, "step": 249 }, { "epoch": 0.8609556607834696, "grad_norm": 147.3951096860917, "learning_rate": 4.5473965938405e-08, "logits/chosen": -17.267982482910156, "logits/rejected": -17.239002227783203, "logps/chosen": -2.3899097442626953, "logps/rejected": -3.6304116249084473, "loss": 2.1633, "rewards/accuracies": 0.9375, "rewards/chosen": -23.899097442626953, "rewards/margins": 12.405017852783203, "rewards/rejected": -36.304115295410156, "step": 250 }, { "epoch": 0.8643994834266036, "grad_norm": 131.8214642518433, "learning_rate": 4.32700971138471e-08, "logits/chosen": -16.42135238647461, "logits/rejected": -17.10132598876953, "logps/chosen": -2.0020933151245117, "logps/rejected": -2.6545495986938477, "loss": 2.1802, "rewards/accuracies": 0.75, "rewards/chosen": -20.02093505859375, "rewards/margins": 6.524560928344727, "rewards/rejected": -26.54549789428711, "step": 251 }, { "epoch": 0.8678433060697374, "grad_norm": 129.82185059004897, "learning_rate": 4.11179119185832e-08, "logits/chosen": -15.84195327758789, "logits/rejected": -15.13234806060791, "logps/chosen": -2.1187713146209717, "logps/rejected": -3.074852228164673, "loss": 2.0957, "rewards/accuracies": 0.8125, "rewards/chosen": -21.187713623046875, "rewards/margins": 9.560807228088379, "rewards/rejected": -30.748519897460938, "step": 252 }, { "epoch": 0.8712871287128713, "grad_norm": 147.62415325752085, "learning_rate": 3.9017722164830014e-08, "logits/chosen": -15.684465408325195, "logits/rejected": -15.583179473876953, "logps/chosen": -2.2432422637939453, "logps/rejected": -3.128464698791504, "loss": 2.2196, "rewards/accuracies": 0.8125, "rewards/chosen": -22.432418823242188, "rewards/margins": 8.852226257324219, "rewards/rejected": -31.284645080566406, "step": 253 }, { "epoch": 0.8747309513560052, "grad_norm": 129.59541911753672, "learning_rate": 3.696983213161724e-08, "logits/chosen": -16.10622215270996, "logits/rejected": -15.97018814086914, "logps/chosen": -2.268676519393921, "logps/rejected": -3.3363349437713623, "loss": 1.8693, "rewards/accuracies": 0.9375, "rewards/chosen": -22.686765670776367, "rewards/margins": 10.676584243774414, "rewards/rejected": -33.363346099853516, "step": 254 }, { "epoch": 0.878174773999139, "grad_norm": 137.48943851483648, "learning_rate": 3.4974538520702756e-08, "logits/chosen": -14.73385238647461, "logits/rejected": -14.685803413391113, "logps/chosen": -2.011627197265625, "logps/rejected": -2.8314733505249023, "loss": 2.1826, "rewards/accuracies": 0.875, "rewards/chosen": -20.11627197265625, "rewards/margins": 8.198460578918457, "rewards/rejected": -28.314733505249023, "step": 255 }, { "epoch": 0.8816185966422729, "grad_norm": 159.72968498568136, "learning_rate": 3.303213041358628e-08, "logits/chosen": -16.42586898803711, "logits/rejected": -16.5064697265625, "logps/chosen": -2.3388772010803223, "logps/rejected": -3.104121208190918, "loss": 2.3098, "rewards/accuracies": 0.625, "rewards/chosen": -23.388771057128906, "rewards/margins": 7.652439117431641, "rewards/rejected": -31.041210174560547, "step": 256 }, { "epoch": 0.8850624192854069, "grad_norm": 139.996512233341, "learning_rate": 3.114288922962673e-08, "logits/chosen": -15.805761337280273, "logits/rejected": -15.97050666809082, "logps/chosen": -2.2719080448150635, "logps/rejected": -2.8697736263275146, "loss": 1.9082, "rewards/accuracies": 0.75, "rewards/chosen": -22.719078063964844, "rewards/margins": 5.9786577224731445, "rewards/rejected": -28.697734832763672, "step": 257 }, { "epoch": 0.8885062419285407, "grad_norm": 142.0757665384917, "learning_rate": 2.9307088685269544e-08, "logits/chosen": -16.687719345092773, "logits/rejected": -16.818946838378906, "logps/chosen": -2.10107421875, "logps/rejected": -2.945895195007324, "loss": 2.0949, "rewards/accuracies": 0.8125, "rewards/chosen": -21.010740280151367, "rewards/margins": 8.448209762573242, "rewards/rejected": -29.45895004272461, "step": 258 }, { "epoch": 0.8919500645716746, "grad_norm": 132.30438095070775, "learning_rate": 2.7524994754390206e-08, "logits/chosen": -18.812986373901367, "logits/rejected": -18.67325210571289, "logps/chosen": -2.727328300476074, "logps/rejected": -3.1050448417663574, "loss": 2.0166, "rewards/accuracies": 0.8125, "rewards/chosen": -27.27328109741211, "rewards/margins": 3.7771661281585693, "rewards/rejected": -31.050447463989258, "step": 259 }, { "epoch": 0.8953938872148084, "grad_norm": 146.74697878065496, "learning_rate": 2.5796865629759622e-08, "logits/chosen": -16.785795211791992, "logits/rejected": -16.06755256652832, "logps/chosen": -2.190410852432251, "logps/rejected": -3.289196014404297, "loss": 2.7578, "rewards/accuracies": 0.6875, "rewards/chosen": -21.90410614013672, "rewards/margins": 10.987853050231934, "rewards/rejected": -32.89196014404297, "step": 260 }, { "epoch": 0.8988377098579423, "grad_norm": 206.78249442348837, "learning_rate": 2.4122951685636674e-08, "logits/chosen": -16.686670303344727, "logits/rejected": -16.18621826171875, "logps/chosen": -2.611176013946533, "logps/rejected": -3.921743869781494, "loss": 3.0397, "rewards/accuracies": 1.0, "rewards/chosen": -26.11176109313965, "rewards/margins": 13.10567855834961, "rewards/rejected": -39.217437744140625, "step": 261 }, { "epoch": 0.9022815325010762, "grad_norm": 1949.594124426928, "learning_rate": 2.2503495441493503e-08, "logits/chosen": -16.57187271118164, "logits/rejected": -16.915918350219727, "logps/chosen": -1.6403616666793823, "logps/rejected": -2.541311740875244, "loss": 2.0577, "rewards/accuracies": 0.875, "rewards/chosen": -16.40361785888672, "rewards/margins": 9.009501457214355, "rewards/rejected": -25.41312026977539, "step": 262 }, { "epoch": 0.90572535514421, "grad_norm": 144.45425989954398, "learning_rate": 2.093873152687906e-08, "logits/chosen": -16.985397338867188, "logits/rejected": -16.533916473388672, "logps/chosen": -2.0635147094726562, "logps/rejected": -3.0182905197143555, "loss": 2.2249, "rewards/accuracies": 1.0, "rewards/chosen": -20.635149002075195, "rewards/margins": 9.547759056091309, "rewards/rejected": -30.18290901184082, "step": 263 }, { "epoch": 0.909169177787344, "grad_norm": 161.70566070216898, "learning_rate": 1.9428886647425214e-08, "logits/chosen": -18.097816467285156, "logits/rejected": -17.79208755493164, "logps/chosen": -2.152082681655884, "logps/rejected": -2.9253194332122803, "loss": 2.632, "rewards/accuracies": 0.75, "rewards/chosen": -21.52082633972168, "rewards/margins": 7.732368469238281, "rewards/rejected": -29.253196716308594, "step": 264 }, { "epoch": 0.9126130004304779, "grad_norm": 129.93371206599926, "learning_rate": 1.7974179552001866e-08, "logits/chosen": -17.505081176757812, "logits/rejected": -17.78946304321289, "logps/chosen": -2.234717845916748, "logps/rejected": -2.539454936981201, "loss": 2.4342, "rewards/accuracies": 0.6875, "rewards/chosen": -22.347179412841797, "rewards/margins": 3.0473670959472656, "rewards/rejected": -25.394546508789062, "step": 265 }, { "epoch": 0.9160568230736117, "grad_norm": 151.94990436818313, "learning_rate": 1.6574821001023474e-08, "logits/chosen": -18.333892822265625, "logits/rejected": -18.197607040405273, "logps/chosen": -1.9889158010482788, "logps/rejected": -2.735637903213501, "loss": 2.2676, "rewards/accuracies": 0.9375, "rewards/chosen": -19.889158248901367, "rewards/margins": 7.467221260070801, "rewards/rejected": -27.356380462646484, "step": 266 }, { "epoch": 0.9195006457167456, "grad_norm": 157.25320445860555, "learning_rate": 1.5231013735914444e-08, "logits/chosen": -16.340839385986328, "logits/rejected": -16.46707534790039, "logps/chosen": -2.3339834213256836, "logps/rejected": -2.750584840774536, "loss": 2.4662, "rewards/accuracies": 0.5625, "rewards/chosen": -23.339832305908203, "rewards/margins": 4.166016101837158, "rewards/rejected": -27.50585174560547, "step": 267 }, { "epoch": 0.9229444683598794, "grad_norm": 153.71273684620283, "learning_rate": 1.3942952449735201e-08, "logits/chosen": -17.994409561157227, "logits/rejected": -17.976512908935547, "logps/chosen": -2.3722710609436035, "logps/rejected": -3.688662528991699, "loss": 2.6692, "rewards/accuracies": 0.875, "rewards/chosen": -23.722707748413086, "rewards/margins": 13.163920402526855, "rewards/rejected": -36.886627197265625, "step": 268 }, { "epoch": 0.9263882910030133, "grad_norm": 159.9085920533978, "learning_rate": 1.2710823758974676e-08, "logits/chosen": -17.915321350097656, "logits/rejected": -17.735774993896484, "logps/chosen": -2.341430425643921, "logps/rejected": -3.303116798400879, "loss": 2.5158, "rewards/accuracies": 0.8125, "rewards/chosen": -23.414304733276367, "rewards/margins": 9.616860389709473, "rewards/rejected": -33.031166076660156, "step": 269 }, { "epoch": 0.9298321136461473, "grad_norm": 173.1432244469156, "learning_rate": 1.1534806176513434e-08, "logits/chosen": -17.020999908447266, "logits/rejected": -16.834182739257812, "logps/chosen": -3.0662012100219727, "logps/rejected": -4.367783546447754, "loss": 2.8725, "rewards/accuracies": 0.75, "rewards/chosen": -30.66200828552246, "rewards/margins": 13.01583194732666, "rewards/rejected": -43.67784118652344, "step": 270 }, { "epoch": 0.9332759362892811, "grad_norm": 175.82982939834184, "learning_rate": 1.0415070085759925e-08, "logits/chosen": -18.364770889282227, "logits/rejected": -17.562824249267578, "logps/chosen": -1.9744817018508911, "logps/rejected": -2.404961347579956, "loss": 2.7178, "rewards/accuracies": 0.9375, "rewards/chosen": -19.744815826416016, "rewards/margins": 4.304797172546387, "rewards/rejected": -24.049612045288086, "step": 271 }, { "epoch": 0.936719758932415, "grad_norm": 138.65454205055815, "learning_rate": 9.351777715965337e-09, "logits/chosen": -18.179439544677734, "logits/rejected": -18.3562068939209, "logps/chosen": -2.5122263431549072, "logps/rejected": -2.9324097633361816, "loss": 2.0112, "rewards/accuracies": 0.625, "rewards/chosen": -25.122264862060547, "rewards/margins": 4.2018327713012695, "rewards/rejected": -29.3240966796875, "step": 272 }, { "epoch": 0.9401635815755489, "grad_norm": 166.37792305846952, "learning_rate": 8.345083118719509e-09, "logits/chosen": -17.025907516479492, "logits/rejected": -16.50684928894043, "logps/chosen": -2.3154404163360596, "logps/rejected": -3.7841498851776123, "loss": 2.4035, "rewards/accuracies": 0.875, "rewards/chosen": -23.154401779174805, "rewards/margins": 14.68709659576416, "rewards/rejected": -37.84150314331055, "step": 273 }, { "epoch": 0.9436074042186827, "grad_norm": 156.2441873918835, "learning_rate": 7.395132145631544e-09, "logits/chosen": -16.066808700561523, "logits/rejected": -15.90978717803955, "logps/chosen": -1.8371270895004272, "logps/rejected": -2.678525924682617, "loss": 2.0809, "rewards/accuracies": 0.875, "rewards/chosen": -18.37127113342285, "rewards/margins": 8.413987159729004, "rewards/rejected": -26.785259246826172, "step": 274 }, { "epoch": 0.9470512268618166, "grad_norm": 153.24700496665506, "learning_rate": 6.502062427198929e-09, "logits/chosen": -17.553876876831055, "logits/rejected": -17.5835018157959, "logps/chosen": -2.4539918899536133, "logps/rejected": -3.212017774581909, "loss": 2.8158, "rewards/accuracies": 0.875, "rewards/chosen": -24.539915084838867, "rewards/margins": 7.580263137817383, "rewards/rejected": -32.12017822265625, "step": 275 }, { "epoch": 0.9504950495049505, "grad_norm": 143.41309705270993, "learning_rate": 5.666003352866733e-09, "logits/chosen": -16.96306610107422, "logits/rejected": -17.010374069213867, "logps/chosen": -2.3547351360321045, "logps/rejected": -3.560419797897339, "loss": 2.1713, "rewards/accuracies": 1.0, "rewards/chosen": -23.547353744506836, "rewards/margins": 12.056845664978027, "rewards/rejected": -35.60419845581055, "step": 276 }, { "epoch": 0.9539388721480844, "grad_norm": 127.28128452614312, "learning_rate": 4.887076052282291e-09, "logits/chosen": -16.996347427368164, "logits/rejected": -17.045413970947266, "logps/chosen": -2.568338394165039, "logps/rejected": -3.7449233531951904, "loss": 2.2934, "rewards/accuracies": 0.75, "rewards/chosen": -25.68338394165039, "rewards/margins": 11.765849113464355, "rewards/rejected": -37.44923400878906, "step": 277 }, { "epoch": 0.9573826947912183, "grad_norm": 157.32940289604164, "learning_rate": 4.165393377745108e-09, "logits/chosen": -15.926298141479492, "logits/rejected": -15.70862102508545, "logps/chosen": -2.419119358062744, "logps/rejected": -3.12341046333313, "loss": 2.6577, "rewards/accuracies": 0.8125, "rewards/chosen": -24.191190719604492, "rewards/margins": 7.042915344238281, "rewards/rejected": -31.234106063842773, "step": 278 }, { "epoch": 0.9608265174343521, "grad_norm": 166.13315912995816, "learning_rate": 3.5010598878567387e-09, "logits/chosen": -17.470443725585938, "logits/rejected": -17.031620025634766, "logps/chosen": -2.174730062484741, "logps/rejected": -2.8888866901397705, "loss": 2.6471, "rewards/accuracies": 0.875, "rewards/chosen": -21.747299194335938, "rewards/margins": 7.1415696144104, "rewards/rejected": -28.888866424560547, "step": 279 }, { "epoch": 0.964270340077486, "grad_norm": 147.30485634603158, "learning_rate": 2.8941718323724605e-09, "logits/chosen": -18.49142837524414, "logits/rejected": -18.490829467773438, "logps/chosen": -2.174673080444336, "logps/rejected": -2.49884295463562, "loss": 2.4879, "rewards/accuracies": 0.625, "rewards/chosen": -21.746734619140625, "rewards/margins": 3.241696357727051, "rewards/rejected": -24.98843002319336, "step": 280 }, { "epoch": 0.9677141627206199, "grad_norm": 164.2306340847377, "learning_rate": 2.344817138256161e-09, "logits/chosen": -16.280027389526367, "logits/rejected": -17.136314392089844, "logps/chosen": -2.1512930393218994, "logps/rejected": -3.025418519973755, "loss": 2.543, "rewards/accuracies": 0.875, "rewards/chosen": -21.512928009033203, "rewards/margins": 8.741255760192871, "rewards/rejected": -30.254182815551758, "step": 281 }, { "epoch": 0.9711579853637538, "grad_norm": 150.11832572590862, "learning_rate": 1.8530753969413282e-09, "logits/chosen": -18.478666305541992, "logits/rejected": -18.377622604370117, "logps/chosen": -1.8686823844909668, "logps/rejected": -2.416599750518799, "loss": 2.3333, "rewards/accuracies": 0.8125, "rewards/chosen": -18.686824798583984, "rewards/margins": 5.479173183441162, "rewards/rejected": -24.165996551513672, "step": 282 }, { "epoch": 0.9746018080068877, "grad_norm": 148.19843248369733, "learning_rate": 1.4190178527999198e-09, "logits/chosen": -18.65854263305664, "logits/rejected": -18.510950088500977, "logps/chosen": -2.684150457382202, "logps/rejected": -3.3109562397003174, "loss": 2.5508, "rewards/accuracies": 0.75, "rewards/chosen": -26.84150505065918, "rewards/margins": 6.268057823181152, "rewards/rejected": -33.109561920166016, "step": 283 }, { "epoch": 0.9780456306500215, "grad_norm": 145.3408171333848, "learning_rate": 1.0427073928200857e-09, "logits/chosen": -16.74791717529297, "logits/rejected": -17.11324119567871, "logps/chosen": -1.9323251247406006, "logps/rejected": -2.5761024951934814, "loss": 2.6734, "rewards/accuracies": 0.8125, "rewards/chosen": -19.32324981689453, "rewards/margins": 6.437774658203125, "rewards/rejected": -25.761024475097656, "step": 284 }, { "epoch": 0.9814894532931554, "grad_norm": 157.29447155335754, "learning_rate": 7.241985374952797e-10, "logits/chosen": -15.43560791015625, "logits/rejected": -16.016206741333008, "logps/chosen": -2.186183452606201, "logps/rejected": -2.557718515396118, "loss": 2.6831, "rewards/accuracies": 0.75, "rewards/chosen": -21.86183738708496, "rewards/margins": 3.7153472900390625, "rewards/rejected": -25.57718276977539, "step": 285 }, { "epoch": 0.9849332759362893, "grad_norm": 169.85671811117746, "learning_rate": 4.6353743292497637e-10, "logits/chosen": -17.268774032592773, "logits/rejected": -17.37803077697754, "logps/chosen": -2.315028429031372, "logps/rejected": -2.8562231063842773, "loss": 3.1469, "rewards/accuracies": 0.75, "rewards/chosen": -23.150283813476562, "rewards/margins": 5.411944389343262, "rewards/rejected": -28.56222915649414, "step": 286 }, { "epoch": 0.9883770985794231, "grad_norm": 134.22079815084547, "learning_rate": 2.607618441292203e-10, "logits/chosen": -17.780881881713867, "logits/rejected": -17.691925048828125, "logps/chosen": -2.1940627098083496, "logps/rejected": -2.694575786590576, "loss": 2.1033, "rewards/accuracies": 0.75, "rewards/chosen": -21.940628051757812, "rewards/margins": 5.005130290985107, "rewards/rejected": -26.945756912231445, "step": 287 }, { "epoch": 0.991820921222557, "grad_norm": 159.97180537715076, "learning_rate": 1.1590114957682473e-10, "logits/chosen": -18.85245704650879, "logits/rejected": -18.812564849853516, "logps/chosen": -2.036210536956787, "logps/rejected": -2.694965362548828, "loss": 2.2158, "rewards/accuracies": 0.8125, "rewards/chosen": -20.362106323242188, "rewards/margins": 6.58754825592041, "rewards/rejected": -26.94965362548828, "step": 288 }, { "epoch": 0.995264743865691, "grad_norm": 170.03207418013815, "learning_rate": 2.8976336929353863e-11, "logits/chosen": -17.584726333618164, "logits/rejected": -17.696880340576172, "logps/chosen": -2.0683584213256836, "logps/rejected": -2.683166980743408, "loss": 2.4093, "rewards/accuracies": 0.75, "rewards/chosen": -20.683582305908203, "rewards/margins": 6.1480865478515625, "rewards/rejected": -26.8316707611084, "step": 289 }, { "epoch": 0.9987085665088248, "grad_norm": 122.7613085104077, "learning_rate": 0.0, "logits/chosen": -18.123817443847656, "logits/rejected": -18.210723876953125, "logps/chosen": -2.4845879077911377, "logps/rejected": -3.084573268890381, "loss": 2.7978, "rewards/accuracies": 0.8125, "rewards/chosen": -24.84588050842285, "rewards/margins": 5.999850273132324, "rewards/rejected": -30.845731735229492, "step": 290 }, { "epoch": 0.9987085665088248, "step": 290, "total_flos": 0.0, "train_loss": 3.2915380025732106, "train_runtime": 46073.3913, "train_samples_per_second": 0.807, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 290, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }