{ "best_metric": 1.265723466873169, "best_model_checkpoint": "saves/Gemma-7B-It/lora/orpo-salt/checkpoint-1500", "epoch": 2.9969690846635686, "eval_steps": 500, "global_step": 1854, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01616488179430188, "grad_norm": 4.377878189086914, "learning_rate": 4.999648198770648e-06, "logits/chosen": 209.9345245361328, "logits/rejected": 210.6967315673828, "logps/chosen": -2.4765946865081787, "logps/rejected": -2.9186055660247803, "loss": 2.5449, "odds_ratio_loss": 0.6828715205192566, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.24765947461128235, "rewards/margins": 0.04420109838247299, "rewards/rejected": -0.29186058044433594, "sft_loss": 2.4765946865081787, "step": 10 }, { "epoch": 0.03232976358860376, "grad_norm": 2.781564950942993, "learning_rate": 4.998578646361359e-06, "logits/chosen": 210.4038543701172, "logits/rejected": 212.20718383789062, "logps/chosen": -2.4702863693237305, "logps/rejected": -2.504176616668701, "loss": 2.564, "odds_ratio_loss": 0.9375804662704468, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.2470286339521408, "rewards/margins": 0.0033890369813889265, "rewards/rejected": -0.25041764974594116, "sft_loss": 2.4702863693237305, "step": 20 }, { "epoch": 0.04849464538290564, "grad_norm": 5.785957336425781, "learning_rate": 4.996791614004449e-06, "logits/chosen": 209.83865356445312, "logits/rejected": 212.08535766601562, "logps/chosen": -2.6004161834716797, "logps/rejected": -2.695502758026123, "loss": 2.6963, "odds_ratio_loss": 0.9585107564926147, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.26004162430763245, "rewards/margins": 0.009508667513728142, "rewards/rejected": -0.26955026388168335, "sft_loss": 2.6004161834716797, "step": 30 }, { "epoch": 0.06465952717720752, "grad_norm": 7.009506702423096, "learning_rate": 4.994287614855618e-06, "logits/chosen": 210.0410614013672, "logits/rejected": 211.40286254882812, "logps/chosen": -2.6340386867523193, "logps/rejected": -2.6249070167541504, "loss": 2.7374, "odds_ratio_loss": 1.0338027477264404, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.26340389251708984, "rewards/margins": -0.0009131729602813721, "rewards/rejected": -0.2624906897544861, "sft_loss": 2.6340386867523193, "step": 40 }, { "epoch": 0.0808244089715094, "grad_norm": 4.594735145568848, "learning_rate": 4.991067367951343e-06, "logits/chosen": 219.71932983398438, "logits/rejected": 219.6745147705078, "logps/chosen": -2.3416378498077393, "logps/rejected": -2.4940619468688965, "loss": 2.4215, "odds_ratio_loss": 0.7983426451683044, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.2341637909412384, "rewards/margins": 0.015242427587509155, "rewards/rejected": -0.24940618872642517, "sft_loss": 2.3416378498077393, "step": 50 }, { "epoch": 0.09698929076581128, "grad_norm": 2.953855276107788, "learning_rate": 4.987131798002389e-06, "logits/chosen": 217.3623504638672, "logits/rejected": 218.24862670898438, "logps/chosen": -2.2888264656066895, "logps/rejected": -2.6409952640533447, "loss": 2.3829, "odds_ratio_loss": 0.940882682800293, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.22888264060020447, "rewards/margins": 0.035216934978961945, "rewards/rejected": -0.2640995383262634, "sft_loss": 2.2888264656066895, "step": 60 }, { "epoch": 0.11315417256011315, "grad_norm": 4.224141597747803, "learning_rate": 4.982482035128285e-06, "logits/chosen": 217.9263458251953, "logits/rejected": 218.54122924804688, "logps/chosen": -2.326590061187744, "logps/rejected": -2.605003833770752, "loss": 2.4191, "odds_ratio_loss": 0.9254364967346191, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2326590120792389, "rewards/margins": 0.027841363102197647, "rewards/rejected": -0.26050037145614624, "sft_loss": 2.326590061187744, "step": 70 }, { "epoch": 0.12931905435441504, "grad_norm": 7.0222883224487305, "learning_rate": 4.9771194145328e-06, "logits/chosen": 224.885986328125, "logits/rejected": 225.7215576171875, "logps/chosen": -1.8678385019302368, "logps/rejected": -2.1334400177001953, "loss": 1.9429, "odds_ratio_loss": 0.7510749697685242, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1867838352918625, "rewards/margins": 0.026560146361589432, "rewards/rejected": -0.213344007730484, "sft_loss": 1.8678385019302368, "step": 80 }, { "epoch": 0.1454839361487169, "grad_norm": 9.777688026428223, "learning_rate": 4.971045476120532e-06, "logits/chosen": 226.64450073242188, "logits/rejected": 227.11874389648438, "logps/chosen": -1.9129263162612915, "logps/rejected": -2.10162091255188, "loss": 1.9975, "odds_ratio_loss": 0.8461491465568542, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1912926286458969, "rewards/margins": 0.01886945590376854, "rewards/rejected": -0.21016211807727814, "sft_loss": 1.9129263162612915, "step": 90 }, { "epoch": 0.1616488179430188, "grad_norm": 3.441721200942993, "learning_rate": 4.964261964054713e-06, "logits/chosen": 230.32669067382812, "logits/rejected": 231.39498901367188, "logps/chosen": -1.8438594341278076, "logps/rejected": -2.1114680767059326, "loss": 1.923, "odds_ratio_loss": 0.7917153239250183, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.18438595533370972, "rewards/margins": 0.026760881766676903, "rewards/rejected": -0.21114683151245117, "sft_loss": 1.8438594341278076, "step": 100 }, { "epoch": 0.17781369973732067, "grad_norm": 3.7387969493865967, "learning_rate": 4.956770826256372e-06, "logits/chosen": 233.9343719482422, "logits/rejected": 234.51516723632812, "logps/chosen": -1.6228179931640625, "logps/rejected": -1.8143441677093506, "loss": 1.6988, "odds_ratio_loss": 0.7598803043365479, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1622818112373352, "rewards/margins": 0.01915261521935463, "rewards/rejected": -0.18143442273139954, "sft_loss": 1.6228179931640625, "step": 110 }, { "epoch": 0.19397858153162256, "grad_norm": 2.157771110534668, "learning_rate": 4.94857421384497e-06, "logits/chosen": 235.01248168945312, "logits/rejected": 235.390869140625, "logps/chosen": -1.6021674871444702, "logps/rejected": -1.885866403579712, "loss": 1.6762, "odds_ratio_loss": 0.7404050230979919, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.1602167785167694, "rewards/margins": 0.02836987003684044, "rewards/rejected": -0.18858662247657776, "sft_loss": 1.6021674871444702, "step": 120 }, { "epoch": 0.21014346332592443, "grad_norm": 2.794867515563965, "learning_rate": 4.939674480520701e-06, "logits/chosen": 236.7910614013672, "logits/rejected": 237.41806030273438, "logps/chosen": -1.5707839727401733, "logps/rejected": -1.6964565515518188, "loss": 1.6508, "odds_ratio_loss": 0.8003607988357544, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.15707840025424957, "rewards/margins": 0.01256726123392582, "rewards/rejected": -0.16964565217494965, "sft_loss": 1.5707839727401733, "step": 130 }, { "epoch": 0.2263083451202263, "grad_norm": 1.2237716913223267, "learning_rate": 4.930074181888613e-06, "logits/chosen": 240.5333251953125, "logits/rejected": 241.0712432861328, "logps/chosen": -1.6245830059051514, "logps/rejected": -1.83035409450531, "loss": 1.6912, "odds_ratio_loss": 0.6659940481185913, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.16245830059051514, "rewards/margins": 0.02057710848748684, "rewards/rejected": -0.18303541839122772, "sft_loss": 1.6245830059051514, "step": 140 }, { "epoch": 0.2424732269145282, "grad_norm": 3.366241693496704, "learning_rate": 4.91977607472475e-06, "logits/chosen": 240.38449096679688, "logits/rejected": 241.23794555664062, "logps/chosen": -1.5312227010726929, "logps/rejected": -1.6705970764160156, "loss": 1.6035, "odds_ratio_loss": 0.7224593162536621, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.15312227606773376, "rewards/margins": 0.013937436044216156, "rewards/rejected": -0.16705971956253052, "sft_loss": 1.5312227010726929, "step": 150 }, { "epoch": 0.2586381087088301, "grad_norm": 2.1750807762145996, "learning_rate": 4.908783116184534e-06, "logits/chosen": 240.67446899414062, "logits/rejected": 241.75424194335938, "logps/chosen": -1.4731028079986572, "logps/rejected": -1.7678340673446655, "loss": 1.5362, "odds_ratio_loss": 0.6307698488235474, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1473102867603302, "rewards/margins": 0.029473140835762024, "rewards/rejected": -0.17678341269493103, "sft_loss": 1.4731028079986572, "step": 160 }, { "epoch": 0.27480299050313195, "grad_norm": 2.9354147911071777, "learning_rate": 4.897098462953598e-06, "logits/chosen": 243.85806274414062, "logits/rejected": 244.68405151367188, "logps/chosen": -1.3806183338165283, "logps/rejected": -1.7258154153823853, "loss": 1.4423, "odds_ratio_loss": 0.6170833706855774, "rewards/accuracies": 0.625, "rewards/chosen": -0.13806185126304626, "rewards/margins": 0.03451969474554062, "rewards/rejected": -0.1725815385580063, "sft_loss": 1.3806183338165283, "step": 170 }, { "epoch": 0.2909678722974338, "grad_norm": 1.4452638626098633, "learning_rate": 4.884725470341331e-06, "logits/chosen": 242.984619140625, "logits/rejected": 243.6776580810547, "logps/chosen": -1.2990996837615967, "logps/rejected": -1.616987943649292, "loss": 1.3597, "odds_ratio_loss": 0.6057690382003784, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12990999221801758, "rewards/margins": 0.031788814812898636, "rewards/rejected": -0.16169880330562592, "sft_loss": 1.2990996837615967, "step": 180 }, { "epoch": 0.3071327540917357, "grad_norm": 4.690347194671631, "learning_rate": 4.871667691317377e-06, "logits/chosen": 244.59634399414062, "logits/rejected": 244.5352325439453, "logps/chosen": -1.4848819971084595, "logps/rejected": -1.573250412940979, "loss": 1.5639, "odds_ratio_loss": 0.7902374267578125, "rewards/accuracies": 0.5, "rewards/chosen": -0.14848819375038147, "rewards/margins": 0.008836844936013222, "rewards/rejected": -0.15732502937316895, "sft_loss": 1.4848819971084595, "step": 190 }, { "epoch": 0.3232976358860376, "grad_norm": 7.527270317077637, "learning_rate": 4.857928875491392e-06, "logits/chosen": 243.60494995117188, "logits/rejected": 244.3643035888672, "logps/chosen": -1.3324997425079346, "logps/rejected": -1.5205867290496826, "loss": 1.402, "odds_ratio_loss": 0.6946145296096802, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13324996829032898, "rewards/margins": 0.018808716908097267, "rewards/rejected": -0.1520586758852005, "sft_loss": 1.3324997425079346, "step": 200 }, { "epoch": 0.33946251768033947, "grad_norm": 2.1978328227996826, "learning_rate": 4.843512968036314e-06, "logits/chosen": 244.3915557861328, "logits/rejected": 244.58700561523438, "logps/chosen": -1.3562281131744385, "logps/rejected": -1.4892576932907104, "loss": 1.4274, "odds_ratio_loss": 0.7121940851211548, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1356228142976761, "rewards/margins": 0.01330297440290451, "rewards/rejected": -0.1489257663488388, "sft_loss": 1.3562281131744385, "step": 210 }, { "epoch": 0.35562739947464134, "grad_norm": 6.31206750869751, "learning_rate": 4.828424108555486e-06, "logits/chosen": 246.1901092529297, "logits/rejected": 246.36703491210938, "logps/chosen": -1.5392124652862549, "logps/rejected": -1.7705978155136108, "loss": 1.6086, "odds_ratio_loss": 0.6943382024765015, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1539212465286255, "rewards/margins": 0.023138541728258133, "rewards/rejected": -0.17705979943275452, "sft_loss": 1.5392124652862549, "step": 220 }, { "epoch": 0.3717922812689432, "grad_norm": 1.1257890462875366, "learning_rate": 4.812666629893957e-06, "logits/chosen": 246.37399291992188, "logits/rejected": 246.72891235351562, "logps/chosen": -1.3704453706741333, "logps/rejected": -1.4485595226287842, "loss": 1.4433, "odds_ratio_loss": 0.7287623882293701, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1370445340871811, "rewards/margins": 0.007811415940523148, "rewards/rejected": -0.14485594630241394, "sft_loss": 1.3704453706741333, "step": 230 }, { "epoch": 0.3879571630632451, "grad_norm": 1.9700157642364502, "learning_rate": 4.796245056894273e-06, "logits/chosen": 244.54165649414062, "logits/rejected": 244.89407348632812, "logps/chosen": -1.4429550170898438, "logps/rejected": -1.5743396282196045, "loss": 1.5184, "odds_ratio_loss": 0.7547486424446106, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.14429552853107452, "rewards/margins": 0.013138455338776112, "rewards/rejected": -0.15743397176265717, "sft_loss": 1.4429550170898438, "step": 240 }, { "epoch": 0.404122044857547, "grad_norm": 1.5832947492599487, "learning_rate": 4.779164105097148e-06, "logits/chosen": 246.41659545898438, "logits/rejected": 246.4707489013672, "logps/chosen": -1.3124094009399414, "logps/rejected": -1.5739551782608032, "loss": 1.3768, "odds_ratio_loss": 0.6443756818771362, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.13124093413352966, "rewards/margins": 0.026154566556215286, "rewards/rejected": -0.15739551186561584, "sft_loss": 1.3124094009399414, "step": 250 }, { "epoch": 0.42028692665184886, "grad_norm": 2.2224152088165283, "learning_rate": 4.761428679387373e-06, "logits/chosen": 247.0335235595703, "logits/rejected": 247.7626953125, "logps/chosen": -1.2735482454299927, "logps/rejected": -1.5084031820297241, "loss": 1.3358, "odds_ratio_loss": 0.6226388216018677, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.12735481560230255, "rewards/margins": 0.02348550595343113, "rewards/rejected": -0.15084032714366913, "sft_loss": 1.2735482454299927, "step": 260 }, { "epoch": 0.4364518084461507, "grad_norm": 2.0271799564361572, "learning_rate": 4.7430438725853515e-06, "logits/chosen": 247.60205078125, "logits/rejected": 247.61654663085938, "logps/chosen": -1.3570277690887451, "logps/rejected": -1.714133858680725, "loss": 1.4226, "odds_ratio_loss": 0.6552777290344238, "rewards/accuracies": 0.625, "rewards/chosen": -0.13570277392864227, "rewards/margins": 0.035710614174604416, "rewards/rejected": -0.171413391828537, "sft_loss": 1.3570277690887451, "step": 270 }, { "epoch": 0.4526166902404526, "grad_norm": 2.142329216003418, "learning_rate": 4.724014963984669e-06, "logits/chosen": 248.28439331054688, "logits/rejected": 249.0177459716797, "logps/chosen": -1.3674625158309937, "logps/rejected": -1.6127933263778687, "loss": 1.435, "odds_ratio_loss": 0.6751004457473755, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.13674625754356384, "rewards/margins": 0.02453308179974556, "rewards/rejected": -0.1612793505191803, "sft_loss": 1.3674625158309937, "step": 280 }, { "epoch": 0.4687815720347545, "grad_norm": 2.8357582092285156, "learning_rate": 4.704347417836116e-06, "logits/chosen": 247.2007598876953, "logits/rejected": 247.60107421875, "logps/chosen": -1.2728191614151, "logps/rejected": -1.5069888830184937, "loss": 1.3382, "odds_ratio_loss": 0.6542028784751892, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.12728191912174225, "rewards/margins": 0.023416969925165176, "rewards/rejected": -0.15069888532161713, "sft_loss": 1.2728191614151, "step": 290 }, { "epoch": 0.4849464538290564, "grad_norm": 3.075584888458252, "learning_rate": 4.684046881778603e-06, "logits/chosen": 247.69580078125, "logits/rejected": 247.7963409423828, "logps/chosen": -1.3267529010772705, "logps/rejected": -1.46425461769104, "loss": 1.3929, "odds_ratio_loss": 0.6614553332328796, "rewards/accuracies": 0.59375, "rewards/chosen": -0.13267529010772705, "rewards/margins": 0.013750175014138222, "rewards/rejected": -0.14642547070980072, "sft_loss": 1.3267529010772705, "step": 300 }, { "epoch": 0.5011113356233583, "grad_norm": 1.1745957136154175, "learning_rate": 4.663119185217409e-06, "logits/chosen": 247.5077667236328, "logits/rejected": 247.80752563476562, "logps/chosen": -1.2750051021575928, "logps/rejected": -1.5364891290664673, "loss": 1.3385, "odds_ratio_loss": 0.6352204084396362, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1275005042552948, "rewards/margins": 0.026148397475481033, "rewards/rejected": -0.15364892780780792, "sft_loss": 1.2750051021575928, "step": 310 }, { "epoch": 0.5172762174176602, "grad_norm": 1.1816167831420898, "learning_rate": 4.641570337650232e-06, "logits/chosen": 248.5536651611328, "logits/rejected": 248.5113067626953, "logps/chosen": -1.1914936304092407, "logps/rejected": -1.4479808807373047, "loss": 1.2531, "odds_ratio_loss": 0.615585446357727, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11914938688278198, "rewards/margins": 0.025648722425103188, "rewards/rejected": -0.14479808509349823, "sft_loss": 1.1914936304092407, "step": 320 }, { "epoch": 0.533441099211962, "grad_norm": 6.805661678314209, "learning_rate": 4.61940652694154e-06, "logits/chosen": 246.8784637451172, "logits/rejected": 247.60842895507812, "logps/chosen": -1.371927261352539, "logps/rejected": -1.4951013326644897, "loss": 1.444, "odds_ratio_loss": 0.7208858728408813, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1371927261352539, "rewards/margins": 0.012317392975091934, "rewards/rejected": -0.14951011538505554, "sft_loss": 1.371927261352539, "step": 330 }, { "epoch": 0.5496059810062639, "grad_norm": 2.8288872241973877, "learning_rate": 4.596634117545689e-06, "logits/chosen": 248.96542358398438, "logits/rejected": 249.38369750976562, "logps/chosen": -1.3861172199249268, "logps/rejected": -1.6291033029556274, "loss": 1.4514, "odds_ratio_loss": 0.6529659032821655, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.13861171901226044, "rewards/margins": 0.02429860271513462, "rewards/rejected": -0.1629103422164917, "sft_loss": 1.3861172199249268, "step": 340 }, { "epoch": 0.5657708628005658, "grad_norm": 2.343557834625244, "learning_rate": 4.573259648679335e-06, "logits/chosen": 247.5604248046875, "logits/rejected": 247.8214111328125, "logps/chosen": -1.3334286212921143, "logps/rejected": -1.642163634300232, "loss": 1.3937, "odds_ratio_loss": 0.603044331073761, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.13334286212921143, "rewards/margins": 0.030873507261276245, "rewards/rejected": -0.16421635448932648, "sft_loss": 1.3334286212921143, "step": 350 }, { "epoch": 0.5819357445948676, "grad_norm": 6.341250896453857, "learning_rate": 4.549289832443663e-06, "logits/chosen": 249.6760711669922, "logits/rejected": 249.2826385498047, "logps/chosen": -1.2829958200454712, "logps/rejected": -1.5420135259628296, "loss": 1.351, "odds_ratio_loss": 0.6805331110954285, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.12829959392547607, "rewards/margins": 0.025901764631271362, "rewards/rejected": -0.15420134365558624, "sft_loss": 1.2829958200454712, "step": 360 }, { "epoch": 0.5981006263891695, "grad_norm": 1.415165901184082, "learning_rate": 4.524731551896978e-06, "logits/chosen": 247.46142578125, "logits/rejected": 247.42459106445312, "logps/chosen": -1.2169711589813232, "logps/rejected": -1.3963136672973633, "loss": 1.2853, "odds_ratio_loss": 0.6832239031791687, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.12169712781906128, "rewards/margins": 0.017934244126081467, "rewards/rejected": -0.13963137567043304, "sft_loss": 1.2169711589813232, "step": 370 }, { "epoch": 0.6142655081834714, "grad_norm": 2.7373573780059814, "learning_rate": 4.4995918590781925e-06, "logits/chosen": 250.4862518310547, "logits/rejected": 250.1310272216797, "logps/chosen": -1.2185784578323364, "logps/rejected": -1.435258150100708, "loss": 1.2834, "odds_ratio_loss": 0.6484395265579224, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12185785919427872, "rewards/margins": 0.021667957305908203, "rewards/rejected": -0.14352580904960632, "sft_loss": 1.2185784578323364, "step": 380 }, { "epoch": 0.6304303899777733, "grad_norm": 1.0431718826293945, "learning_rate": 4.473877972981797e-06, "logits/chosen": 247.82730102539062, "logits/rejected": 248.197998046875, "logps/chosen": -1.3133275508880615, "logps/rejected": -1.566138505935669, "loss": 1.378, "odds_ratio_loss": 0.6465703248977661, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.13133276998996735, "rewards/margins": 0.02528109773993492, "rewards/rejected": -0.15661385655403137, "sft_loss": 1.3133275508880615, "step": 390 }, { "epoch": 0.6465952717720752, "grad_norm": 2.605905771255493, "learning_rate": 4.447597277484894e-06, "logits/chosen": 248.4889678955078, "logits/rejected": 248.0493927001953, "logps/chosen": -1.1982879638671875, "logps/rejected": -1.3909344673156738, "loss": 1.2662, "odds_ratio_loss": 0.6788827180862427, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.11982879787683487, "rewards/margins": 0.01926465705037117, "rewards/rejected": -0.13909344375133514, "sft_loss": 1.1982879638671875, "step": 400 }, { "epoch": 0.6627601535663771, "grad_norm": 2.7441976070404053, "learning_rate": 4.42075731922687e-06, "logits/chosen": 250.9984893798828, "logits/rejected": 250.879150390625, "logps/chosen": -1.3381072282791138, "logps/rejected": -1.476546049118042, "loss": 1.4062, "odds_ratio_loss": 0.6813501119613647, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.13381072878837585, "rewards/margins": 0.013843873515725136, "rewards/rejected": -0.14765460789203644, "sft_loss": 1.3381072282791138, "step": 410 }, { "epoch": 0.6789250353606789, "grad_norm": 3.2034897804260254, "learning_rate": 4.3933658054423465e-06, "logits/chosen": 249.34951782226562, "logits/rejected": 249.37582397460938, "logps/chosen": -1.2343724966049194, "logps/rejected": -1.4455711841583252, "loss": 1.2964, "odds_ratio_loss": 0.6205655932426453, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.12343724817037582, "rewards/margins": 0.021119873970746994, "rewards/rejected": -0.1445571333169937, "sft_loss": 1.2343724966049194, "step": 420 }, { "epoch": 0.6950899171549808, "grad_norm": 2.552898645401001, "learning_rate": 4.365430601748003e-06, "logits/chosen": 247.7689208984375, "logits/rejected": 247.95205688476562, "logps/chosen": -1.3558423519134521, "logps/rejected": -1.4942983388900757, "loss": 1.4265, "odds_ratio_loss": 0.7065616250038147, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.13558423519134521, "rewards/margins": 0.013845594599843025, "rewards/rejected": -0.1494298279285431, "sft_loss": 1.3558423519134521, "step": 430 }, { "epoch": 0.7112547989492827, "grad_norm": 7.701834201812744, "learning_rate": 4.336959729883925e-06, "logits/chosen": 248.16812133789062, "logits/rejected": 248.47384643554688, "logps/chosen": -1.2508445978164673, "logps/rejected": -1.3401494026184082, "loss": 1.3242, "odds_ratio_loss": 0.7333300113677979, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.12508445978164673, "rewards/margins": 0.008930487558245659, "rewards/rejected": -0.13401496410369873, "sft_loss": 1.2508445978164673, "step": 440 }, { "epoch": 0.7274196807435845, "grad_norm": 1.3677743673324585, "learning_rate": 4.307961365410118e-06, "logits/chosen": 249.19546508789062, "logits/rejected": 249.5364990234375, "logps/chosen": -1.2851893901824951, "logps/rejected": -1.4277610778808594, "loss": 1.3525, "odds_ratio_loss": 0.6732175946235657, "rewards/accuracies": 0.53125, "rewards/chosen": -0.12851892411708832, "rewards/margins": 0.014257180504500866, "rewards/rejected": -0.14277611672878265, "sft_loss": 1.2851893901824951, "step": 450 }, { "epoch": 0.7435845625378864, "grad_norm": 3.3310444355010986, "learning_rate": 4.278443835358854e-06, "logits/chosen": 249.6570281982422, "logits/rejected": 249.6079864501953, "logps/chosen": -1.1893627643585205, "logps/rejected": -1.4945032596588135, "loss": 1.2482, "odds_ratio_loss": 0.5885173082351685, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.11893627792596817, "rewards/margins": 0.030514035373926163, "rewards/rejected": -0.14945031702518463, "sft_loss": 1.1893627643585205, "step": 460 }, { "epoch": 0.7597494443321883, "grad_norm": 2.5770180225372314, "learning_rate": 4.248415615843523e-06, "logits/chosen": 249.5537567138672, "logits/rejected": 249.5972442626953, "logps/chosen": -1.2710212469100952, "logps/rejected": -1.4037456512451172, "loss": 1.3415, "odds_ratio_loss": 0.7046067714691162, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.12710212171077728, "rewards/margins": 0.01327243447303772, "rewards/rejected": -0.140374556183815, "sft_loss": 1.2710212469100952, "step": 470 }, { "epoch": 0.7759143261264903, "grad_norm": 9.182385444641113, "learning_rate": 4.217885329624666e-06, "logits/chosen": 249.1255645751953, "logits/rejected": 249.16854858398438, "logps/chosen": -1.1571811437606812, "logps/rejected": -1.4825657606124878, "loss": 1.2175, "odds_ratio_loss": 0.6027355194091797, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11571812629699707, "rewards/margins": 0.032538462430238724, "rewards/rejected": -0.1482565701007843, "sft_loss": 1.1571811437606812, "step": 480 }, { "epoch": 0.7920792079207921, "grad_norm": 2.0430970191955566, "learning_rate": 4.186861743633911e-06, "logits/chosen": 248.51168823242188, "logits/rejected": 248.83370971679688, "logps/chosen": -1.216133713722229, "logps/rejected": -1.4709254503250122, "loss": 1.2856, "odds_ratio_loss": 0.694364070892334, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.12161336094141006, "rewards/margins": 0.025479182600975037, "rewards/rejected": -0.1470925360918045, "sft_loss": 1.216133713722229, "step": 490 }, { "epoch": 0.808244089715094, "grad_norm": 2.13413143157959, "learning_rate": 4.155353766456497e-06, "logits/chosen": 252.05142211914062, "logits/rejected": 251.9636993408203, "logps/chosen": -1.3067327737808228, "logps/rejected": -1.4753313064575195, "loss": 1.374, "odds_ratio_loss": 0.6729229688644409, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13067328929901123, "rewards/margins": 0.01685984991490841, "rewards/rejected": -0.147533118724823, "sft_loss": 1.3067327737808228, "step": 500 }, { "epoch": 0.808244089715094, "eval_logits/chosen": 249.61227416992188, "eval_logits/rejected": 249.90635681152344, "eval_logps/chosen": -1.2762008905410767, "eval_logps/rejected": -1.5033098459243774, "eval_loss": 1.3435848951339722, "eval_odds_ratio_loss": 0.6738389730453491, "eval_rewards/accuracies": 0.5672727227210999, "eval_rewards/chosen": -0.12762011587619781, "eval_rewards/margins": 0.0227108895778656, "eval_rewards/rejected": -0.15033100545406342, "eval_runtime": 221.4313, "eval_samples_per_second": 4.968, "eval_sft_loss": 1.2762008905410767, "eval_steps_per_second": 2.484, "step": 500 }, { "epoch": 0.8244089715093958, "grad_norm": 2.4113426208496094, "learning_rate": 4.123370445773134e-06, "logits/chosen": 250.3010711669922, "logits/rejected": 250.5003662109375, "logps/chosen": -1.2399041652679443, "logps/rejected": -1.3490018844604492, "loss": 1.3103, "odds_ratio_loss": 0.7042885422706604, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12399041652679443, "rewards/margins": 0.010909780859947205, "rewards/rejected": -0.13490018248558044, "sft_loss": 1.2399041652679443, "step": 510 }, { "epoch": 0.8405738533036977, "grad_norm": 4.632988452911377, "learning_rate": 4.090920965761906e-06, "logits/chosen": 249.44210815429688, "logits/rejected": 249.96994018554688, "logps/chosen": -1.2807283401489258, "logps/rejected": -1.4942976236343384, "loss": 1.3484, "odds_ratio_loss": 0.6771414875984192, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1280728280544281, "rewards/margins": 0.02135692723095417, "rewards/rejected": -0.14942976832389832, "sft_loss": 1.2807283401489258, "step": 520 }, { "epoch": 0.8567387350979996, "grad_norm": 9.196592330932617, "learning_rate": 4.058014644460991e-06, "logits/chosen": 250.1853790283203, "logits/rejected": 250.6529083251953, "logps/chosen": -1.2633569240570068, "logps/rejected": -1.4294393062591553, "loss": 1.329, "odds_ratio_loss": 0.6560603976249695, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.12633569538593292, "rewards/margins": 0.01660825125873089, "rewards/rejected": -0.14294394850730896, "sft_loss": 1.2633569240570068, "step": 530 }, { "epoch": 0.8729036168923014, "grad_norm": 1.8403383493423462, "learning_rate": 4.024660931092939e-06, "logits/chosen": 250.708251953125, "logits/rejected": 251.0161895751953, "logps/chosen": -1.287913203239441, "logps/rejected": -1.553476095199585, "loss": 1.3531, "odds_ratio_loss": 0.6521779298782349, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.12879131734371185, "rewards/margins": 0.026556288823485374, "rewards/rejected": -0.15534761548042297, "sft_loss": 1.287913203239441, "step": 540 }, { "epoch": 0.8890684986866033, "grad_norm": 7.186382293701172, "learning_rate": 3.990869403351272e-06, "logits/chosen": 251.8153839111328, "logits/rejected": 251.8900604248047, "logps/chosen": -1.268169641494751, "logps/rejected": -1.511528491973877, "loss": 1.3283, "odds_ratio_loss": 0.6014903783798218, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12681695818901062, "rewards/margins": 0.024335889145731926, "rewards/rejected": -0.1511528491973877, "sft_loss": 1.268169641494751, "step": 550 }, { "epoch": 0.9052333804809052, "grad_norm": 2.923800230026245, "learning_rate": 3.956649764650206e-06, "logits/chosen": 250.7381591796875, "logits/rejected": 250.7707061767578, "logps/chosen": -1.2698795795440674, "logps/rejected": -1.4995825290679932, "loss": 1.3379, "odds_ratio_loss": 0.6804186105728149, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.12698796391487122, "rewards/margins": 0.0229702927172184, "rewards/rejected": -0.14995825290679932, "sft_loss": 1.2698795795440674, "step": 560 }, { "epoch": 0.9213982622752072, "grad_norm": 6.1557416915893555, "learning_rate": 3.92201184133826e-06, "logits/chosen": 250.94808959960938, "logits/rejected": 251.642822265625, "logps/chosen": -1.2907052040100098, "logps/rejected": -1.54143226146698, "loss": 1.3538, "odds_ratio_loss": 0.6311336755752563, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.12907053530216217, "rewards/margins": 0.025072699412703514, "rewards/rejected": -0.15414324402809143, "sft_loss": 1.2907052040100098, "step": 570 }, { "epoch": 0.937563144069509, "grad_norm": 2.1665000915527344, "learning_rate": 3.886965579876572e-06, "logits/chosen": 252.3577423095703, "logits/rejected": 252.0865478515625, "logps/chosen": -1.2575817108154297, "logps/rejected": -1.3686320781707764, "loss": 1.3293, "odds_ratio_loss": 0.7170482873916626, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.12575815618038177, "rewards/margins": 0.011105048470199108, "rewards/rejected": -0.13686320185661316, "sft_loss": 1.2575817108154297, "step": 580 }, { "epoch": 0.9537280258638109, "grad_norm": 2.289733648300171, "learning_rate": 3.851521043982716e-06, "logits/chosen": 251.7819061279297, "logits/rejected": 251.52749633789062, "logps/chosen": -1.2542387247085571, "logps/rejected": -1.3954790830612183, "loss": 1.3206, "odds_ratio_loss": 0.6639243960380554, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1254238784313202, "rewards/margins": 0.014124047942459583, "rewards/rejected": -0.1395479142665863, "sft_loss": 1.2542387247085571, "step": 590 }, { "epoch": 0.9698929076581128, "grad_norm": 2.7564313411712646, "learning_rate": 3.81568841174086e-06, "logits/chosen": 251.03280639648438, "logits/rejected": 251.2174835205078, "logps/chosen": -1.2807530164718628, "logps/rejected": -1.5129293203353882, "loss": 1.3482, "odds_ratio_loss": 0.674277663230896, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.12807528674602509, "rewards/margins": 0.023217635229229927, "rewards/rejected": -0.15129292011260986, "sft_loss": 1.2807530164718628, "step": 600 }, { "epoch": 0.9860577894524146, "grad_norm": 2.1846888065338135, "learning_rate": 3.7794779726790664e-06, "logits/chosen": 249.8391571044922, "logits/rejected": 250.3789520263672, "logps/chosen": -1.1555012464523315, "logps/rejected": -1.3768011331558228, "loss": 1.2212, "odds_ratio_loss": 0.6573610305786133, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.11555011570453644, "rewards/margins": 0.022129978984594345, "rewards/rejected": -0.13768009841442108, "sft_loss": 1.1555012464523315, "step": 610 }, { "epoch": 1.0022226712467166, "grad_norm": 2.2191011905670166, "learning_rate": 3.7429001248146096e-06, "logits/chosen": 250.8198699951172, "logits/rejected": 251.24819946289062, "logps/chosen": -1.272541880607605, "logps/rejected": -1.5292177200317383, "loss": 1.3338, "odds_ratio_loss": 0.6125348806381226, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1272541880607605, "rewards/margins": 0.0256675872951746, "rewards/rejected": -0.15292176604270935, "sft_loss": 1.272541880607605, "step": 620 }, { "epoch": 1.0183875530410185, "grad_norm": 1.6834843158721924, "learning_rate": 3.7059653716681227e-06, "logits/chosen": 250.3338623046875, "logits/rejected": 250.6593780517578, "logps/chosen": -1.2664134502410889, "logps/rejected": -1.469812035560608, "loss": 1.3343, "odds_ratio_loss": 0.6792756915092468, "rewards/accuracies": 0.53125, "rewards/chosen": -0.12664134800434113, "rewards/margins": 0.020339861512184143, "rewards/rejected": -0.14698120951652527, "sft_loss": 1.2664134502410889, "step": 630 }, { "epoch": 1.0345524348353203, "grad_norm": 5.188844203948975, "learning_rate": 3.668684319247463e-06, "logits/chosen": 249.46969604492188, "logits/rejected": 250.1366729736328, "logps/chosen": -1.1969501972198486, "logps/rejected": -1.5598738193511963, "loss": 1.2558, "odds_ratio_loss": 0.588589072227478, "rewards/accuracies": 0.625, "rewards/chosen": -0.11969500780105591, "rewards/margins": 0.03629238158464432, "rewards/rejected": -0.15598741173744202, "sft_loss": 1.1969501972198486, "step": 640 }, { "epoch": 1.0507173166296222, "grad_norm": 1.8501890897750854, "learning_rate": 3.6310676730021373e-06, "logits/chosen": 250.78857421875, "logits/rejected": 250.8007354736328, "logps/chosen": -1.2203996181488037, "logps/rejected": -1.3524749279022217, "loss": 1.2867, "odds_ratio_loss": 0.662962794303894, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12203995883464813, "rewards/margins": 0.013207539916038513, "rewards/rejected": -0.13524749875068665, "sft_loss": 1.2203996181488037, "step": 650 }, { "epoch": 1.066882198423924, "grad_norm": 3.5492091178894043, "learning_rate": 3.593126234749178e-06, "logits/chosen": 250.8761749267578, "logits/rejected": 251.28622436523438, "logps/chosen": -1.2661250829696655, "logps/rejected": -1.455129861831665, "loss": 1.3334, "odds_ratio_loss": 0.6727336645126343, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.12661252915859222, "rewards/margins": 0.01890045776963234, "rewards/rejected": -0.14551296830177307, "sft_loss": 1.2661250829696655, "step": 660 }, { "epoch": 1.083047080218226, "grad_norm": 3.5715062618255615, "learning_rate": 3.554870899571343e-06, "logits/chosen": 252.4844512939453, "logits/rejected": 252.82699584960938, "logps/chosen": -1.2469182014465332, "logps/rejected": -1.4401594400405884, "loss": 1.3131, "odds_ratio_loss": 0.6617658734321594, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12469182908535004, "rewards/margins": 0.019324112683534622, "rewards/rejected": -0.14401593804359436, "sft_loss": 1.2469182014465332, "step": 670 }, { "epoch": 1.0992119620125278, "grad_norm": 4.318095684051514, "learning_rate": 3.5163126526885373e-06, "logits/chosen": 252.0143585205078, "logits/rejected": 251.80081176757812, "logps/chosen": -1.1914775371551514, "logps/rejected": -1.4009137153625488, "loss": 1.2573, "odds_ratio_loss": 0.6579803824424744, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.11914775520563126, "rewards/margins": 0.02094360813498497, "rewards/rejected": -0.14009135961532593, "sft_loss": 1.1914775371551514, "step": 680 }, { "epoch": 1.1153768438068297, "grad_norm": 2.403775930404663, "learning_rate": 3.4774625663033484e-06, "logits/chosen": 251.2095184326172, "logits/rejected": 251.48507690429688, "logps/chosen": -1.2048381567001343, "logps/rejected": -1.3877532482147217, "loss": 1.27, "odds_ratio_loss": 0.6517833471298218, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.12048381567001343, "rewards/margins": 0.018291514366865158, "rewards/rejected": -0.13877533376216888, "sft_loss": 1.2048381567001343, "step": 690 }, { "epoch": 1.1315417256011315, "grad_norm": 1.7898093461990356, "learning_rate": 3.4383317964216067e-06, "logits/chosen": 252.33316040039062, "logits/rejected": 252.1842498779297, "logps/chosen": -1.1471569538116455, "logps/rejected": -1.306755781173706, "loss": 1.2157, "odds_ratio_loss": 0.6855098009109497, "rewards/accuracies": 0.53125, "rewards/chosen": -0.11471569538116455, "rewards/margins": 0.01595989242196083, "rewards/rejected": -0.13067558407783508, "sft_loss": 1.1471569538116455, "step": 700 }, { "epoch": 1.1477066073954334, "grad_norm": 3.209373712539673, "learning_rate": 3.398931579648877e-06, "logits/chosen": 251.15170288085938, "logits/rejected": 251.59976196289062, "logps/chosen": -1.239712119102478, "logps/rejected": -1.5323327779769897, "loss": 1.3045, "odds_ratio_loss": 0.6475890874862671, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12397120893001556, "rewards/margins": 0.029262065887451172, "rewards/rejected": -0.15323328971862793, "sft_loss": 1.239712119102478, "step": 710 }, { "epoch": 1.1638714891897353, "grad_norm": 2.6601579189300537, "learning_rate": 3.359273229963813e-06, "logits/chosen": 250.285400390625, "logits/rejected": 250.47323608398438, "logps/chosen": -1.2064179182052612, "logps/rejected": -1.3739216327667236, "loss": 1.2742, "odds_ratio_loss": 0.6774007081985474, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.12064179033041, "rewards/margins": 0.01675037480890751, "rewards/rejected": -0.13739216327667236, "sft_loss": 1.2064179182052612, "step": 720 }, { "epoch": 1.1800363709840371, "grad_norm": 1.836297631263733, "learning_rate": 3.319368135469285e-06, "logits/chosen": 251.77334594726562, "logits/rejected": 252.28701782226562, "logps/chosen": -1.2479230165481567, "logps/rejected": -1.4433178901672363, "loss": 1.3175, "odds_ratio_loss": 0.6954701542854309, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12479230016469955, "rewards/margins": 0.019539497792720795, "rewards/rejected": -0.14433178305625916, "sft_loss": 1.2479230165481567, "step": 730 }, { "epoch": 1.196201252778339, "grad_norm": 3.1846110820770264, "learning_rate": 3.279227755122228e-06, "logits/chosen": 252.08438110351562, "logits/rejected": 252.65792846679688, "logps/chosen": -1.196380376815796, "logps/rejected": -1.5170973539352417, "loss": 1.2585, "odds_ratio_loss": 0.6215213537216187, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11963804066181183, "rewards/margins": 0.032071683555841446, "rewards/rejected": -0.15170973539352417, "sft_loss": 1.196380376815796, "step": 740 }, { "epoch": 1.2123661345726409, "grad_norm": 3.024951934814453, "learning_rate": 3.2388636154431417e-06, "logits/chosen": 253.1087646484375, "logits/rejected": 253.23635864257812, "logps/chosen": -1.3020392656326294, "logps/rejected": -1.5343925952911377, "loss": 1.3675, "odds_ratio_loss": 0.654754638671875, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.13020391762256622, "rewards/margins": 0.02323536016047001, "rewards/rejected": -0.1534392535686493, "sft_loss": 1.3020392656326294, "step": 750 }, { "epoch": 1.2285310163669427, "grad_norm": 2.166121482849121, "learning_rate": 3.198287307206192e-06, "logits/chosen": 251.711669921875, "logits/rejected": 251.5684356689453, "logps/chosen": -1.1889938116073608, "logps/rejected": -1.4522913694381714, "loss": 1.2499, "odds_ratio_loss": 0.60938560962677, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1188993826508522, "rewards/margins": 0.02632974646985531, "rewards/rejected": -0.14522913098335266, "sft_loss": 1.1889938116073608, "step": 760 }, { "epoch": 1.2446958981612446, "grad_norm": 1.8584887981414795, "learning_rate": 3.157510482110856e-06, "logits/chosen": 252.8727569580078, "logits/rejected": 253.4295654296875, "logps/chosen": -1.2046940326690674, "logps/rejected": -1.360910177230835, "loss": 1.2735, "odds_ratio_loss": 0.6879505515098572, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.12046940624713898, "rewards/margins": 0.01562163233757019, "rewards/rejected": -0.13609102368354797, "sft_loss": 1.2046940326690674, "step": 770 }, { "epoch": 1.2608607799555465, "grad_norm": 1.6219208240509033, "learning_rate": 3.116544849436077e-06, "logits/chosen": 251.80764770507812, "logits/rejected": 251.75509643554688, "logps/chosen": -1.3175479173660278, "logps/rejected": -1.6150630712509155, "loss": 1.3813, "odds_ratio_loss": 0.6378855109214783, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1317548006772995, "rewards/margins": 0.029751509428024292, "rewards/rejected": -0.1615062952041626, "sft_loss": 1.3175479173660278, "step": 780 }, { "epoch": 1.2770256617498483, "grad_norm": 2.1420071125030518, "learning_rate": 3.0754021726778848e-06, "logits/chosen": 252.167724609375, "logits/rejected": 251.9316864013672, "logps/chosen": -1.1495087146759033, "logps/rejected": -1.426129937171936, "loss": 1.2132, "odds_ratio_loss": 0.6372426748275757, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11495087295770645, "rewards/margins": 0.02766209840774536, "rewards/rejected": -0.1426129937171936, "sft_loss": 1.1495087146759033, "step": 790 }, { "epoch": 1.2931905435441502, "grad_norm": 1.3823323249816895, "learning_rate": 3.0340942661714463e-06, "logits/chosen": 252.6959686279297, "logits/rejected": 252.73464965820312, "logps/chosen": -1.2912076711654663, "logps/rejected": -1.4657213687896729, "loss": 1.3573, "odds_ratio_loss": 0.6610310673713684, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.12912078201770782, "rewards/margins": 0.017451368272304535, "rewards/rejected": -0.14657214283943176, "sft_loss": 1.2912076711654663, "step": 800 }, { "epoch": 1.3093554253384523, "grad_norm": 3.4516756534576416, "learning_rate": 2.992632991698512e-06, "logits/chosen": 250.41928100585938, "logits/rejected": 250.66513061523438, "logps/chosen": -1.219699501991272, "logps/rejected": -1.483235239982605, "loss": 1.2828, "odds_ratio_loss": 0.6311507821083069, "rewards/accuracies": 0.625, "rewards/chosen": -0.12196997553110123, "rewards/margins": 0.02635357342660427, "rewards/rejected": -0.14832353591918945, "sft_loss": 1.219699501991272, "step": 810 }, { "epoch": 1.3255203071327541, "grad_norm": 2.465632677078247, "learning_rate": 2.9510302550812537e-06, "logits/chosen": 251.94296264648438, "logits/rejected": 252.61587524414062, "logps/chosen": -1.144325852394104, "logps/rejected": -1.4354488849639893, "loss": 1.2042, "odds_ratio_loss": 0.5983381271362305, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.11443258821964264, "rewards/margins": 0.02911229059100151, "rewards/rejected": -0.14354488253593445, "sft_loss": 1.144325852394104, "step": 820 }, { "epoch": 1.341685188927056, "grad_norm": 3.969513416290283, "learning_rate": 2.9092980027634325e-06, "logits/chosen": 251.37832641601562, "logits/rejected": 251.625244140625, "logps/chosen": -1.1136391162872314, "logps/rejected": -1.3801125288009644, "loss": 1.1766, "odds_ratio_loss": 0.6293498277664185, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.11136390268802643, "rewards/margins": 0.026647353544831276, "rewards/rejected": -0.13801124691963196, "sft_loss": 1.1136391162872314, "step": 830 }, { "epoch": 1.3578500707213579, "grad_norm": 1.7552839517593384, "learning_rate": 2.867448218379927e-06, "logits/chosen": 252.9868621826172, "logits/rejected": 253.2499542236328, "logps/chosen": -1.249079704284668, "logps/rejected": -1.4685295820236206, "loss": 1.3139, "odds_ratio_loss": 0.6482545733451843, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1249079704284668, "rewards/margins": 0.02194499969482422, "rewards/rejected": -0.14685297012329102, "sft_loss": 1.249079704284668, "step": 840 }, { "epoch": 1.3740149525156597, "grad_norm": 5.6061906814575195, "learning_rate": 2.825492919315559e-06, "logits/chosen": 252.72372436523438, "logits/rejected": 252.32168579101562, "logps/chosen": -1.2922828197479248, "logps/rejected": -1.4327681064605713, "loss": 1.3613, "odds_ratio_loss": 0.6897528767585754, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.12922829389572144, "rewards/margins": 0.01404851209372282, "rewards/rejected": -0.14327679574489594, "sft_loss": 1.2922828197479248, "step": 850 }, { "epoch": 1.3901798343099616, "grad_norm": 2.2057290077209473, "learning_rate": 2.7834441532542482e-06, "logits/chosen": 251.51272583007812, "logits/rejected": 251.97573852539062, "logps/chosen": -1.1630654335021973, "logps/rejected": -1.4224598407745361, "loss": 1.2262, "odds_ratio_loss": 0.6317997574806213, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.11630652844905853, "rewards/margins": 0.025939440354704857, "rewards/rejected": -0.14224597811698914, "sft_loss": 1.1630654335021973, "step": 860 }, { "epoch": 1.4063447161042635, "grad_norm": 2.0599286556243896, "learning_rate": 2.74131399471945e-06, "logits/chosen": 252.7571258544922, "logits/rejected": 253.06008911132812, "logps/chosen": -1.2314178943634033, "logps/rejected": -1.404909372329712, "loss": 1.297, "odds_ratio_loss": 0.6555390357971191, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.12314176559448242, "rewards/margins": 0.01734915003180504, "rewards/rejected": -0.14049093425273895, "sft_loss": 1.2314178943634033, "step": 870 }, { "epoch": 1.4225095978985653, "grad_norm": 3.7026567459106445, "learning_rate": 2.6991145416068947e-06, "logits/chosen": 252.689697265625, "logits/rejected": 252.87332153320312, "logps/chosen": -1.2634754180908203, "logps/rejected": -1.376312255859375, "loss": 1.3339, "odds_ratio_loss": 0.7037913799285889, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.12634754180908203, "rewards/margins": 0.011283671483397484, "rewards/rejected": -0.13763120770454407, "sft_loss": 1.2634754180908203, "step": 880 }, { "epoch": 1.4386744796928672, "grad_norm": 2.7741122245788574, "learning_rate": 2.6568579117106143e-06, "logits/chosen": 251.893310546875, "logits/rejected": 251.9792022705078, "logps/chosen": -1.1909462213516235, "logps/rejected": -1.444592833518982, "loss": 1.257, "odds_ratio_loss": 0.6606670618057251, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11909462511539459, "rewards/margins": 0.025364672765135765, "rewards/rejected": -0.1444592922925949, "sft_loss": 1.1909462213516235, "step": 890 }, { "epoch": 1.454839361487169, "grad_norm": 1.2793887853622437, "learning_rate": 2.6145562392432544e-06, "logits/chosen": 253.50723266601562, "logits/rejected": 253.42764282226562, "logps/chosen": -1.2168656587600708, "logps/rejected": -1.336360216140747, "loss": 1.2887, "odds_ratio_loss": 0.7181415557861328, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12168655544519424, "rewards/margins": 0.01194946188479662, "rewards/rejected": -0.13363602757453918, "sft_loss": 1.2168656587600708, "step": 900 }, { "epoch": 1.471004243281471, "grad_norm": 2.857558012008667, "learning_rate": 2.5722216713516682e-06, "logits/chosen": 252.78237915039062, "logits/rejected": 253.788330078125, "logps/chosen": -1.1416139602661133, "logps/rejected": -1.3757555484771729, "loss": 1.2043, "odds_ratio_loss": 0.6271349787712097, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.11416139453649521, "rewards/margins": 0.023414146155118942, "rewards/rejected": -0.13757555186748505, "sft_loss": 1.1416139602661133, "step": 910 }, { "epoch": 1.4871691250757728, "grad_norm": 2.625776529312134, "learning_rate": 2.5298663646288064e-06, "logits/chosen": 253.61221313476562, "logits/rejected": 254.0120391845703, "logps/chosen": -1.1546480655670166, "logps/rejected": -1.4036109447479248, "loss": 1.2201, "odds_ratio_loss": 0.6542297601699829, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.11546480655670166, "rewards/margins": 0.024896297603845596, "rewards/rejected": -0.14036110043525696, "sft_loss": 1.1546480655670166, "step": 920 }, { "epoch": 1.503334006870075, "grad_norm": 3.928030014038086, "learning_rate": 2.487502481622879e-06, "logits/chosen": 252.84619140625, "logits/rejected": 253.71127319335938, "logps/chosen": -1.2712576389312744, "logps/rejected": -1.42746901512146, "loss": 1.3413, "odds_ratio_loss": 0.7003083229064941, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12712574005126953, "rewards/margins": 0.01562117226421833, "rewards/rejected": -0.1427469402551651, "sft_loss": 1.2712576389312744, "step": 930 }, { "epoch": 1.5194988886643768, "grad_norm": 2.4900426864624023, "learning_rate": 2.4451421873448253e-06, "logits/chosen": 252.51846313476562, "logits/rejected": 253.07400512695312, "logps/chosen": -1.193199634552002, "logps/rejected": -1.3677222728729248, "loss": 1.2601, "odds_ratio_loss": 0.6688076257705688, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.11931997537612915, "rewards/margins": 0.01745227724313736, "rewards/rejected": -0.1367722451686859, "sft_loss": 1.193199634552002, "step": 940 }, { "epoch": 1.5356637704586786, "grad_norm": 6.85699987411499, "learning_rate": 2.40279764577506e-06, "logits/chosen": 253.85693359375, "logits/rejected": 253.9010467529297, "logps/chosen": -1.304840087890625, "logps/rejected": -1.417873501777649, "loss": 1.3741, "odds_ratio_loss": 0.6923686861991882, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.13048401474952698, "rewards/margins": 0.011303339153528214, "rewards/rejected": -0.1417873501777649, "sft_loss": 1.304840087890625, "step": 950 }, { "epoch": 1.5518286522529805, "grad_norm": 2.3570547103881836, "learning_rate": 2.3604810163705242e-06, "logits/chosen": 253.90060424804688, "logits/rejected": 254.25430297851562, "logps/chosen": -1.1358963251113892, "logps/rejected": -1.3512394428253174, "loss": 1.1966, "odds_ratio_loss": 0.6068128943443298, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11358962953090668, "rewards/margins": 0.021534323692321777, "rewards/rejected": -0.13512396812438965, "sft_loss": 1.1358963251113892, "step": 960 }, { "epoch": 1.5679935340472824, "grad_norm": 1.6715513467788696, "learning_rate": 2.3182044505730364e-06, "logits/chosen": 252.765380859375, "logits/rejected": 252.7443389892578, "logps/chosen": -1.0937732458114624, "logps/rejected": -1.302191972732544, "loss": 1.1567, "odds_ratio_loss": 0.6288636922836304, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.10937733948230743, "rewards/margins": 0.020841870456933975, "rewards/rejected": -0.13021919131278992, "sft_loss": 1.0937732458114624, "step": 970 }, { "epoch": 1.5841584158415842, "grad_norm": 1.8489584922790527, "learning_rate": 2.275980088319941e-06, "logits/chosen": 253.30712890625, "logits/rejected": 253.5155487060547, "logps/chosen": -1.149460792541504, "logps/rejected": -1.2745110988616943, "loss": 1.2198, "odds_ratio_loss": 0.7036079168319702, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.11494608223438263, "rewards/margins": 0.012505029328167439, "rewards/rejected": -0.1274511069059372, "sft_loss": 1.149460792541504, "step": 980 }, { "epoch": 1.600323297635886, "grad_norm": 2.3143341541290283, "learning_rate": 2.2338200545580577e-06, "logits/chosen": 253.9146728515625, "logits/rejected": 254.3609619140625, "logps/chosen": -1.1358720064163208, "logps/rejected": -1.409860372543335, "loss": 1.203, "odds_ratio_loss": 0.6715231537818909, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.11358718574047089, "rewards/margins": 0.02739885076880455, "rewards/rejected": -0.14098605513572693, "sft_loss": 1.1358720064163208, "step": 990 }, { "epoch": 1.616488179430188, "grad_norm": 2.5078933238983154, "learning_rate": 2.191736455761947e-06, "logits/chosen": 252.4419708251953, "logits/rejected": 252.6824493408203, "logps/chosen": -1.102782964706421, "logps/rejected": -1.295693039894104, "loss": 1.1628, "odds_ratio_loss": 0.5999386310577393, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.11027830839157104, "rewards/margins": 0.019290992990136147, "rewards/rejected": -0.12956929206848145, "sft_loss": 1.102782964706421, "step": 1000 }, { "epoch": 1.616488179430188, "eval_logits/chosen": 252.82716369628906, "eval_logits/rejected": 253.18104553222656, "eval_logps/chosen": -1.2153432369232178, "eval_logps/rejected": -1.446128010749817, "eval_loss": 1.2833058834075928, "eval_odds_ratio_loss": 0.6796271204948425, "eval_rewards/accuracies": 0.5618181824684143, "eval_rewards/chosen": -0.1215343102812767, "eval_rewards/margins": 0.023078490048646927, "eval_rewards/rejected": -0.14461281895637512, "eval_runtime": 221.4361, "eval_samples_per_second": 4.968, "eval_sft_loss": 1.2153432369232178, "eval_steps_per_second": 2.484, "step": 1000 }, { "epoch": 1.6326530612244898, "grad_norm": 1.7511672973632812, "learning_rate": 2.1497413764574673e-06, "logits/chosen": 253.8401336669922, "logits/rejected": 253.7457733154297, "logps/chosen": -1.2121939659118652, "logps/rejected": -1.4931201934814453, "loss": 1.2703, "odds_ratio_loss": 0.5808267593383789, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.12121939659118652, "rewards/margins": 0.028092628344893456, "rewards/rejected": -0.14931201934814453, "sft_loss": 1.2121939659118652, "step": 1010 }, { "epoch": 1.6488179430187917, "grad_norm": 2.1624321937561035, "learning_rate": 2.1078468757516395e-06, "logits/chosen": 252.7372589111328, "logits/rejected": 253.10342407226562, "logps/chosen": -1.1226885318756104, "logps/rejected": -1.302170991897583, "loss": 1.1845, "odds_ratio_loss": 0.6178861856460571, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.11226886510848999, "rewards/margins": 0.017948249354958534, "rewards/rejected": -0.13021712005138397, "sft_loss": 1.1226885318756104, "step": 1020 }, { "epoch": 1.6649828248130936, "grad_norm": 2.5826563835144043, "learning_rate": 2.0660649838698145e-06, "logits/chosen": 255.34326171875, "logits/rejected": 255.65859985351562, "logps/chosen": -1.1558864116668701, "logps/rejected": -1.3295384645462036, "loss": 1.2211, "odds_ratio_loss": 0.6525439023971558, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.11558864265680313, "rewards/margins": 0.01736520044505596, "rewards/rejected": -0.13295385241508484, "sft_loss": 1.1558864116668701, "step": 1030 }, { "epoch": 1.6811477066073954, "grad_norm": 1.975549340248108, "learning_rate": 2.0244076987011284e-06, "logits/chosen": 255.1981964111328, "logits/rejected": 255.7158966064453, "logps/chosen": -1.2127221822738647, "logps/rejected": -1.4685566425323486, "loss": 1.2727, "odds_ratio_loss": 0.6000550389289856, "rewards/accuracies": 0.625, "rewards/chosen": -0.1212722510099411, "rewards/margins": 0.025583425536751747, "rewards/rejected": -0.1468556672334671, "sft_loss": 1.2127221822738647, "step": 1040 }, { "epoch": 1.6973125884016973, "grad_norm": 2.224191904067993, "learning_rate": 1.982886982353251e-06, "logits/chosen": 252.6818389892578, "logits/rejected": 252.80859375, "logps/chosen": -1.193681240081787, "logps/rejected": -1.44672691822052, "loss": 1.2608, "odds_ratio_loss": 0.6715336441993713, "rewards/accuracies": 0.5, "rewards/chosen": -0.11936812102794647, "rewards/margins": 0.025304565206170082, "rewards/rejected": -0.144672691822052, "sft_loss": 1.193681240081787, "step": 1050 }, { "epoch": 1.7134774701959992, "grad_norm": 2.571403980255127, "learning_rate": 1.941514757717392e-06, "logits/chosen": 253.2911376953125, "logits/rejected": 254.0371551513672, "logps/chosen": -1.2021260261535645, "logps/rejected": -1.443331003189087, "loss": 1.2653, "odds_ratio_loss": 0.6321113705635071, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1202125996351242, "rewards/margins": 0.02412049099802971, "rewards/rejected": -0.1443330943584442, "sft_loss": 1.2021260261535645, "step": 1060 }, { "epoch": 1.729642351990301, "grad_norm": 4.061903476715088, "learning_rate": 1.9003029050445953e-06, "logits/chosen": 254.00650024414062, "logits/rejected": 254.38876342773438, "logps/chosen": -1.2242114543914795, "logps/rejected": -1.4163745641708374, "loss": 1.2891, "odds_ratio_loss": 0.6486276984214783, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.12242114543914795, "rewards/margins": 0.019216306507587433, "rewards/rejected": -0.14163745939731598, "sft_loss": 1.2242114543914795, "step": 1070 }, { "epoch": 1.745807233784603, "grad_norm": 2.371570110321045, "learning_rate": 1.8592632585342523e-06, "logits/chosen": 254.29745483398438, "logits/rejected": 254.67745971679688, "logps/chosen": -1.1612073183059692, "logps/rejected": -1.4247183799743652, "loss": 1.2246, "odds_ratio_loss": 0.6341406106948853, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.11612071841955185, "rewards/margins": 0.026351114735007286, "rewards/rejected": -0.1424718201160431, "sft_loss": 1.1612073183059692, "step": 1080 }, { "epoch": 1.7619721155789048, "grad_norm": 8.819137573242188, "learning_rate": 1.8184076029358527e-06, "logits/chosen": 253.06661987304688, "logits/rejected": 252.38119506835938, "logps/chosen": -1.161278486251831, "logps/rejected": -1.2557886838912964, "loss": 1.2286, "odds_ratio_loss": 0.6734786033630371, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1161278635263443, "rewards/margins": 0.009451002813875675, "rewards/rejected": -0.1255788505077362, "sft_loss": 1.161278486251831, "step": 1090 }, { "epoch": 1.7781369973732066, "grad_norm": 1.7281618118286133, "learning_rate": 1.7777476701649318e-06, "logits/chosen": 251.3661651611328, "logits/rejected": 252.11477661132812, "logps/chosen": -1.1861474514007568, "logps/rejected": -1.3936630487442017, "loss": 1.2518, "odds_ratio_loss": 0.6561599373817444, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.11861474812030792, "rewards/margins": 0.020751552656292915, "rewards/rejected": -0.1393662989139557, "sft_loss": 1.1861474514007568, "step": 1100 }, { "epoch": 1.7943018791675085, "grad_norm": 3.3538103103637695, "learning_rate": 1.7372951359341925e-06, "logits/chosen": 253.0167236328125, "logits/rejected": 253.53396606445312, "logps/chosen": -1.137481451034546, "logps/rejected": -1.2894176244735718, "loss": 1.2055, "odds_ratio_loss": 0.6805364489555359, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.11374815553426743, "rewards/margins": 0.01519359927624464, "rewards/rejected": -0.12894175946712494, "sft_loss": 1.137481451034546, "step": 1110 }, { "epoch": 1.8104667609618104, "grad_norm": 3.6225833892822266, "learning_rate": 1.6970616164007547e-06, "logits/chosen": 252.6470489501953, "logits/rejected": 252.9353485107422, "logps/chosen": -1.1084340810775757, "logps/rejected": -1.3413022756576538, "loss": 1.1728, "odds_ratio_loss": 0.6432042717933655, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11084340512752533, "rewards/margins": 0.02328682318329811, "rewards/rejected": -0.13413023948669434, "sft_loss": 1.1084340810775757, "step": 1120 }, { "epoch": 1.8266316427561122, "grad_norm": 4.332692623138428, "learning_rate": 1.6570586648305276e-06, "logits/chosen": 253.6255645751953, "logits/rejected": 253.76217651367188, "logps/chosen": -1.1925103664398193, "logps/rejected": -1.4342319965362549, "loss": 1.2579, "odds_ratio_loss": 0.6541949510574341, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.11925105005502701, "rewards/margins": 0.024172160774469376, "rewards/rejected": -0.14342321455478668, "sft_loss": 1.1925103664398193, "step": 1130 }, { "epoch": 1.842796524550414, "grad_norm": 3.238105535507202, "learning_rate": 1.6172977682806151e-06, "logits/chosen": 253.7568817138672, "logits/rejected": 254.8863525390625, "logps/chosen": -1.2200841903686523, "logps/rejected": -1.4592787027359009, "loss": 1.2837, "odds_ratio_loss": 0.6365170478820801, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.12200842052698135, "rewards/margins": 0.023919429630041122, "rewards/rejected": -0.14592786133289337, "sft_loss": 1.2200841903686523, "step": 1140 }, { "epoch": 1.858961406344716, "grad_norm": 2.5219290256500244, "learning_rate": 1.5777903443007586e-06, "logits/chosen": 253.631103515625, "logits/rejected": 253.82504272460938, "logps/chosen": -1.235215425491333, "logps/rejected": -1.4535129070281982, "loss": 1.3023, "odds_ratio_loss": 0.6708552241325378, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.12352155148983002, "rewards/margins": 0.021829739212989807, "rewards/rejected": -0.14535130560398102, "sft_loss": 1.235215425491333, "step": 1150 }, { "epoch": 1.8751262881390178, "grad_norm": 3.190958261489868, "learning_rate": 1.5385477376547226e-06, "logits/chosen": 255.1521759033203, "logits/rejected": 255.2525634765625, "logps/chosen": -1.229001760482788, "logps/rejected": -1.4793845415115356, "loss": 1.2891, "odds_ratio_loss": 0.601111888885498, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.12290020287036896, "rewards/margins": 0.02503824792802334, "rewards/rejected": -0.14793843030929565, "sft_loss": 1.229001760482788, "step": 1160 }, { "epoch": 1.89129116993332, "grad_norm": 2.217510461807251, "learning_rate": 1.4995812170625845e-06, "logits/chosen": 253.1751251220703, "logits/rejected": 253.72238159179688, "logps/chosen": -1.2252581119537354, "logps/rejected": -1.5921032428741455, "loss": 1.2851, "odds_ratio_loss": 0.598137617111206, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.12252581119537354, "rewards/margins": 0.036684513092041016, "rewards/rejected": -0.15921030938625336, "sft_loss": 1.2252581119537354, "step": 1170 }, { "epoch": 1.9074560517276218, "grad_norm": 3.0452287197113037, "learning_rate": 1.4609019719648666e-06, "logits/chosen": 254.07901000976562, "logits/rejected": 254.59957885742188, "logps/chosen": -1.2207356691360474, "logps/rejected": -1.4706141948699951, "loss": 1.2826, "odds_ratio_loss": 0.6183902025222778, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.12207356840372086, "rewards/margins": 0.024987850338220596, "rewards/rejected": -0.14706142246723175, "sft_loss": 1.2207356691360474, "step": 1180 }, { "epoch": 1.9236209335219236, "grad_norm": 4.679479122161865, "learning_rate": 1.42252110930943e-06, "logits/chosen": 252.7305450439453, "logits/rejected": 252.6374969482422, "logps/chosen": -1.064835786819458, "logps/rejected": -1.2910759449005127, "loss": 1.1283, "odds_ratio_loss": 0.6346200704574585, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.1064835786819458, "rewards/margins": 0.02262401580810547, "rewards/rejected": -0.12910759449005127, "sft_loss": 1.064835786819458, "step": 1190 }, { "epoch": 1.9397858153162255, "grad_norm": 3.286461353302002, "learning_rate": 1.3844496503620493e-06, "logits/chosen": 253.310302734375, "logits/rejected": 253.27023315429688, "logps/chosen": -1.2112998962402344, "logps/rejected": -1.3967139720916748, "loss": 1.2737, "odds_ratio_loss": 0.6242542862892151, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.12112998962402344, "rewards/margins": 0.01854141615331173, "rewards/rejected": -0.1396714150905609, "sft_loss": 1.2112998962402344, "step": 1200 }, { "epoch": 1.9559506971105274, "grad_norm": 2.8077545166015625, "learning_rate": 1.3466985275416081e-06, "logits/chosen": 254.2769775390625, "logits/rejected": 254.47360229492188, "logps/chosen": -1.2563018798828125, "logps/rejected": -1.4514508247375488, "loss": 1.3239, "odds_ratio_loss": 0.6757391691207886, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1256301999092102, "rewards/margins": 0.019514882937073708, "rewards/rejected": -0.14514507353305817, "sft_loss": 1.2563018798828125, "step": 1210 }, { "epoch": 1.9721155789048292, "grad_norm": 2.275397777557373, "learning_rate": 1.309278581280791e-06, "logits/chosen": 253.5750274658203, "logits/rejected": 253.99935913085938, "logps/chosen": -1.1356334686279297, "logps/rejected": -1.4314597845077515, "loss": 1.1934, "odds_ratio_loss": 0.5772610902786255, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11356334388256073, "rewards/margins": 0.029582645744085312, "rewards/rejected": -0.14314597845077515, "sft_loss": 1.1356334686279297, "step": 1220 }, { "epoch": 1.9882804606991311, "grad_norm": 1.454276204109192, "learning_rate": 1.272200556913199e-06, "logits/chosen": 254.544677734375, "logits/rejected": 254.67251586914062, "logps/chosen": -1.1884077787399292, "logps/rejected": -1.395115613937378, "loss": 1.2599, "odds_ratio_loss": 0.7147720456123352, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1188407689332962, "rewards/margins": 0.020670795813202858, "rewards/rejected": -0.1395115852355957, "sft_loss": 1.1884077787399292, "step": 1230 }, { "epoch": 2.004445342493433, "grad_norm": 3.6475422382354736, "learning_rate": 1.2354751015877698e-06, "logits/chosen": 252.74777221679688, "logits/rejected": 253.66641235351562, "logps/chosen": -1.1167339086532593, "logps/rejected": -1.4450454711914062, "loss": 1.1791, "odds_ratio_loss": 0.6237870454788208, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.11167339980602264, "rewards/margins": 0.03283114731311798, "rewards/rejected": -0.14450454711914062, "sft_loss": 1.1167339086532593, "step": 1240 }, { "epoch": 2.020610224287735, "grad_norm": 3.52698016166687, "learning_rate": 1.1991127612113945e-06, "logits/chosen": 254.6741943359375, "logits/rejected": 254.9825897216797, "logps/chosen": -1.1792643070220947, "logps/rejected": -1.4326034784317017, "loss": 1.2387, "odds_ratio_loss": 0.5942111611366272, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.11792641878128052, "rewards/margins": 0.02533392235636711, "rewards/rejected": -0.14326035976409912, "sft_loss": 1.1792643070220947, "step": 1250 }, { "epoch": 2.036775106082037, "grad_norm": 3.579160690307617, "learning_rate": 1.1631239774206035e-06, "logits/chosen": 253.5153350830078, "logits/rejected": 253.659912109375, "logps/chosen": -1.1673438549041748, "logps/rejected": -1.4458467960357666, "loss": 1.2314, "odds_ratio_loss": 0.6410170793533325, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.11673440039157867, "rewards/margins": 0.027850273996591568, "rewards/rejected": -0.14458468556404114, "sft_loss": 1.1673438549041748, "step": 1260 }, { "epoch": 2.052939987876339, "grad_norm": 3.0812463760375977, "learning_rate": 1.1275190845831978e-06, "logits/chosen": 254.5985870361328, "logits/rejected": 254.2525177001953, "logps/chosen": -1.1342524290084839, "logps/rejected": -1.3948237895965576, "loss": 1.1925, "odds_ratio_loss": 0.5824798345565796, "rewards/accuracies": 0.625, "rewards/chosen": -0.11342523247003555, "rewards/margins": 0.026057133451104164, "rewards/rejected": -0.13948237895965576, "sft_loss": 1.1342524290084839, "step": 1270 }, { "epoch": 2.0691048696706407, "grad_norm": 2.4549710750579834, "learning_rate": 1.0923083068306778e-06, "logits/chosen": 254.7982635498047, "logits/rejected": 255.1488494873047, "logps/chosen": -1.1482160091400146, "logps/rejected": -1.4850049018859863, "loss": 1.2055, "odds_ratio_loss": 0.5724589824676514, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.11482159048318863, "rewards/margins": 0.033678896725177765, "rewards/rejected": -0.1485004872083664, "sft_loss": 1.1482160091400146, "step": 1280 }, { "epoch": 2.0852697514649425, "grad_norm": 1.778605580329895, "learning_rate": 1.0575017551223348e-06, "logits/chosen": 253.03524780273438, "logits/rejected": 253.53366088867188, "logps/chosen": -1.087461233139038, "logps/rejected": -1.321656584739685, "loss": 1.1524, "odds_ratio_loss": 0.6496065855026245, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.10874611139297485, "rewards/margins": 0.023419544100761414, "rewards/rejected": -0.13216565549373627, "sft_loss": 1.087461233139038, "step": 1290 }, { "epoch": 2.1014346332592444, "grad_norm": 1.522445797920227, "learning_rate": 1.023109424341833e-06, "logits/chosen": 254.5054168701172, "logits/rejected": 255.06100463867188, "logps/chosen": -1.2142359018325806, "logps/rejected": -1.448194146156311, "loss": 1.2781, "odds_ratio_loss": 0.6386287808418274, "rewards/accuracies": 0.625, "rewards/chosen": -0.12142357975244522, "rewards/margins": 0.023395827040076256, "rewards/rejected": -0.14481940865516663, "sft_loss": 1.2142359018325806, "step": 1300 }, { "epoch": 2.1175995150535463, "grad_norm": 2.577580690383911, "learning_rate": 9.891411904271273e-07, "logits/chosen": 254.14779663085938, "logits/rejected": 254.121826171875, "logps/chosen": -1.100303292274475, "logps/rejected": -1.3276548385620117, "loss": 1.1632, "odds_ratio_loss": 0.628852128982544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11003033071756363, "rewards/margins": 0.02273516170680523, "rewards/rejected": -0.1327655017375946, "sft_loss": 1.100303292274475, "step": 1310 }, { "epoch": 2.133764396847848, "grad_norm": 1.5676363706588745, "learning_rate": 9.556068075345363e-07, "logits/chosen": 255.0603485107422, "logits/rejected": 255.1541748046875, "logps/chosen": -1.1494947671890259, "logps/rejected": -1.3283547163009644, "loss": 1.2112, "odds_ratio_loss": 0.6172733306884766, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.11494947969913483, "rewards/margins": 0.017885997891426086, "rewards/rejected": -0.1328354775905609, "sft_loss": 1.1494947671890259, "step": 1320 }, { "epoch": 2.14992927864215, "grad_norm": 1.964956521987915, "learning_rate": 9.225159052377838e-07, "logits/chosen": 254.16183471679688, "logits/rejected": 254.3532257080078, "logps/chosen": -1.1823852062225342, "logps/rejected": -1.4285701513290405, "loss": 1.2468, "odds_ratio_loss": 0.6443654894828796, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.11823852360248566, "rewards/margins": 0.024618491530418396, "rewards/rejected": -0.14285701513290405, "sft_loss": 1.1823852062225342, "step": 1330 }, { "epoch": 2.166094160436452, "grad_norm": 4.320827484130859, "learning_rate": 8.898779857628184e-07, "logits/chosen": 253.94775390625, "logits/rejected": 253.83328247070312, "logps/chosen": -1.0813744068145752, "logps/rejected": -1.289052963256836, "loss": 1.1442, "odds_ratio_loss": 0.6285432577133179, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.10813745111227036, "rewards/margins": 0.020767847076058388, "rewards/rejected": -0.1289052963256836, "sft_loss": 1.0813744068145752, "step": 1340 }, { "epoch": 2.1822590422307537, "grad_norm": 1.9166721105575562, "learning_rate": 8.577024212591975e-07, "logits/chosen": 255.42715454101562, "logits/rejected": 255.6570281982422, "logps/chosen": -1.2112232446670532, "logps/rejected": -1.4022705554962158, "loss": 1.2754, "odds_ratio_loss": 0.6421025991439819, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.1211223155260086, "rewards/margins": 0.019104719161987305, "rewards/rejected": -0.1402270495891571, "sft_loss": 1.2112232446670532, "step": 1350 }, { "epoch": 2.1984239240250556, "grad_norm": 2.231593370437622, "learning_rate": 8.259984511088276e-07, "logits/chosen": 252.95217895507812, "logits/rejected": 253.24972534179688, "logps/chosen": -1.1978521347045898, "logps/rejected": -1.4085915088653564, "loss": 1.2643, "odds_ratio_loss": 0.6643570065498352, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11978521198034286, "rewards/margins": 0.021073944866657257, "rewards/rejected": -0.14085917174816132, "sft_loss": 1.1978521347045898, "step": 1360 }, { "epoch": 2.2145888058193575, "grad_norm": 1.891438364982605, "learning_rate": 7.947751792728237e-07, "logits/chosen": 252.89163208007812, "logits/rejected": 252.9368133544922, "logps/chosen": -1.1386100053787231, "logps/rejected": -1.3931357860565186, "loss": 1.2001, "odds_ratio_loss": 0.6149393320083618, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.11386100947856903, "rewards/margins": 0.02545258030295372, "rewards/rejected": -0.13931360840797424, "sft_loss": 1.1386100053787231, "step": 1370 }, { "epoch": 2.2307536876136593, "grad_norm": 11.893668174743652, "learning_rate": 7.640415716772626e-07, "logits/chosen": 254.87893676757812, "logits/rejected": 254.9901123046875, "logps/chosen": -1.2301312685012817, "logps/rejected": -1.4715359210968018, "loss": 1.2969, "odds_ratio_loss": 0.6676316857337952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12301311641931534, "rewards/margins": 0.02414046786725521, "rewards/rejected": -0.1471536010503769, "sft_loss": 1.2301312685012817, "step": 1380 }, { "epoch": 2.246918569407961, "grad_norm": 1.648759365081787, "learning_rate": 7.338064536385722e-07, "logits/chosen": 253.27536010742188, "logits/rejected": 253.39450073242188, "logps/chosen": -1.172890543937683, "logps/rejected": -1.4573774337768555, "loss": 1.2306, "odds_ratio_loss": 0.5775946974754333, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11728904396295547, "rewards/margins": 0.02844870649278164, "rewards/rejected": -0.14573773741722107, "sft_loss": 1.172890543937683, "step": 1390 }, { "epoch": 2.263083451202263, "grad_norm": 2.644590377807617, "learning_rate": 7.040785073292883e-07, "logits/chosen": 254.41006469726562, "logits/rejected": 254.5663299560547, "logps/chosen": -1.243436336517334, "logps/rejected": -1.455594778060913, "loss": 1.3115, "odds_ratio_loss": 0.6802859902381897, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.12434364855289459, "rewards/margins": 0.021215861663222313, "rewards/rejected": -0.14555948972702026, "sft_loss": 1.243436336517334, "step": 1400 }, { "epoch": 2.279248332996565, "grad_norm": 3.2420878410339355, "learning_rate": 6.748662692849297e-07, "logits/chosen": 253.18417358398438, "logits/rejected": 254.11279296875, "logps/chosen": -1.1471028327941895, "logps/rejected": -1.5119264125823975, "loss": 1.2055, "odds_ratio_loss": 0.5840214490890503, "rewards/accuracies": 0.625, "rewards/chosen": -0.11471028625965118, "rewards/margins": 0.03648235648870468, "rewards/rejected": -0.15119265019893646, "sft_loss": 1.1471028327941895, "step": 1410 }, { "epoch": 2.295413214790867, "grad_norm": 4.394900798797607, "learning_rate": 6.46178127952686e-07, "logits/chosen": 254.48062133789062, "logits/rejected": 254.9115753173828, "logps/chosen": -1.1684550046920776, "logps/rejected": -1.38850998878479, "loss": 1.2286, "odds_ratio_loss": 0.601581871509552, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1168455109000206, "rewards/margins": 0.022005509585142136, "rewards/rejected": -0.13885101675987244, "sft_loss": 1.1684550046920776, "step": 1420 }, { "epoch": 2.3115780965851687, "grad_norm": 2.162309169769287, "learning_rate": 6.180223212826289e-07, "logits/chosen": 253.58633422851562, "logits/rejected": 253.8734588623047, "logps/chosen": -1.1496318578720093, "logps/rejected": -1.364654779434204, "loss": 1.213, "odds_ratio_loss": 0.633824348449707, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.11496319621801376, "rewards/margins": 0.021502288058400154, "rewards/rejected": -0.13646547496318817, "sft_loss": 1.1496318578720093, "step": 1430 }, { "epoch": 2.3277429783794705, "grad_norm": 1.522935152053833, "learning_rate": 5.904069343621443e-07, "logits/chosen": 255.19082641601562, "logits/rejected": 255.11474609375, "logps/chosen": -1.1330249309539795, "logps/rejected": -1.386264443397522, "loss": 1.195, "odds_ratio_loss": 0.6193984746932983, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11330248415470123, "rewards/margins": 0.025323981419205666, "rewards/rejected": -0.13862647116184235, "sft_loss": 1.1330249309539795, "step": 1440 }, { "epoch": 2.3439078601737724, "grad_norm": 2.982042074203491, "learning_rate": 5.633398970942544e-07, "logits/chosen": 254.9903564453125, "logits/rejected": 255.1248321533203, "logps/chosen": -1.1471365690231323, "logps/rejected": -1.3323371410369873, "loss": 1.2137, "odds_ratio_loss": 0.6657688617706299, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.114713653922081, "rewards/margins": 0.01852005161345005, "rewards/rejected": -0.1332337111234665, "sft_loss": 1.1471365690231323, "step": 1450 }, { "epoch": 2.3600727419680743, "grad_norm": 3.2461607456207275, "learning_rate": 5.368289819205069e-07, "logits/chosen": 254.27847290039062, "logits/rejected": 255.0779571533203, "logps/chosen": -1.11297607421875, "logps/rejected": -1.3122992515563965, "loss": 1.1805, "odds_ratio_loss": 0.6752298474311829, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1112975925207138, "rewards/margins": 0.019932324066758156, "rewards/rejected": -0.1312299221754074, "sft_loss": 1.11297607421875, "step": 1460 }, { "epoch": 2.376237623762376, "grad_norm": 2.7223591804504395, "learning_rate": 5.108818015890785e-07, "logits/chosen": 255.47216796875, "logits/rejected": 255.6534423828125, "logps/chosen": -1.2367959022521973, "logps/rejected": -1.4369171857833862, "loss": 1.3029, "odds_ratio_loss": 0.6609222888946533, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.12367959320545197, "rewards/margins": 0.020012129098176956, "rewards/rejected": -0.14369171857833862, "sft_loss": 1.2367959022521973, "step": 1470 }, { "epoch": 2.392402505556678, "grad_norm": 2.912327527999878, "learning_rate": 4.855058069687291e-07, "logits/chosen": 253.0759735107422, "logits/rejected": 253.6752471923828, "logps/chosen": -1.111169695854187, "logps/rejected": -1.4328919649124146, "loss": 1.1697, "odds_ratio_loss": 0.5851289629936218, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.11111698299646378, "rewards/margins": 0.03217221051454544, "rewards/rejected": -0.1432892084121704, "sft_loss": 1.111169695854187, "step": 1480 }, { "epoch": 2.40856738735098, "grad_norm": 2.995020627975464, "learning_rate": 4.607082849092523e-07, "logits/chosen": 253.9425811767578, "logits/rejected": 254.0526580810547, "logps/chosen": -1.2607060670852661, "logps/rejected": -1.4026780128479004, "loss": 1.3291, "odds_ratio_loss": 0.6835006475448608, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12607058882713318, "rewards/margins": 0.014197212643921375, "rewards/rejected": -0.14026781916618347, "sft_loss": 1.2607060670852661, "step": 1490 }, { "epoch": 2.4247322691452817, "grad_norm": 3.760835886001587, "learning_rate": 4.3649635614901405e-07, "logits/chosen": 254.07601928710938, "logits/rejected": 254.50985717773438, "logps/chosen": -1.1233417987823486, "logps/rejected": -1.2859314680099487, "loss": 1.1874, "odds_ratio_loss": 0.6403074860572815, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.11233416944742203, "rewards/margins": 0.01625899039208889, "rewards/rejected": -0.12859316170215607, "sft_loss": 1.1233417987823486, "step": 1500 }, { "epoch": 2.4247322691452817, "eval_logits/chosen": 253.6037139892578, "eval_logits/rejected": 253.95994567871094, "eval_logps/chosen": -1.1982638835906982, "eval_logps/rejected": -1.4377323389053345, "eval_loss": 1.265723466873169, "eval_odds_ratio_loss": 0.6745957732200623, "eval_rewards/accuracies": 0.5699999928474426, "eval_rewards/chosen": -0.11982638388872147, "eval_rewards/margins": 0.023946860805153847, "eval_rewards/rejected": -0.14377322793006897, "eval_runtime": 221.0804, "eval_samples_per_second": 4.976, "eval_sft_loss": 1.1982638835906982, "eval_steps_per_second": 2.488, "step": 1500 }, { "epoch": 2.4408971509395836, "grad_norm": 2.074381113052368, "learning_rate": 4.128769732701973e-07, "logits/chosen": 254.7397918701172, "logits/rejected": 254.7943572998047, "logps/chosen": -1.1791332960128784, "logps/rejected": -1.4214551448822021, "loss": 1.2445, "odds_ratio_loss": 0.6540807485580444, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.11791334301233292, "rewards/margins": 0.02423218823969364, "rewards/rejected": -0.14214551448822021, "sft_loss": 1.1791332960128784, "step": 1510 }, { "epoch": 2.4570620327338855, "grad_norm": 3.3741965293884277, "learning_rate": 3.8985691870233046e-07, "logits/chosen": 254.44534301757812, "logits/rejected": 254.93307495117188, "logps/chosen": -1.209789514541626, "logps/rejected": -1.4953523874282837, "loss": 1.2744, "odds_ratio_loss": 0.645904004573822, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.1209789514541626, "rewards/margins": 0.02855629101395607, "rewards/rejected": -0.14953525364398956, "sft_loss": 1.209789514541626, "step": 1520 }, { "epoch": 2.4732269145281873, "grad_norm": 4.19984245300293, "learning_rate": 3.6744280277467904e-07, "logits/chosen": 253.19186401367188, "logits/rejected": 253.959228515625, "logps/chosen": -1.1898527145385742, "logps/rejected": -1.418869972229004, "loss": 1.2551, "odds_ratio_loss": 0.6521813273429871, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.11898528039455414, "rewards/margins": 0.022901728749275208, "rewards/rejected": -0.14188699424266815, "sft_loss": 1.1898527145385742, "step": 1530 }, { "epoch": 2.489391796322489, "grad_norm": 3.145732879638672, "learning_rate": 3.456410618180503e-07, "logits/chosen": 252.92282104492188, "logits/rejected": 253.6022491455078, "logps/chosen": -1.0681793689727783, "logps/rejected": -1.446299433708191, "loss": 1.1295, "odds_ratio_loss": 0.6130428910255432, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10681793838739395, "rewards/margins": 0.0378120057284832, "rewards/rejected": -0.14462995529174805, "sft_loss": 1.0681793689727783, "step": 1540 }, { "epoch": 2.5055566781167915, "grad_norm": 2.4530389308929443, "learning_rate": 3.244579563165753e-07, "logits/chosen": 252.8817138671875, "logits/rejected": 253.0416259765625, "logps/chosen": -1.1208980083465576, "logps/rejected": -1.4661897420883179, "loss": 1.1805, "odds_ratio_loss": 0.5958081483840942, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.11208979785442352, "rewards/margins": 0.034529171884059906, "rewards/rejected": -0.14661899209022522, "sft_loss": 1.1208980083465576, "step": 1550 }, { "epoch": 2.521721559911093, "grad_norm": 1.9151691198349, "learning_rate": 3.038995691099697e-07, "logits/chosen": 252.8979949951172, "logits/rejected": 253.34262084960938, "logps/chosen": -1.2310686111450195, "logps/rejected": -1.493896484375, "loss": 1.2954, "odds_ratio_loss": 0.6432778239250183, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.12310687452554703, "rewards/margins": 0.02628278359770775, "rewards/rejected": -0.14938965439796448, "sft_loss": 1.2310686111450195, "step": 1560 }, { "epoch": 2.5378864417053952, "grad_norm": 4.0270304679870605, "learning_rate": 2.839718036468192e-07, "logits/chosen": 255.1189422607422, "logits/rejected": 255.6085662841797, "logps/chosen": -1.2376972436904907, "logps/rejected": -1.4336402416229248, "loss": 1.306, "odds_ratio_loss": 0.6828280091285706, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.12376973778009415, "rewards/margins": 0.019594285637140274, "rewards/rejected": -0.14336401224136353, "sft_loss": 1.2376972436904907, "step": 1570 }, { "epoch": 2.5540513234996967, "grad_norm": 2.8464980125427246, "learning_rate": 2.646803822893723e-07, "logits/chosen": 254.5944366455078, "logits/rejected": 254.6045379638672, "logps/chosen": -1.1911519765853882, "logps/rejected": -1.4335906505584717, "loss": 1.255, "odds_ratio_loss": 0.6387141346931458, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1191151887178421, "rewards/margins": 0.024243878200650215, "rewards/rejected": -0.14335909485816956, "sft_loss": 1.1911519765853882, "step": 1580 }, { "epoch": 2.570216205293999, "grad_norm": 2.3468587398529053, "learning_rate": 2.460308446703341e-07, "logits/chosen": 255.0273895263672, "logits/rejected": 255.33963012695312, "logps/chosen": -1.1678217649459839, "logps/rejected": -1.3280802965164185, "loss": 1.2319, "odds_ratio_loss": 0.6405949592590332, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11678217351436615, "rewards/margins": 0.016025854274630547, "rewards/rejected": -0.13280804455280304, "sft_loss": 1.1678217649459839, "step": 1590 }, { "epoch": 2.5863810870883004, "grad_norm": 2.52286434173584, "learning_rate": 2.2802854610213143e-07, "logits/chosen": 253.905517578125, "logits/rejected": 254.2148895263672, "logps/chosen": -1.0991406440734863, "logps/rejected": -1.5185635089874268, "loss": 1.1551, "odds_ratio_loss": 0.5599113702774048, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10991404950618744, "rewards/margins": 0.041942298412323, "rewards/rejected": -0.15185634791851044, "sft_loss": 1.0991406440734863, "step": 1600 }, { "epoch": 2.6025459688826027, "grad_norm": 6.791391849517822, "learning_rate": 2.106786560391072e-07, "logits/chosen": 253.763671875, "logits/rejected": 253.8026580810547, "logps/chosen": -1.2003083229064941, "logps/rejected": -1.4066941738128662, "loss": 1.2644, "odds_ratio_loss": 0.6404808163642883, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12003083527088165, "rewards/margins": 0.020638594403862953, "rewards/rejected": -0.14066943526268005, "sft_loss": 1.2003083229064941, "step": 1610 }, { "epoch": 2.6187108506769046, "grad_norm": 2.421247720718384, "learning_rate": 1.9398615659308255e-07, "logits/chosen": 254.5044403076172, "logits/rejected": 255.14804077148438, "logps/chosen": -1.145989179611206, "logps/rejected": -1.2900917530059814, "loss": 1.2124, "odds_ratio_loss": 0.663682222366333, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.11459891498088837, "rewards/margins": 0.014410244300961494, "rewards/rejected": -0.1290091574192047, "sft_loss": 1.145989179611206, "step": 1620 }, { "epoch": 2.6348757324712064, "grad_norm": 2.5792062282562256, "learning_rate": 1.7795584110272184e-07, "logits/chosen": 254.8577117919922, "logits/rejected": 254.72903442382812, "logps/chosen": -1.1733181476593018, "logps/rejected": -1.3872268199920654, "loss": 1.2372, "odds_ratio_loss": 0.6392764449119568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11733181774616241, "rewards/margins": 0.021390849724411964, "rewards/rejected": -0.13872265815734863, "sft_loss": 1.1733181476593018, "step": 1630 }, { "epoch": 2.6510406142655083, "grad_norm": 4.67177152633667, "learning_rate": 1.6259231275709636e-07, "logits/chosen": 254.6771697998047, "logits/rejected": 254.81906127929688, "logps/chosen": -1.1654198169708252, "logps/rejected": -1.331291913986206, "loss": 1.2329, "odds_ratio_loss": 0.6746650338172913, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.11654196679592133, "rewards/margins": 0.016587218269705772, "rewards/rejected": -0.13312919437885284, "sft_loss": 1.1654198169708252, "step": 1640 }, { "epoch": 2.66720549605981, "grad_norm": 2.415234327316284, "learning_rate": 1.478999832738548e-07, "logits/chosen": 253.84658813476562, "logits/rejected": 254.2846221923828, "logps/chosen": -1.1557317972183228, "logps/rejected": -1.4208415746688843, "loss": 1.2199, "odds_ratio_loss": 0.6416669487953186, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.11557319015264511, "rewards/margins": 0.026510965079069138, "rewards/rejected": -0.14208415150642395, "sft_loss": 1.1557317972183228, "step": 1650 }, { "epoch": 2.683370377854112, "grad_norm": 2.4805846214294434, "learning_rate": 1.338830716323769e-07, "logits/chosen": 253.04605102539062, "logits/rejected": 253.3199005126953, "logps/chosen": -1.1211105585098267, "logps/rejected": -1.3149446249008179, "loss": 1.1846, "odds_ratio_loss": 0.6351101994514465, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.11211104691028595, "rewards/margins": 0.019383419305086136, "rewards/rejected": -0.13149447739124298, "sft_loss": 1.1211105585098267, "step": 1660 }, { "epoch": 2.699535259648414, "grad_norm": 4.832085609436035, "learning_rate": 1.205456028622723e-07, "logits/chosen": 254.3978729248047, "logits/rejected": 254.5135498046875, "logps/chosen": -1.0987465381622314, "logps/rejected": -1.4824903011322021, "loss": 1.157, "odds_ratio_loss": 0.582126259803772, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1098746508359909, "rewards/margins": 0.038374386727809906, "rewards/rejected": -0.1482490599155426, "sft_loss": 1.0987465381622314, "step": 1670 }, { "epoch": 2.7157001414427158, "grad_norm": 1.8406291007995605, "learning_rate": 1.0789140688756805e-07, "logits/chosen": 254.9768524169922, "logits/rejected": 255.4145965576172, "logps/chosen": -1.1487500667572021, "logps/rejected": -1.416771650314331, "loss": 1.2068, "odds_ratio_loss": 0.5807241201400757, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11487500369548798, "rewards/margins": 0.026802152395248413, "rewards/rejected": -0.14167717099189758, "sft_loss": 1.1487500667572021, "step": 1680 }, { "epoch": 2.7318650232370176, "grad_norm": 9.125133514404297, "learning_rate": 9.592411742693098e-08, "logits/chosen": 253.77490234375, "logits/rejected": 253.99368286132812, "logps/chosen": -1.2172834873199463, "logps/rejected": -1.371734857559204, "loss": 1.2881, "odds_ratio_loss": 0.7082632780075073, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.12172834575176239, "rewards/margins": 0.015445133671164513, "rewards/rejected": -0.13717348873615265, "sft_loss": 1.2172834873199463, "step": 1690 }, { "epoch": 2.7480299050313195, "grad_norm": 1.7780921459197998, "learning_rate": 8.464717095022168e-08, "logits/chosen": 251.9024658203125, "logits/rejected": 253.18222045898438, "logps/chosen": -1.1575608253479004, "logps/rejected": -1.4255679845809937, "loss": 1.2219, "odds_ratio_loss": 0.6432427167892456, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.115756094455719, "rewards/margins": 0.026800716295838356, "rewards/rejected": -0.1425568014383316, "sft_loss": 1.1575608253479004, "step": 1700 }, { "epoch": 2.7641947868256214, "grad_norm": 2.707679271697998, "learning_rate": 7.406380569169841e-08, "logits/chosen": 254.63046264648438, "logits/rejected": 255.2324676513672, "logps/chosen": -1.21084725856781, "logps/rejected": -1.339179277420044, "loss": 1.278, "odds_ratio_loss": 0.6712638139724731, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12108473479747772, "rewards/margins": 0.012833192944526672, "rewards/rejected": -0.1339179426431656, "sft_loss": 1.21084725856781, "step": 1710 }, { "epoch": 2.7803596686199232, "grad_norm": 9.321467399597168, "learning_rate": 6.417706072013808e-08, "logits/chosen": 255.14419555664062, "logits/rejected": 255.5975341796875, "logps/chosen": -1.1439273357391357, "logps/rejected": -1.3444888591766357, "loss": 1.2084, "odds_ratio_loss": 0.644874095916748, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.1143927350640297, "rewards/margins": 0.020056165754795074, "rewards/rejected": -0.13444891571998596, "sft_loss": 1.1439273357391357, "step": 1720 }, { "epoch": 2.796524550414225, "grad_norm": 4.651902675628662, "learning_rate": 5.498977506615294e-08, "logits/chosen": 254.2846221923828, "logits/rejected": 255.0900421142578, "logps/chosen": -1.2078436613082886, "logps/rejected": -1.3565863370895386, "loss": 1.2795, "odds_ratio_loss": 0.7164067029953003, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12078437954187393, "rewards/margins": 0.014874264597892761, "rewards/rejected": -0.1356586515903473, "sft_loss": 1.2078436613082886, "step": 1730 }, { "epoch": 2.812689432208527, "grad_norm": 2.109281301498413, "learning_rate": 4.6504586906947756e-08, "logits/chosen": 255.70327758789062, "logits/rejected": 255.75326538085938, "logps/chosen": -1.204192876815796, "logps/rejected": -1.3763076066970825, "loss": 1.2661, "odds_ratio_loss": 0.6194810271263123, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.12041930109262466, "rewards/margins": 0.01721145212650299, "rewards/rejected": -0.13763076066970825, "sft_loss": 1.204192876815796, "step": 1740 }, { "epoch": 2.828854314002829, "grad_norm": 11.28996467590332, "learning_rate": 3.8723932808754914e-08, "logits/chosen": 254.12826538085938, "logits/rejected": 254.1260528564453, "logps/chosen": -1.2869278192520142, "logps/rejected": -1.4129821062088013, "loss": 1.3563, "odds_ratio_loss": 0.6939128637313843, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.12869277596473694, "rewards/margins": 0.012605440802872181, "rewards/rejected": -0.14129820466041565, "sft_loss": 1.2869278192520142, "step": 1750 }, { "epoch": 2.8450191957971307, "grad_norm": 4.297213077545166, "learning_rate": 3.1650047027158014e-08, "logits/chosen": 254.026123046875, "logits/rejected": 254.0368194580078, "logps/chosen": -1.1498368978500366, "logps/rejected": -1.3505172729492188, "loss": 1.212, "odds_ratio_loss": 0.6218072175979614, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.11498367786407471, "rewards/margins": 0.020068055018782616, "rewards/rejected": -0.13505175709724426, "sft_loss": 1.1498368978500366, "step": 1760 }, { "epoch": 2.8611840775914326, "grad_norm": 3.5106639862060547, "learning_rate": 2.5284960865517848e-08, "logits/chosen": 253.19677734375, "logits/rejected": 253.49380493164062, "logps/chosen": -1.0654290914535522, "logps/rejected": -1.4208238124847412, "loss": 1.1222, "odds_ratio_loss": 0.5680567026138306, "rewards/accuracies": 0.625, "rewards/chosen": -0.1065429076552391, "rewards/margins": 0.035539474338293076, "rewards/rejected": -0.14208237826824188, "sft_loss": 1.0654290914535522, "step": 1770 }, { "epoch": 2.8773489593857344, "grad_norm": 1.9635688066482544, "learning_rate": 1.9630502091670388e-08, "logits/chosen": 254.31576538085938, "logits/rejected": 254.73184204101562, "logps/chosen": -1.16495680809021, "logps/rejected": -1.438716173171997, "loss": 1.2222, "odds_ratio_loss": 0.5726233720779419, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11649568378925323, "rewards/margins": 0.027375921607017517, "rewards/rejected": -0.14387162029743195, "sft_loss": 1.16495680809021, "step": 1780 }, { "epoch": 2.8935138411800363, "grad_norm": 7.4126458168029785, "learning_rate": 1.4688294413074677e-08, "logits/chosen": 253.7817840576172, "logits/rejected": 254.4253692626953, "logps/chosen": -1.081469178199768, "logps/rejected": -1.3936357498168945, "loss": 1.1414, "odds_ratio_loss": 0.5996376872062683, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.10814690589904785, "rewards/margins": 0.03121664747595787, "rewards/rejected": -0.13936355710029602, "sft_loss": 1.081469178199768, "step": 1790 }, { "epoch": 2.909678722974338, "grad_norm": 1.8516889810562134, "learning_rate": 1.0459757010556626e-08, "logits/chosen": 252.57345581054688, "logits/rejected": 252.3190155029297, "logps/chosen": -1.173514723777771, "logps/rejected": -1.303662657737732, "loss": 1.2401, "odds_ratio_loss": 0.6658231019973755, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1173514723777771, "rewards/margins": 0.013014810159802437, "rewards/rejected": -0.13036629557609558, "sft_loss": 1.173514723777771, "step": 1800 }, { "epoch": 2.92584360476864, "grad_norm": 7.74896764755249, "learning_rate": 6.94610413078306e-09, "logits/chosen": 253.01602172851562, "logits/rejected": 253.71841430664062, "logps/chosen": -1.2020865678787231, "logps/rejected": -1.5033478736877441, "loss": 1.2671, "odds_ratio_loss": 0.6497219204902649, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12020864337682724, "rewards/margins": 0.030126124620437622, "rewards/rejected": -0.15033479034900665, "sft_loss": 1.2020865678787231, "step": 1810 }, { "epoch": 2.942008486562942, "grad_norm": 2.0265376567840576, "learning_rate": 4.14834473758563e-09, "logits/chosen": 252.50656127929688, "logits/rejected": 252.5145721435547, "logps/chosen": -1.0974245071411133, "logps/rejected": -1.4048916101455688, "loss": 1.1558, "odds_ratio_loss": 0.5838974714279175, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10974244773387909, "rewards/margins": 0.030746713280677795, "rewards/rejected": -0.14048916101455688, "sft_loss": 1.0974245071411133, "step": 1820 }, { "epoch": 2.9581733683572438, "grad_norm": 2.291332721710205, "learning_rate": 2.067282222230349e-09, "logits/chosen": 254.31192016601562, "logits/rejected": 254.5823974609375, "logps/chosen": -1.1228351593017578, "logps/rejected": -1.4580986499786377, "loss": 1.1841, "odds_ratio_loss": 0.6124657392501831, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11228351294994354, "rewards/margins": 0.03352636098861694, "rewards/rejected": -0.1458098590373993, "sft_loss": 1.1228351593017578, "step": 1830 }, { "epoch": 2.9743382501515456, "grad_norm": 8.377201080322266, "learning_rate": 7.035141727212979e-10, "logits/chosen": 252.5790252685547, "logits/rejected": 253.33993530273438, "logps/chosen": -1.062239646911621, "logps/rejected": -1.3180148601531982, "loss": 1.1223, "odds_ratio_loss": 0.6001896858215332, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10622396320104599, "rewards/margins": 0.025577524676918983, "rewards/rejected": -0.13180148601531982, "sft_loss": 1.062239646911621, "step": 1840 }, { "epoch": 2.9905031319458475, "grad_norm": 5.447793006896973, "learning_rate": 5.743220219761592e-11, "logits/chosen": 254.24691772460938, "logits/rejected": 254.78369140625, "logps/chosen": -1.195462942123413, "logps/rejected": -1.4066945314407349, "loss": 1.2627, "odds_ratio_loss": 0.6723325252532959, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1195463091135025, "rewards/margins": 0.021123168990015984, "rewards/rejected": -0.14066946506500244, "sft_loss": 1.195462942123413, "step": 1850 }, { "epoch": 2.9969690846635686, "step": 1854, "total_flos": 2.1935611788745114e+18, "train_loss": 1.3469306265266197, "train_runtime": 24131.5713, "train_samples_per_second": 1.231, "train_steps_per_second": 0.077 } ], "logging_steps": 10, "max_steps": 1854, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.1935611788745114e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }