diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3229 @@ +{ + "best_metric": 1.265723466873169, + "best_model_checkpoint": "saves/Gemma-7B-It/lora/orpo-salt/checkpoint-1500", + "epoch": 2.9969690846635686, + "eval_steps": 500, + "global_step": 1854, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01616488179430188, + "grad_norm": 4.377878189086914, + "learning_rate": 4.999648198770648e-06, + "logits/chosen": 209.9345245361328, + "logits/rejected": 210.6967315673828, + "logps/chosen": -2.4765946865081787, + "logps/rejected": -2.9186055660247803, + "loss": 2.5449, + "odds_ratio_loss": 0.6828715205192566, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.24765947461128235, + "rewards/margins": 0.04420109838247299, + "rewards/rejected": -0.29186058044433594, + "sft_loss": 2.4765946865081787, + "step": 10 + }, + { + "epoch": 0.03232976358860376, + "grad_norm": 2.781564950942993, + "learning_rate": 4.998578646361359e-06, + "logits/chosen": 210.4038543701172, + "logits/rejected": 212.20718383789062, + "logps/chosen": -2.4702863693237305, + "logps/rejected": -2.504176616668701, + "loss": 2.564, + "odds_ratio_loss": 0.9375804662704468, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.2470286339521408, + "rewards/margins": 0.0033890369813889265, + "rewards/rejected": -0.25041764974594116, + "sft_loss": 2.4702863693237305, + "step": 20 + }, + { + "epoch": 0.04849464538290564, + "grad_norm": 5.785957336425781, + "learning_rate": 4.996791614004449e-06, + "logits/chosen": 209.83865356445312, + "logits/rejected": 212.08535766601562, + "logps/chosen": -2.6004161834716797, + "logps/rejected": -2.695502758026123, + "loss": 2.6963, + "odds_ratio_loss": 0.9585107564926147, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.26004162430763245, + "rewards/margins": 0.009508667513728142, + "rewards/rejected": -0.26955026388168335, + "sft_loss": 2.6004161834716797, + "step": 30 + }, + { + "epoch": 0.06465952717720752, + "grad_norm": 7.009506702423096, + "learning_rate": 4.994287614855618e-06, + "logits/chosen": 210.0410614013672, + "logits/rejected": 211.40286254882812, + "logps/chosen": -2.6340386867523193, + "logps/rejected": -2.6249070167541504, + "loss": 2.7374, + "odds_ratio_loss": 1.0338027477264404, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.26340389251708984, + "rewards/margins": -0.0009131729602813721, + "rewards/rejected": -0.2624906897544861, + "sft_loss": 2.6340386867523193, + "step": 40 + }, + { + "epoch": 0.0808244089715094, + "grad_norm": 4.594735145568848, + "learning_rate": 4.991067367951343e-06, + "logits/chosen": 219.71932983398438, + "logits/rejected": 219.6745147705078, + "logps/chosen": -2.3416378498077393, + "logps/rejected": -2.4940619468688965, + "loss": 2.4215, + "odds_ratio_loss": 0.7983426451683044, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.2341637909412384, + "rewards/margins": 0.015242427587509155, + "rewards/rejected": -0.24940618872642517, + "sft_loss": 2.3416378498077393, + "step": 50 + }, + { + "epoch": 0.09698929076581128, + "grad_norm": 2.953855276107788, + "learning_rate": 4.987131798002389e-06, + "logits/chosen": 217.3623504638672, + "logits/rejected": 218.24862670898438, + "logps/chosen": -2.2888264656066895, + "logps/rejected": -2.6409952640533447, + "loss": 2.3829, + "odds_ratio_loss": 0.940882682800293, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.22888264060020447, + "rewards/margins": 0.035216934978961945, + "rewards/rejected": -0.2640995383262634, + "sft_loss": 2.2888264656066895, + "step": 60 + }, + { + "epoch": 0.11315417256011315, + "grad_norm": 4.224141597747803, + "learning_rate": 4.982482035128285e-06, + "logits/chosen": 217.9263458251953, + "logits/rejected": 218.54122924804688, + "logps/chosen": -2.326590061187744, + "logps/rejected": -2.605003833770752, + "loss": 2.4191, + "odds_ratio_loss": 0.9254364967346191, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.2326590120792389, + "rewards/margins": 0.027841363102197647, + "rewards/rejected": -0.26050037145614624, + "sft_loss": 2.326590061187744, + "step": 70 + }, + { + "epoch": 0.12931905435441504, + "grad_norm": 7.0222883224487305, + "learning_rate": 4.9771194145328e-06, + "logits/chosen": 224.885986328125, + "logits/rejected": 225.7215576171875, + "logps/chosen": -1.8678385019302368, + "logps/rejected": -2.1334400177001953, + "loss": 1.9429, + "odds_ratio_loss": 0.7510749697685242, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1867838352918625, + "rewards/margins": 0.026560146361589432, + "rewards/rejected": -0.213344007730484, + "sft_loss": 1.8678385019302368, + "step": 80 + }, + { + "epoch": 0.1454839361487169, + "grad_norm": 9.777688026428223, + "learning_rate": 4.971045476120532e-06, + "logits/chosen": 226.64450073242188, + "logits/rejected": 227.11874389648438, + "logps/chosen": -1.9129263162612915, + "logps/rejected": -2.10162091255188, + "loss": 1.9975, + "odds_ratio_loss": 0.8461491465568542, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.1912926286458969, + "rewards/margins": 0.01886945590376854, + "rewards/rejected": -0.21016211807727814, + "sft_loss": 1.9129263162612915, + "step": 90 + }, + { + "epoch": 0.1616488179430188, + "grad_norm": 3.441721200942993, + "learning_rate": 4.964261964054713e-06, + "logits/chosen": 230.32669067382812, + "logits/rejected": 231.39498901367188, + "logps/chosen": -1.8438594341278076, + "logps/rejected": -2.1114680767059326, + "loss": 1.923, + "odds_ratio_loss": 0.7917153239250183, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.18438595533370972, + "rewards/margins": 0.026760881766676903, + "rewards/rejected": -0.21114683151245117, + "sft_loss": 1.8438594341278076, + "step": 100 + }, + { + "epoch": 0.17781369973732067, + "grad_norm": 3.7387969493865967, + "learning_rate": 4.956770826256372e-06, + "logits/chosen": 233.9343719482422, + "logits/rejected": 234.51516723632812, + "logps/chosen": -1.6228179931640625, + "logps/rejected": -1.8143441677093506, + "loss": 1.6988, + "odds_ratio_loss": 0.7598803043365479, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1622818112373352, + "rewards/margins": 0.01915261521935463, + "rewards/rejected": -0.18143442273139954, + "sft_loss": 1.6228179931640625, + "step": 110 + }, + { + "epoch": 0.19397858153162256, + "grad_norm": 2.157771110534668, + "learning_rate": 4.94857421384497e-06, + "logits/chosen": 235.01248168945312, + "logits/rejected": 235.390869140625, + "logps/chosen": -1.6021674871444702, + "logps/rejected": -1.885866403579712, + "loss": 1.6762, + "odds_ratio_loss": 0.7404050230979919, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.1602167785167694, + "rewards/margins": 0.02836987003684044, + "rewards/rejected": -0.18858662247657776, + "sft_loss": 1.6021674871444702, + "step": 120 + }, + { + "epoch": 0.21014346332592443, + "grad_norm": 2.794867515563965, + "learning_rate": 4.939674480520701e-06, + "logits/chosen": 236.7910614013672, + "logits/rejected": 237.41806030273438, + "logps/chosen": -1.5707839727401733, + "logps/rejected": -1.6964565515518188, + "loss": 1.6508, + "odds_ratio_loss": 0.8003607988357544, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.15707840025424957, + "rewards/margins": 0.01256726123392582, + "rewards/rejected": -0.16964565217494965, + "sft_loss": 1.5707839727401733, + "step": 130 + }, + { + "epoch": 0.2263083451202263, + "grad_norm": 1.2237716913223267, + "learning_rate": 4.930074181888613e-06, + "logits/chosen": 240.5333251953125, + "logits/rejected": 241.0712432861328, + "logps/chosen": -1.6245830059051514, + "logps/rejected": -1.83035409450531, + "loss": 1.6912, + "odds_ratio_loss": 0.6659940481185913, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.16245830059051514, + "rewards/margins": 0.02057710848748684, + "rewards/rejected": -0.18303541839122772, + "sft_loss": 1.6245830059051514, + "step": 140 + }, + { + "epoch": 0.2424732269145282, + "grad_norm": 3.366241693496704, + "learning_rate": 4.91977607472475e-06, + "logits/chosen": 240.38449096679688, + "logits/rejected": 241.23794555664062, + "logps/chosen": -1.5312227010726929, + "logps/rejected": -1.6705970764160156, + "loss": 1.6035, + "odds_ratio_loss": 0.7224593162536621, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.15312227606773376, + "rewards/margins": 0.013937436044216156, + "rewards/rejected": -0.16705971956253052, + "sft_loss": 1.5312227010726929, + "step": 150 + }, + { + "epoch": 0.2586381087088301, + "grad_norm": 2.1750807762145996, + "learning_rate": 4.908783116184534e-06, + "logits/chosen": 240.67446899414062, + "logits/rejected": 241.75424194335938, + "logps/chosen": -1.4731028079986572, + "logps/rejected": -1.7678340673446655, + "loss": 1.5362, + "odds_ratio_loss": 0.6307698488235474, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1473102867603302, + "rewards/margins": 0.029473140835762024, + "rewards/rejected": -0.17678341269493103, + "sft_loss": 1.4731028079986572, + "step": 160 + }, + { + "epoch": 0.27480299050313195, + "grad_norm": 2.9354147911071777, + "learning_rate": 4.897098462953598e-06, + "logits/chosen": 243.85806274414062, + "logits/rejected": 244.68405151367188, + "logps/chosen": -1.3806183338165283, + "logps/rejected": -1.7258154153823853, + "loss": 1.4423, + "odds_ratio_loss": 0.6170833706855774, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13806185126304626, + "rewards/margins": 0.03451969474554062, + "rewards/rejected": -0.1725815385580063, + "sft_loss": 1.3806183338165283, + "step": 170 + }, + { + "epoch": 0.2909678722974338, + "grad_norm": 1.4452638626098633, + "learning_rate": 4.884725470341331e-06, + "logits/chosen": 242.984619140625, + "logits/rejected": 243.6776580810547, + "logps/chosen": -1.2990996837615967, + "logps/rejected": -1.616987943649292, + "loss": 1.3597, + "odds_ratio_loss": 0.6057690382003784, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.12990999221801758, + "rewards/margins": 0.031788814812898636, + "rewards/rejected": -0.16169880330562592, + "sft_loss": 1.2990996837615967, + "step": 180 + }, + { + "epoch": 0.3071327540917357, + "grad_norm": 4.690347194671631, + "learning_rate": 4.871667691317377e-06, + "logits/chosen": 244.59634399414062, + "logits/rejected": 244.5352325439453, + "logps/chosen": -1.4848819971084595, + "logps/rejected": -1.573250412940979, + "loss": 1.5639, + "odds_ratio_loss": 0.7902374267578125, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14848819375038147, + "rewards/margins": 0.008836844936013222, + "rewards/rejected": -0.15732502937316895, + "sft_loss": 1.4848819971084595, + "step": 190 + }, + { + "epoch": 0.3232976358860376, + "grad_norm": 7.527270317077637, + "learning_rate": 4.857928875491392e-06, + "logits/chosen": 243.60494995117188, + "logits/rejected": 244.3643035888672, + "logps/chosen": -1.3324997425079346, + "logps/rejected": -1.5205867290496826, + "loss": 1.402, + "odds_ratio_loss": 0.6946145296096802, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.13324996829032898, + "rewards/margins": 0.018808716908097267, + "rewards/rejected": -0.1520586758852005, + "sft_loss": 1.3324997425079346, + "step": 200 + }, + { + "epoch": 0.33946251768033947, + "grad_norm": 2.1978328227996826, + "learning_rate": 4.843512968036314e-06, + "logits/chosen": 244.3915557861328, + "logits/rejected": 244.58700561523438, + "logps/chosen": -1.3562281131744385, + "logps/rejected": -1.4892576932907104, + "loss": 1.4274, + "odds_ratio_loss": 0.7121940851211548, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1356228142976761, + "rewards/margins": 0.01330297440290451, + "rewards/rejected": -0.1489257663488388, + "sft_loss": 1.3562281131744385, + "step": 210 + }, + { + "epoch": 0.35562739947464134, + "grad_norm": 6.31206750869751, + "learning_rate": 4.828424108555486e-06, + "logits/chosen": 246.1901092529297, + "logits/rejected": 246.36703491210938, + "logps/chosen": -1.5392124652862549, + "logps/rejected": -1.7705978155136108, + "loss": 1.6086, + "odds_ratio_loss": 0.6943382024765015, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.1539212465286255, + "rewards/margins": 0.023138541728258133, + "rewards/rejected": -0.17705979943275452, + "sft_loss": 1.5392124652862549, + "step": 220 + }, + { + "epoch": 0.3717922812689432, + "grad_norm": 1.1257890462875366, + "learning_rate": 4.812666629893957e-06, + "logits/chosen": 246.37399291992188, + "logits/rejected": 246.72891235351562, + "logps/chosen": -1.3704453706741333, + "logps/rejected": -1.4485595226287842, + "loss": 1.4433, + "odds_ratio_loss": 0.7287623882293701, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1370445340871811, + "rewards/margins": 0.007811415940523148, + "rewards/rejected": -0.14485594630241394, + "sft_loss": 1.3704453706741333, + "step": 230 + }, + { + "epoch": 0.3879571630632451, + "grad_norm": 1.9700157642364502, + "learning_rate": 4.796245056894273e-06, + "logits/chosen": 244.54165649414062, + "logits/rejected": 244.89407348632812, + "logps/chosen": -1.4429550170898438, + "logps/rejected": -1.5743396282196045, + "loss": 1.5184, + "odds_ratio_loss": 0.7547486424446106, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.14429552853107452, + "rewards/margins": 0.013138455338776112, + "rewards/rejected": -0.15743397176265717, + "sft_loss": 1.4429550170898438, + "step": 240 + }, + { + "epoch": 0.404122044857547, + "grad_norm": 1.5832947492599487, + "learning_rate": 4.779164105097148e-06, + "logits/chosen": 246.41659545898438, + "logits/rejected": 246.4707489013672, + "logps/chosen": -1.3124094009399414, + "logps/rejected": -1.5739551782608032, + "loss": 1.3768, + "odds_ratio_loss": 0.6443756818771362, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.13124093413352966, + "rewards/margins": 0.026154566556215286, + "rewards/rejected": -0.15739551186561584, + "sft_loss": 1.3124094009399414, + "step": 250 + }, + { + "epoch": 0.42028692665184886, + "grad_norm": 2.2224152088165283, + "learning_rate": 4.761428679387373e-06, + "logits/chosen": 247.0335235595703, + "logits/rejected": 247.7626953125, + "logps/chosen": -1.2735482454299927, + "logps/rejected": -1.5084031820297241, + "loss": 1.3358, + "odds_ratio_loss": 0.6226388216018677, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.12735481560230255, + "rewards/margins": 0.02348550595343113, + "rewards/rejected": -0.15084032714366913, + "sft_loss": 1.2735482454299927, + "step": 260 + }, + { + "epoch": 0.4364518084461507, + "grad_norm": 2.0271799564361572, + "learning_rate": 4.7430438725853515e-06, + "logits/chosen": 247.60205078125, + "logits/rejected": 247.61654663085938, + "logps/chosen": -1.3570277690887451, + "logps/rejected": -1.714133858680725, + "loss": 1.4226, + "odds_ratio_loss": 0.6552777290344238, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13570277392864227, + "rewards/margins": 0.035710614174604416, + "rewards/rejected": -0.171413391828537, + "sft_loss": 1.3570277690887451, + "step": 270 + }, + { + "epoch": 0.4526166902404526, + "grad_norm": 2.142329216003418, + "learning_rate": 4.724014963984669e-06, + "logits/chosen": 248.28439331054688, + "logits/rejected": 249.0177459716797, + "logps/chosen": -1.3674625158309937, + "logps/rejected": -1.6127933263778687, + "loss": 1.435, + "odds_ratio_loss": 0.6751004457473755, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.13674625754356384, + "rewards/margins": 0.02453308179974556, + "rewards/rejected": -0.1612793505191803, + "sft_loss": 1.3674625158309937, + "step": 280 + }, + { + "epoch": 0.4687815720347545, + "grad_norm": 2.8357582092285156, + "learning_rate": 4.704347417836116e-06, + "logits/chosen": 247.2007598876953, + "logits/rejected": 247.60107421875, + "logps/chosen": -1.2728191614151, + "logps/rejected": -1.5069888830184937, + "loss": 1.3382, + "odds_ratio_loss": 0.6542028784751892, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.12728191912174225, + "rewards/margins": 0.023416969925165176, + "rewards/rejected": -0.15069888532161713, + "sft_loss": 1.2728191614151, + "step": 290 + }, + { + "epoch": 0.4849464538290564, + "grad_norm": 3.075584888458252, + "learning_rate": 4.684046881778603e-06, + "logits/chosen": 247.69580078125, + "logits/rejected": 247.7963409423828, + "logps/chosen": -1.3267529010772705, + "logps/rejected": -1.46425461769104, + "loss": 1.3929, + "odds_ratio_loss": 0.6614553332328796, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.13267529010772705, + "rewards/margins": 0.013750175014138222, + "rewards/rejected": -0.14642547070980072, + "sft_loss": 1.3267529010772705, + "step": 300 + }, + { + "epoch": 0.5011113356233583, + "grad_norm": 1.1745957136154175, + "learning_rate": 4.663119185217409e-06, + "logits/chosen": 247.5077667236328, + "logits/rejected": 247.80752563476562, + "logps/chosen": -1.2750051021575928, + "logps/rejected": -1.5364891290664673, + "loss": 1.3385, + "odds_ratio_loss": 0.6352204084396362, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1275005042552948, + "rewards/margins": 0.026148397475481033, + "rewards/rejected": -0.15364892780780792, + "sft_loss": 1.2750051021575928, + "step": 310 + }, + { + "epoch": 0.5172762174176602, + "grad_norm": 1.1816167831420898, + "learning_rate": 4.641570337650232e-06, + "logits/chosen": 248.5536651611328, + "logits/rejected": 248.5113067626953, + "logps/chosen": -1.1914936304092407, + "logps/rejected": -1.4479808807373047, + "loss": 1.2531, + "odds_ratio_loss": 0.615585446357727, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11914938688278198, + "rewards/margins": 0.025648722425103188, + "rewards/rejected": -0.14479808509349823, + "sft_loss": 1.1914936304092407, + "step": 320 + }, + { + "epoch": 0.533441099211962, + "grad_norm": 6.805661678314209, + "learning_rate": 4.61940652694154e-06, + "logits/chosen": 246.8784637451172, + "logits/rejected": 247.60842895507812, + "logps/chosen": -1.371927261352539, + "logps/rejected": -1.4951013326644897, + "loss": 1.444, + "odds_ratio_loss": 0.7208858728408813, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.1371927261352539, + "rewards/margins": 0.012317392975091934, + "rewards/rejected": -0.14951011538505554, + "sft_loss": 1.371927261352539, + "step": 330 + }, + { + "epoch": 0.5496059810062639, + "grad_norm": 2.8288872241973877, + "learning_rate": 4.596634117545689e-06, + "logits/chosen": 248.96542358398438, + "logits/rejected": 249.38369750976562, + "logps/chosen": -1.3861172199249268, + "logps/rejected": -1.6291033029556274, + "loss": 1.4514, + "odds_ratio_loss": 0.6529659032821655, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.13861171901226044, + "rewards/margins": 0.02429860271513462, + "rewards/rejected": -0.1629103422164917, + "sft_loss": 1.3861172199249268, + "step": 340 + }, + { + "epoch": 0.5657708628005658, + "grad_norm": 2.343557834625244, + "learning_rate": 4.573259648679335e-06, + "logits/chosen": 247.5604248046875, + "logits/rejected": 247.8214111328125, + "logps/chosen": -1.3334286212921143, + "logps/rejected": -1.642163634300232, + "loss": 1.3937, + "odds_ratio_loss": 0.603044331073761, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.13334286212921143, + "rewards/margins": 0.030873507261276245, + "rewards/rejected": -0.16421635448932648, + "sft_loss": 1.3334286212921143, + "step": 350 + }, + { + "epoch": 0.5819357445948676, + "grad_norm": 6.341250896453857, + "learning_rate": 4.549289832443663e-06, + "logits/chosen": 249.6760711669922, + "logits/rejected": 249.2826385498047, + "logps/chosen": -1.2829958200454712, + "logps/rejected": -1.5420135259628296, + "loss": 1.351, + "odds_ratio_loss": 0.6805331110954285, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.12829959392547607, + "rewards/margins": 0.025901764631271362, + "rewards/rejected": -0.15420134365558624, + "sft_loss": 1.2829958200454712, + "step": 360 + }, + { + "epoch": 0.5981006263891695, + "grad_norm": 1.415165901184082, + "learning_rate": 4.524731551896978e-06, + "logits/chosen": 247.46142578125, + "logits/rejected": 247.42459106445312, + "logps/chosen": -1.2169711589813232, + "logps/rejected": -1.3963136672973633, + "loss": 1.2853, + "odds_ratio_loss": 0.6832239031791687, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.12169712781906128, + "rewards/margins": 0.017934244126081467, + "rewards/rejected": -0.13963137567043304, + "sft_loss": 1.2169711589813232, + "step": 370 + }, + { + "epoch": 0.6142655081834714, + "grad_norm": 2.7373573780059814, + "learning_rate": 4.4995918590781925e-06, + "logits/chosen": 250.4862518310547, + "logits/rejected": 250.1310272216797, + "logps/chosen": -1.2185784578323364, + "logps/rejected": -1.435258150100708, + "loss": 1.2834, + "odds_ratio_loss": 0.6484395265579224, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12185785919427872, + "rewards/margins": 0.021667957305908203, + "rewards/rejected": -0.14352580904960632, + "sft_loss": 1.2185784578323364, + "step": 380 + }, + { + "epoch": 0.6304303899777733, + "grad_norm": 1.0431718826293945, + "learning_rate": 4.473877972981797e-06, + "logits/chosen": 247.82730102539062, + "logits/rejected": 248.197998046875, + "logps/chosen": -1.3133275508880615, + "logps/rejected": -1.566138505935669, + "loss": 1.378, + "odds_ratio_loss": 0.6465703248977661, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.13133276998996735, + "rewards/margins": 0.02528109773993492, + "rewards/rejected": -0.15661385655403137, + "sft_loss": 1.3133275508880615, + "step": 390 + }, + { + "epoch": 0.6465952717720752, + "grad_norm": 2.605905771255493, + "learning_rate": 4.447597277484894e-06, + "logits/chosen": 248.4889678955078, + "logits/rejected": 248.0493927001953, + "logps/chosen": -1.1982879638671875, + "logps/rejected": -1.3909344673156738, + "loss": 1.2662, + "odds_ratio_loss": 0.6788827180862427, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.11982879787683487, + "rewards/margins": 0.01926465705037117, + "rewards/rejected": -0.13909344375133514, + "sft_loss": 1.1982879638671875, + "step": 400 + }, + { + "epoch": 0.6627601535663771, + "grad_norm": 2.7441976070404053, + "learning_rate": 4.42075731922687e-06, + "logits/chosen": 250.9984893798828, + "logits/rejected": 250.879150390625, + "logps/chosen": -1.3381072282791138, + "logps/rejected": -1.476546049118042, + "loss": 1.4062, + "odds_ratio_loss": 0.6813501119613647, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.13381072878837585, + "rewards/margins": 0.013843873515725136, + "rewards/rejected": -0.14765460789203644, + "sft_loss": 1.3381072282791138, + "step": 410 + }, + { + "epoch": 0.6789250353606789, + "grad_norm": 3.2034897804260254, + "learning_rate": 4.3933658054423465e-06, + "logits/chosen": 249.34951782226562, + "logits/rejected": 249.37582397460938, + "logps/chosen": -1.2343724966049194, + "logps/rejected": -1.4455711841583252, + "loss": 1.2964, + "odds_ratio_loss": 0.6205655932426453, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.12343724817037582, + "rewards/margins": 0.021119873970746994, + "rewards/rejected": -0.1445571333169937, + "sft_loss": 1.2343724966049194, + "step": 420 + }, + { + "epoch": 0.6950899171549808, + "grad_norm": 2.552898645401001, + "learning_rate": 4.365430601748003e-06, + "logits/chosen": 247.7689208984375, + "logits/rejected": 247.95205688476562, + "logps/chosen": -1.3558423519134521, + "logps/rejected": -1.4942983388900757, + "loss": 1.4265, + "odds_ratio_loss": 0.7065616250038147, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.13558423519134521, + "rewards/margins": 0.013845594599843025, + "rewards/rejected": -0.1494298279285431, + "sft_loss": 1.3558423519134521, + "step": 430 + }, + { + "epoch": 0.7112547989492827, + "grad_norm": 7.701834201812744, + "learning_rate": 4.336959729883925e-06, + "logits/chosen": 248.16812133789062, + "logits/rejected": 248.47384643554688, + "logps/chosen": -1.2508445978164673, + "logps/rejected": -1.3401494026184082, + "loss": 1.3242, + "odds_ratio_loss": 0.7333300113677979, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.12508445978164673, + "rewards/margins": 0.008930487558245659, + "rewards/rejected": -0.13401496410369873, + "sft_loss": 1.2508445978164673, + "step": 440 + }, + { + "epoch": 0.7274196807435845, + "grad_norm": 1.3677743673324585, + "learning_rate": 4.307961365410118e-06, + "logits/chosen": 249.19546508789062, + "logits/rejected": 249.5364990234375, + "logps/chosen": -1.2851893901824951, + "logps/rejected": -1.4277610778808594, + "loss": 1.3525, + "odds_ratio_loss": 0.6732175946235657, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.12851892411708832, + "rewards/margins": 0.014257180504500866, + "rewards/rejected": -0.14277611672878265, + "sft_loss": 1.2851893901824951, + "step": 450 + }, + { + "epoch": 0.7435845625378864, + "grad_norm": 3.3310444355010986, + "learning_rate": 4.278443835358854e-06, + "logits/chosen": 249.6570281982422, + "logits/rejected": 249.6079864501953, + "logps/chosen": -1.1893627643585205, + "logps/rejected": -1.4945032596588135, + "loss": 1.2482, + "odds_ratio_loss": 0.5885173082351685, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.11893627792596817, + "rewards/margins": 0.030514035373926163, + "rewards/rejected": -0.14945031702518463, + "sft_loss": 1.1893627643585205, + "step": 460 + }, + { + "epoch": 0.7597494443321883, + "grad_norm": 2.5770180225372314, + "learning_rate": 4.248415615843523e-06, + "logits/chosen": 249.5537567138672, + "logits/rejected": 249.5972442626953, + "logps/chosen": -1.2710212469100952, + "logps/rejected": -1.4037456512451172, + "loss": 1.3415, + "odds_ratio_loss": 0.7046067714691162, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.12710212171077728, + "rewards/margins": 0.01327243447303772, + "rewards/rejected": -0.140374556183815, + "sft_loss": 1.2710212469100952, + "step": 470 + }, + { + "epoch": 0.7759143261264903, + "grad_norm": 9.182385444641113, + "learning_rate": 4.217885329624666e-06, + "logits/chosen": 249.1255645751953, + "logits/rejected": 249.16854858398438, + "logps/chosen": -1.1571811437606812, + "logps/rejected": -1.4825657606124878, + "loss": 1.2175, + "odds_ratio_loss": 0.6027355194091797, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.11571812629699707, + "rewards/margins": 0.032538462430238724, + "rewards/rejected": -0.1482565701007843, + "sft_loss": 1.1571811437606812, + "step": 480 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 2.0430970191955566, + "learning_rate": 4.186861743633911e-06, + "logits/chosen": 248.51168823242188, + "logits/rejected": 248.83370971679688, + "logps/chosen": -1.216133713722229, + "logps/rejected": -1.4709254503250122, + "loss": 1.2856, + "odds_ratio_loss": 0.694364070892334, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.12161336094141006, + "rewards/margins": 0.025479182600975037, + "rewards/rejected": -0.1470925360918045, + "sft_loss": 1.216133713722229, + "step": 490 + }, + { + "epoch": 0.808244089715094, + "grad_norm": 2.13413143157959, + "learning_rate": 4.155353766456497e-06, + "logits/chosen": 252.05142211914062, + "logits/rejected": 251.9636993408203, + "logps/chosen": -1.3067327737808228, + "logps/rejected": -1.4753313064575195, + "loss": 1.374, + "odds_ratio_loss": 0.6729229688644409, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.13067328929901123, + "rewards/margins": 0.01685984991490841, + "rewards/rejected": -0.147533118724823, + "sft_loss": 1.3067327737808228, + "step": 500 + }, + { + "epoch": 0.808244089715094, + "eval_logits/chosen": 249.61227416992188, + "eval_logits/rejected": 249.90635681152344, + "eval_logps/chosen": -1.2762008905410767, + "eval_logps/rejected": -1.5033098459243774, + "eval_loss": 1.3435848951339722, + "eval_odds_ratio_loss": 0.6738389730453491, + "eval_rewards/accuracies": 0.5672727227210999, + "eval_rewards/chosen": -0.12762011587619781, + "eval_rewards/margins": 0.0227108895778656, + "eval_rewards/rejected": -0.15033100545406342, + "eval_runtime": 221.4313, + "eval_samples_per_second": 4.968, + "eval_sft_loss": 1.2762008905410767, + "eval_steps_per_second": 2.484, + "step": 500 + }, + { + "epoch": 0.8244089715093958, + "grad_norm": 2.4113426208496094, + "learning_rate": 4.123370445773134e-06, + "logits/chosen": 250.3010711669922, + "logits/rejected": 250.5003662109375, + "logps/chosen": -1.2399041652679443, + "logps/rejected": -1.3490018844604492, + "loss": 1.3103, + "odds_ratio_loss": 0.7042885422706604, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.12399041652679443, + "rewards/margins": 0.010909780859947205, + "rewards/rejected": -0.13490018248558044, + "sft_loss": 1.2399041652679443, + "step": 510 + }, + { + "epoch": 0.8405738533036977, + "grad_norm": 4.632988452911377, + "learning_rate": 4.090920965761906e-06, + "logits/chosen": 249.44210815429688, + "logits/rejected": 249.96994018554688, + "logps/chosen": -1.2807283401489258, + "logps/rejected": -1.4942976236343384, + "loss": 1.3484, + "odds_ratio_loss": 0.6771414875984192, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1280728280544281, + "rewards/margins": 0.02135692723095417, + "rewards/rejected": -0.14942976832389832, + "sft_loss": 1.2807283401489258, + "step": 520 + }, + { + "epoch": 0.8567387350979996, + "grad_norm": 9.196592330932617, + "learning_rate": 4.058014644460991e-06, + "logits/chosen": 250.1853790283203, + "logits/rejected": 250.6529083251953, + "logps/chosen": -1.2633569240570068, + "logps/rejected": -1.4294393062591553, + "loss": 1.329, + "odds_ratio_loss": 0.6560603976249695, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.12633569538593292, + "rewards/margins": 0.01660825125873089, + "rewards/rejected": -0.14294394850730896, + "sft_loss": 1.2633569240570068, + "step": 530 + }, + { + "epoch": 0.8729036168923014, + "grad_norm": 1.8403383493423462, + "learning_rate": 4.024660931092939e-06, + "logits/chosen": 250.708251953125, + "logits/rejected": 251.0161895751953, + "logps/chosen": -1.287913203239441, + "logps/rejected": -1.553476095199585, + "loss": 1.3531, + "odds_ratio_loss": 0.6521779298782349, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.12879131734371185, + "rewards/margins": 0.026556288823485374, + "rewards/rejected": -0.15534761548042297, + "sft_loss": 1.287913203239441, + "step": 540 + }, + { + "epoch": 0.8890684986866033, + "grad_norm": 7.186382293701172, + "learning_rate": 3.990869403351272e-06, + "logits/chosen": 251.8153839111328, + "logits/rejected": 251.8900604248047, + "logps/chosen": -1.268169641494751, + "logps/rejected": -1.511528491973877, + "loss": 1.3283, + "odds_ratio_loss": 0.6014903783798218, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.12681695818901062, + "rewards/margins": 0.024335889145731926, + "rewards/rejected": -0.1511528491973877, + "sft_loss": 1.268169641494751, + "step": 550 + }, + { + "epoch": 0.9052333804809052, + "grad_norm": 2.923800230026245, + "learning_rate": 3.956649764650206e-06, + "logits/chosen": 250.7381591796875, + "logits/rejected": 250.7707061767578, + "logps/chosen": -1.2698795795440674, + "logps/rejected": -1.4995825290679932, + "loss": 1.3379, + "odds_ratio_loss": 0.6804186105728149, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.12698796391487122, + "rewards/margins": 0.0229702927172184, + "rewards/rejected": -0.14995825290679932, + "sft_loss": 1.2698795795440674, + "step": 560 + }, + { + "epoch": 0.9213982622752072, + "grad_norm": 6.1557416915893555, + "learning_rate": 3.92201184133826e-06, + "logits/chosen": 250.94808959960938, + "logits/rejected": 251.642822265625, + "logps/chosen": -1.2907052040100098, + "logps/rejected": -1.54143226146698, + "loss": 1.3538, + "odds_ratio_loss": 0.6311336755752563, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.12907053530216217, + "rewards/margins": 0.025072699412703514, + "rewards/rejected": -0.15414324402809143, + "sft_loss": 1.2907052040100098, + "step": 570 + }, + { + "epoch": 0.937563144069509, + "grad_norm": 2.1665000915527344, + "learning_rate": 3.886965579876572e-06, + "logits/chosen": 252.3577423095703, + "logits/rejected": 252.0865478515625, + "logps/chosen": -1.2575817108154297, + "logps/rejected": -1.3686320781707764, + "loss": 1.3293, + "odds_ratio_loss": 0.7170482873916626, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.12575815618038177, + "rewards/margins": 0.011105048470199108, + "rewards/rejected": -0.13686320185661316, + "sft_loss": 1.2575817108154297, + "step": 580 + }, + { + "epoch": 0.9537280258638109, + "grad_norm": 2.289733648300171, + "learning_rate": 3.851521043982716e-06, + "logits/chosen": 251.7819061279297, + "logits/rejected": 251.52749633789062, + "logps/chosen": -1.2542387247085571, + "logps/rejected": -1.3954790830612183, + "loss": 1.3206, + "odds_ratio_loss": 0.6639243960380554, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1254238784313202, + "rewards/margins": 0.014124047942459583, + "rewards/rejected": -0.1395479142665863, + "sft_loss": 1.2542387247085571, + "step": 590 + }, + { + "epoch": 0.9698929076581128, + "grad_norm": 2.7564313411712646, + "learning_rate": 3.81568841174086e-06, + "logits/chosen": 251.03280639648438, + "logits/rejected": 251.2174835205078, + "logps/chosen": -1.2807530164718628, + "logps/rejected": -1.5129293203353882, + "loss": 1.3482, + "odds_ratio_loss": 0.674277663230896, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.12807528674602509, + "rewards/margins": 0.023217635229229927, + "rewards/rejected": -0.15129292011260986, + "sft_loss": 1.2807530164718628, + "step": 600 + }, + { + "epoch": 0.9860577894524146, + "grad_norm": 2.1846888065338135, + "learning_rate": 3.7794779726790664e-06, + "logits/chosen": 249.8391571044922, + "logits/rejected": 250.3789520263672, + "logps/chosen": -1.1555012464523315, + "logps/rejected": -1.3768011331558228, + "loss": 1.2212, + "odds_ratio_loss": 0.6573610305786133, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.11555011570453644, + "rewards/margins": 0.022129978984594345, + "rewards/rejected": -0.13768009841442108, + "sft_loss": 1.1555012464523315, + "step": 610 + }, + { + "epoch": 1.0022226712467166, + "grad_norm": 2.2191011905670166, + "learning_rate": 3.7429001248146096e-06, + "logits/chosen": 250.8198699951172, + "logits/rejected": 251.24819946289062, + "logps/chosen": -1.272541880607605, + "logps/rejected": -1.5292177200317383, + "loss": 1.3338, + "odds_ratio_loss": 0.6125348806381226, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1272541880607605, + "rewards/margins": 0.0256675872951746, + "rewards/rejected": -0.15292176604270935, + "sft_loss": 1.272541880607605, + "step": 620 + }, + { + "epoch": 1.0183875530410185, + "grad_norm": 1.6834843158721924, + "learning_rate": 3.7059653716681227e-06, + "logits/chosen": 250.3338623046875, + "logits/rejected": 250.6593780517578, + "logps/chosen": -1.2664134502410889, + "logps/rejected": -1.469812035560608, + "loss": 1.3343, + "odds_ratio_loss": 0.6792756915092468, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.12664134800434113, + "rewards/margins": 0.020339861512184143, + "rewards/rejected": -0.14698120951652527, + "sft_loss": 1.2664134502410889, + "step": 630 + }, + { + "epoch": 1.0345524348353203, + "grad_norm": 5.188844203948975, + "learning_rate": 3.668684319247463e-06, + "logits/chosen": 249.46969604492188, + "logits/rejected": 250.1366729736328, + "logps/chosen": -1.1969501972198486, + "logps/rejected": -1.5598738193511963, + "loss": 1.2558, + "odds_ratio_loss": 0.588589072227478, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11969500780105591, + "rewards/margins": 0.03629238158464432, + "rewards/rejected": -0.15598741173744202, + "sft_loss": 1.1969501972198486, + "step": 640 + }, + { + "epoch": 1.0507173166296222, + "grad_norm": 1.8501890897750854, + "learning_rate": 3.6310676730021373e-06, + "logits/chosen": 250.78857421875, + "logits/rejected": 250.8007354736328, + "logps/chosen": -1.2203996181488037, + "logps/rejected": -1.3524749279022217, + "loss": 1.2867, + "odds_ratio_loss": 0.662962794303894, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12203995883464813, + "rewards/margins": 0.013207539916038513, + "rewards/rejected": -0.13524749875068665, + "sft_loss": 1.2203996181488037, + "step": 650 + }, + { + "epoch": 1.066882198423924, + "grad_norm": 3.5492091178894043, + "learning_rate": 3.593126234749178e-06, + "logits/chosen": 250.8761749267578, + "logits/rejected": 251.28622436523438, + "logps/chosen": -1.2661250829696655, + "logps/rejected": -1.455129861831665, + "loss": 1.3334, + "odds_ratio_loss": 0.6727336645126343, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.12661252915859222, + "rewards/margins": 0.01890045776963234, + "rewards/rejected": -0.14551296830177307, + "sft_loss": 1.2661250829696655, + "step": 660 + }, + { + "epoch": 1.083047080218226, + "grad_norm": 3.5715062618255615, + "learning_rate": 3.554870899571343e-06, + "logits/chosen": 252.4844512939453, + "logits/rejected": 252.82699584960938, + "logps/chosen": -1.2469182014465332, + "logps/rejected": -1.4401594400405884, + "loss": 1.3131, + "odds_ratio_loss": 0.6617658734321594, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12469182908535004, + "rewards/margins": 0.019324112683534622, + "rewards/rejected": -0.14401593804359436, + "sft_loss": 1.2469182014465332, + "step": 670 + }, + { + "epoch": 1.0992119620125278, + "grad_norm": 4.318095684051514, + "learning_rate": 3.5163126526885373e-06, + "logits/chosen": 252.0143585205078, + "logits/rejected": 251.80081176757812, + "logps/chosen": -1.1914775371551514, + "logps/rejected": -1.4009137153625488, + "loss": 1.2573, + "odds_ratio_loss": 0.6579803824424744, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.11914775520563126, + "rewards/margins": 0.02094360813498497, + "rewards/rejected": -0.14009135961532593, + "sft_loss": 1.1914775371551514, + "step": 680 + }, + { + "epoch": 1.1153768438068297, + "grad_norm": 2.403775930404663, + "learning_rate": 3.4774625663033484e-06, + "logits/chosen": 251.2095184326172, + "logits/rejected": 251.48507690429688, + "logps/chosen": -1.2048381567001343, + "logps/rejected": -1.3877532482147217, + "loss": 1.27, + "odds_ratio_loss": 0.6517833471298218, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.12048381567001343, + "rewards/margins": 0.018291514366865158, + "rewards/rejected": -0.13877533376216888, + "sft_loss": 1.2048381567001343, + "step": 690 + }, + { + "epoch": 1.1315417256011315, + "grad_norm": 1.7898093461990356, + "learning_rate": 3.4383317964216067e-06, + "logits/chosen": 252.33316040039062, + "logits/rejected": 252.1842498779297, + "logps/chosen": -1.1471569538116455, + "logps/rejected": -1.306755781173706, + "loss": 1.2157, + "odds_ratio_loss": 0.6855098009109497, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.11471569538116455, + "rewards/margins": 0.01595989242196083, + "rewards/rejected": -0.13067558407783508, + "sft_loss": 1.1471569538116455, + "step": 700 + }, + { + "epoch": 1.1477066073954334, + "grad_norm": 3.209373712539673, + "learning_rate": 3.398931579648877e-06, + "logits/chosen": 251.15170288085938, + "logits/rejected": 251.59976196289062, + "logps/chosen": -1.239712119102478, + "logps/rejected": -1.5323327779769897, + "loss": 1.3045, + "odds_ratio_loss": 0.6475890874862671, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.12397120893001556, + "rewards/margins": 0.029262065887451172, + "rewards/rejected": -0.15323328971862793, + "sft_loss": 1.239712119102478, + "step": 710 + }, + { + "epoch": 1.1638714891897353, + "grad_norm": 2.6601579189300537, + "learning_rate": 3.359273229963813e-06, + "logits/chosen": 250.285400390625, + "logits/rejected": 250.47323608398438, + "logps/chosen": -1.2064179182052612, + "logps/rejected": -1.3739216327667236, + "loss": 1.2742, + "odds_ratio_loss": 0.6774007081985474, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.12064179033041, + "rewards/margins": 0.01675037480890751, + "rewards/rejected": -0.13739216327667236, + "sft_loss": 1.2064179182052612, + "step": 720 + }, + { + "epoch": 1.1800363709840371, + "grad_norm": 1.836297631263733, + "learning_rate": 3.319368135469285e-06, + "logits/chosen": 251.77334594726562, + "logits/rejected": 252.28701782226562, + "logps/chosen": -1.2479230165481567, + "logps/rejected": -1.4433178901672363, + "loss": 1.3175, + "odds_ratio_loss": 0.6954701542854309, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12479230016469955, + "rewards/margins": 0.019539497792720795, + "rewards/rejected": -0.14433178305625916, + "sft_loss": 1.2479230165481567, + "step": 730 + }, + { + "epoch": 1.196201252778339, + "grad_norm": 3.1846110820770264, + "learning_rate": 3.279227755122228e-06, + "logits/chosen": 252.08438110351562, + "logits/rejected": 252.65792846679688, + "logps/chosen": -1.196380376815796, + "logps/rejected": -1.5170973539352417, + "loss": 1.2585, + "odds_ratio_loss": 0.6215213537216187, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.11963804066181183, + "rewards/margins": 0.032071683555841446, + "rewards/rejected": -0.15170973539352417, + "sft_loss": 1.196380376815796, + "step": 740 + }, + { + "epoch": 1.2123661345726409, + "grad_norm": 3.024951934814453, + "learning_rate": 3.2388636154431417e-06, + "logits/chosen": 253.1087646484375, + "logits/rejected": 253.23635864257812, + "logps/chosen": -1.3020392656326294, + "logps/rejected": -1.5343925952911377, + "loss": 1.3675, + "odds_ratio_loss": 0.654754638671875, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.13020391762256622, + "rewards/margins": 0.02323536016047001, + "rewards/rejected": -0.1534392535686493, + "sft_loss": 1.3020392656326294, + "step": 750 + }, + { + "epoch": 1.2285310163669427, + "grad_norm": 2.166121482849121, + "learning_rate": 3.198287307206192e-06, + "logits/chosen": 251.711669921875, + "logits/rejected": 251.5684356689453, + "logps/chosen": -1.1889938116073608, + "logps/rejected": -1.4522913694381714, + "loss": 1.2499, + "odds_ratio_loss": 0.60938560962677, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1188993826508522, + "rewards/margins": 0.02632974646985531, + "rewards/rejected": -0.14522913098335266, + "sft_loss": 1.1889938116073608, + "step": 760 + }, + { + "epoch": 1.2446958981612446, + "grad_norm": 1.8584887981414795, + "learning_rate": 3.157510482110856e-06, + "logits/chosen": 252.8727569580078, + "logits/rejected": 253.4295654296875, + "logps/chosen": -1.2046940326690674, + "logps/rejected": -1.360910177230835, + "loss": 1.2735, + "odds_ratio_loss": 0.6879505515098572, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.12046940624713898, + "rewards/margins": 0.01562163233757019, + "rewards/rejected": -0.13609102368354797, + "sft_loss": 1.2046940326690674, + "step": 770 + }, + { + "epoch": 1.2608607799555465, + "grad_norm": 1.6219208240509033, + "learning_rate": 3.116544849436077e-06, + "logits/chosen": 251.80764770507812, + "logits/rejected": 251.75509643554688, + "logps/chosen": -1.3175479173660278, + "logps/rejected": -1.6150630712509155, + "loss": 1.3813, + "odds_ratio_loss": 0.6378855109214783, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1317548006772995, + "rewards/margins": 0.029751509428024292, + "rewards/rejected": -0.1615062952041626, + "sft_loss": 1.3175479173660278, + "step": 780 + }, + { + "epoch": 1.2770256617498483, + "grad_norm": 2.1420071125030518, + "learning_rate": 3.0754021726778848e-06, + "logits/chosen": 252.167724609375, + "logits/rejected": 251.9316864013672, + "logps/chosen": -1.1495087146759033, + "logps/rejected": -1.426129937171936, + "loss": 1.2132, + "odds_ratio_loss": 0.6372426748275757, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.11495087295770645, + "rewards/margins": 0.02766209840774536, + "rewards/rejected": -0.1426129937171936, + "sft_loss": 1.1495087146759033, + "step": 790 + }, + { + "epoch": 1.2931905435441502, + "grad_norm": 1.3823323249816895, + "learning_rate": 3.0340942661714463e-06, + "logits/chosen": 252.6959686279297, + "logits/rejected": 252.73464965820312, + "logps/chosen": -1.2912076711654663, + "logps/rejected": -1.4657213687896729, + "loss": 1.3573, + "odds_ratio_loss": 0.6610310673713684, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.12912078201770782, + "rewards/margins": 0.017451368272304535, + "rewards/rejected": -0.14657214283943176, + "sft_loss": 1.2912076711654663, + "step": 800 + }, + { + "epoch": 1.3093554253384523, + "grad_norm": 3.4516756534576416, + "learning_rate": 2.992632991698512e-06, + "logits/chosen": 250.41928100585938, + "logits/rejected": 250.66513061523438, + "logps/chosen": -1.219699501991272, + "logps/rejected": -1.483235239982605, + "loss": 1.2828, + "odds_ratio_loss": 0.6311507821083069, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12196997553110123, + "rewards/margins": 0.02635357342660427, + "rewards/rejected": -0.14832353591918945, + "sft_loss": 1.219699501991272, + "step": 810 + }, + { + "epoch": 1.3255203071327541, + "grad_norm": 2.465632677078247, + "learning_rate": 2.9510302550812537e-06, + "logits/chosen": 251.94296264648438, + "logits/rejected": 252.61587524414062, + "logps/chosen": -1.144325852394104, + "logps/rejected": -1.4354488849639893, + "loss": 1.2042, + "odds_ratio_loss": 0.5983381271362305, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.11443258821964264, + "rewards/margins": 0.02911229059100151, + "rewards/rejected": -0.14354488253593445, + "sft_loss": 1.144325852394104, + "step": 820 + }, + { + "epoch": 1.341685188927056, + "grad_norm": 3.969513416290283, + "learning_rate": 2.9092980027634325e-06, + "logits/chosen": 251.37832641601562, + "logits/rejected": 251.625244140625, + "logps/chosen": -1.1136391162872314, + "logps/rejected": -1.3801125288009644, + "loss": 1.1766, + "odds_ratio_loss": 0.6293498277664185, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.11136390268802643, + "rewards/margins": 0.026647353544831276, + "rewards/rejected": -0.13801124691963196, + "sft_loss": 1.1136391162872314, + "step": 830 + }, + { + "epoch": 1.3578500707213579, + "grad_norm": 1.7552839517593384, + "learning_rate": 2.867448218379927e-06, + "logits/chosen": 252.9868621826172, + "logits/rejected": 253.2499542236328, + "logps/chosen": -1.249079704284668, + "logps/rejected": -1.4685295820236206, + "loss": 1.3139, + "odds_ratio_loss": 0.6482545733451843, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1249079704284668, + "rewards/margins": 0.02194499969482422, + "rewards/rejected": -0.14685297012329102, + "sft_loss": 1.249079704284668, + "step": 840 + }, + { + "epoch": 1.3740149525156597, + "grad_norm": 5.6061906814575195, + "learning_rate": 2.825492919315559e-06, + "logits/chosen": 252.72372436523438, + "logits/rejected": 252.32168579101562, + "logps/chosen": -1.2922828197479248, + "logps/rejected": -1.4327681064605713, + "loss": 1.3613, + "odds_ratio_loss": 0.6897528767585754, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.12922829389572144, + "rewards/margins": 0.01404851209372282, + "rewards/rejected": -0.14327679574489594, + "sft_loss": 1.2922828197479248, + "step": 850 + }, + { + "epoch": 1.3901798343099616, + "grad_norm": 2.2057290077209473, + "learning_rate": 2.7834441532542482e-06, + "logits/chosen": 251.51272583007812, + "logits/rejected": 251.97573852539062, + "logps/chosen": -1.1630654335021973, + "logps/rejected": -1.4224598407745361, + "loss": 1.2262, + "odds_ratio_loss": 0.6317997574806213, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.11630652844905853, + "rewards/margins": 0.025939440354704857, + "rewards/rejected": -0.14224597811698914, + "sft_loss": 1.1630654335021973, + "step": 860 + }, + { + "epoch": 1.4063447161042635, + "grad_norm": 2.0599286556243896, + "learning_rate": 2.74131399471945e-06, + "logits/chosen": 252.7571258544922, + "logits/rejected": 253.06008911132812, + "logps/chosen": -1.2314178943634033, + "logps/rejected": -1.404909372329712, + "loss": 1.297, + "odds_ratio_loss": 0.6555390357971191, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.12314176559448242, + "rewards/margins": 0.01734915003180504, + "rewards/rejected": -0.14049093425273895, + "sft_loss": 1.2314178943634033, + "step": 870 + }, + { + "epoch": 1.4225095978985653, + "grad_norm": 3.7026567459106445, + "learning_rate": 2.6991145416068947e-06, + "logits/chosen": 252.689697265625, + "logits/rejected": 252.87332153320312, + "logps/chosen": -1.2634754180908203, + "logps/rejected": -1.376312255859375, + "loss": 1.3339, + "odds_ratio_loss": 0.7037913799285889, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.12634754180908203, + "rewards/margins": 0.011283671483397484, + "rewards/rejected": -0.13763120770454407, + "sft_loss": 1.2634754180908203, + "step": 880 + }, + { + "epoch": 1.4386744796928672, + "grad_norm": 2.7741122245788574, + "learning_rate": 2.6568579117106143e-06, + "logits/chosen": 251.893310546875, + "logits/rejected": 251.9792022705078, + "logps/chosen": -1.1909462213516235, + "logps/rejected": -1.444592833518982, + "loss": 1.257, + "odds_ratio_loss": 0.6606670618057251, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11909462511539459, + "rewards/margins": 0.025364672765135765, + "rewards/rejected": -0.1444592922925949, + "sft_loss": 1.1909462213516235, + "step": 890 + }, + { + "epoch": 1.454839361487169, + "grad_norm": 1.2793887853622437, + "learning_rate": 2.6145562392432544e-06, + "logits/chosen": 253.50723266601562, + "logits/rejected": 253.42764282226562, + "logps/chosen": -1.2168656587600708, + "logps/rejected": -1.336360216140747, + "loss": 1.2887, + "odds_ratio_loss": 0.7181415557861328, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.12168655544519424, + "rewards/margins": 0.01194946188479662, + "rewards/rejected": -0.13363602757453918, + "sft_loss": 1.2168656587600708, + "step": 900 + }, + { + "epoch": 1.471004243281471, + "grad_norm": 2.857558012008667, + "learning_rate": 2.5722216713516682e-06, + "logits/chosen": 252.78237915039062, + "logits/rejected": 253.788330078125, + "logps/chosen": -1.1416139602661133, + "logps/rejected": -1.3757555484771729, + "loss": 1.2043, + "odds_ratio_loss": 0.6271349787712097, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.11416139453649521, + "rewards/margins": 0.023414146155118942, + "rewards/rejected": -0.13757555186748505, + "sft_loss": 1.1416139602661133, + "step": 910 + }, + { + "epoch": 1.4871691250757728, + "grad_norm": 2.625776529312134, + "learning_rate": 2.5298663646288064e-06, + "logits/chosen": 253.61221313476562, + "logits/rejected": 254.0120391845703, + "logps/chosen": -1.1546480655670166, + "logps/rejected": -1.4036109447479248, + "loss": 1.2201, + "odds_ratio_loss": 0.6542297601699829, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.11546480655670166, + "rewards/margins": 0.024896297603845596, + "rewards/rejected": -0.14036110043525696, + "sft_loss": 1.1546480655670166, + "step": 920 + }, + { + "epoch": 1.503334006870075, + "grad_norm": 3.928030014038086, + "learning_rate": 2.487502481622879e-06, + "logits/chosen": 252.84619140625, + "logits/rejected": 253.71127319335938, + "logps/chosen": -1.2712576389312744, + "logps/rejected": -1.42746901512146, + "loss": 1.3413, + "odds_ratio_loss": 0.7003083229064941, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12712574005126953, + "rewards/margins": 0.01562117226421833, + "rewards/rejected": -0.1427469402551651, + "sft_loss": 1.2712576389312744, + "step": 930 + }, + { + "epoch": 1.5194988886643768, + "grad_norm": 2.4900426864624023, + "learning_rate": 2.4451421873448253e-06, + "logits/chosen": 252.51846313476562, + "logits/rejected": 253.07400512695312, + "logps/chosen": -1.193199634552002, + "logps/rejected": -1.3677222728729248, + "loss": 1.2601, + "odds_ratio_loss": 0.6688076257705688, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.11931997537612915, + "rewards/margins": 0.01745227724313736, + "rewards/rejected": -0.1367722451686859, + "sft_loss": 1.193199634552002, + "step": 940 + }, + { + "epoch": 1.5356637704586786, + "grad_norm": 6.85699987411499, + "learning_rate": 2.40279764577506e-06, + "logits/chosen": 253.85693359375, + "logits/rejected": 253.9010467529297, + "logps/chosen": -1.304840087890625, + "logps/rejected": -1.417873501777649, + "loss": 1.3741, + "odds_ratio_loss": 0.6923686861991882, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.13048401474952698, + "rewards/margins": 0.011303339153528214, + "rewards/rejected": -0.1417873501777649, + "sft_loss": 1.304840087890625, + "step": 950 + }, + { + "epoch": 1.5518286522529805, + "grad_norm": 2.3570547103881836, + "learning_rate": 2.3604810163705242e-06, + "logits/chosen": 253.90060424804688, + "logits/rejected": 254.25430297851562, + "logps/chosen": -1.1358963251113892, + "logps/rejected": -1.3512394428253174, + "loss": 1.1966, + "odds_ratio_loss": 0.6068128943443298, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11358962953090668, + "rewards/margins": 0.021534323692321777, + "rewards/rejected": -0.13512396812438965, + "sft_loss": 1.1358963251113892, + "step": 960 + }, + { + "epoch": 1.5679935340472824, + "grad_norm": 1.6715513467788696, + "learning_rate": 2.3182044505730364e-06, + "logits/chosen": 252.765380859375, + "logits/rejected": 252.7443389892578, + "logps/chosen": -1.0937732458114624, + "logps/rejected": -1.302191972732544, + "loss": 1.1567, + "odds_ratio_loss": 0.6288636922836304, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.10937733948230743, + "rewards/margins": 0.020841870456933975, + "rewards/rejected": -0.13021919131278992, + "sft_loss": 1.0937732458114624, + "step": 970 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 1.8489584922790527, + "learning_rate": 2.275980088319941e-06, + "logits/chosen": 253.30712890625, + "logits/rejected": 253.5155487060547, + "logps/chosen": -1.149460792541504, + "logps/rejected": -1.2745110988616943, + "loss": 1.2198, + "odds_ratio_loss": 0.7036079168319702, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.11494608223438263, + "rewards/margins": 0.012505029328167439, + "rewards/rejected": -0.1274511069059372, + "sft_loss": 1.149460792541504, + "step": 980 + }, + { + "epoch": 1.600323297635886, + "grad_norm": 2.3143341541290283, + "learning_rate": 2.2338200545580577e-06, + "logits/chosen": 253.9146728515625, + "logits/rejected": 254.3609619140625, + "logps/chosen": -1.1358720064163208, + "logps/rejected": -1.409860372543335, + "loss": 1.203, + "odds_ratio_loss": 0.6715231537818909, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.11358718574047089, + "rewards/margins": 0.02739885076880455, + "rewards/rejected": -0.14098605513572693, + "sft_loss": 1.1358720064163208, + "step": 990 + }, + { + "epoch": 1.616488179430188, + "grad_norm": 2.5078933238983154, + "learning_rate": 2.191736455761947e-06, + "logits/chosen": 252.4419708251953, + "logits/rejected": 252.6824493408203, + "logps/chosen": -1.102782964706421, + "logps/rejected": -1.295693039894104, + "loss": 1.1628, + "odds_ratio_loss": 0.5999386310577393, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.11027830839157104, + "rewards/margins": 0.019290992990136147, + "rewards/rejected": -0.12956929206848145, + "sft_loss": 1.102782964706421, + "step": 1000 + }, + { + "epoch": 1.616488179430188, + "eval_logits/chosen": 252.82716369628906, + "eval_logits/rejected": 253.18104553222656, + "eval_logps/chosen": -1.2153432369232178, + "eval_logps/rejected": -1.446128010749817, + "eval_loss": 1.2833058834075928, + "eval_odds_ratio_loss": 0.6796271204948425, + "eval_rewards/accuracies": 0.5618181824684143, + "eval_rewards/chosen": -0.1215343102812767, + "eval_rewards/margins": 0.023078490048646927, + "eval_rewards/rejected": -0.14461281895637512, + "eval_runtime": 221.4361, + "eval_samples_per_second": 4.968, + "eval_sft_loss": 1.2153432369232178, + "eval_steps_per_second": 2.484, + "step": 1000 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 1.7511672973632812, + "learning_rate": 2.1497413764574673e-06, + "logits/chosen": 253.8401336669922, + "logits/rejected": 253.7457733154297, + "logps/chosen": -1.2121939659118652, + "logps/rejected": -1.4931201934814453, + "loss": 1.2703, + "odds_ratio_loss": 0.5808267593383789, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.12121939659118652, + "rewards/margins": 0.028092628344893456, + "rewards/rejected": -0.14931201934814453, + "sft_loss": 1.2121939659118652, + "step": 1010 + }, + { + "epoch": 1.6488179430187917, + "grad_norm": 2.1624321937561035, + "learning_rate": 2.1078468757516395e-06, + "logits/chosen": 252.7372589111328, + "logits/rejected": 253.10342407226562, + "logps/chosen": -1.1226885318756104, + "logps/rejected": -1.302170991897583, + "loss": 1.1845, + "odds_ratio_loss": 0.6178861856460571, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.11226886510848999, + "rewards/margins": 0.017948249354958534, + "rewards/rejected": -0.13021712005138397, + "sft_loss": 1.1226885318756104, + "step": 1020 + }, + { + "epoch": 1.6649828248130936, + "grad_norm": 2.5826563835144043, + "learning_rate": 2.0660649838698145e-06, + "logits/chosen": 255.34326171875, + "logits/rejected": 255.65859985351562, + "logps/chosen": -1.1558864116668701, + "logps/rejected": -1.3295384645462036, + "loss": 1.2211, + "odds_ratio_loss": 0.6525439023971558, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.11558864265680313, + "rewards/margins": 0.01736520044505596, + "rewards/rejected": -0.13295385241508484, + "sft_loss": 1.1558864116668701, + "step": 1030 + }, + { + "epoch": 1.6811477066073954, + "grad_norm": 1.975549340248108, + "learning_rate": 2.0244076987011284e-06, + "logits/chosen": 255.1981964111328, + "logits/rejected": 255.7158966064453, + "logps/chosen": -1.2127221822738647, + "logps/rejected": -1.4685566425323486, + "loss": 1.2727, + "odds_ratio_loss": 0.6000550389289856, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1212722510099411, + "rewards/margins": 0.025583425536751747, + "rewards/rejected": -0.1468556672334671, + "sft_loss": 1.2127221822738647, + "step": 1040 + }, + { + "epoch": 1.6973125884016973, + "grad_norm": 2.224191904067993, + "learning_rate": 1.982886982353251e-06, + "logits/chosen": 252.6818389892578, + "logits/rejected": 252.80859375, + "logps/chosen": -1.193681240081787, + "logps/rejected": -1.44672691822052, + "loss": 1.2608, + "odds_ratio_loss": 0.6715336441993713, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11936812102794647, + "rewards/margins": 0.025304565206170082, + "rewards/rejected": -0.144672691822052, + "sft_loss": 1.193681240081787, + "step": 1050 + }, + { + "epoch": 1.7134774701959992, + "grad_norm": 2.571403980255127, + "learning_rate": 1.941514757717392e-06, + "logits/chosen": 253.2911376953125, + "logits/rejected": 254.0371551513672, + "logps/chosen": -1.2021260261535645, + "logps/rejected": -1.443331003189087, + "loss": 1.2653, + "odds_ratio_loss": 0.6321113705635071, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1202125996351242, + "rewards/margins": 0.02412049099802971, + "rewards/rejected": -0.1443330943584442, + "sft_loss": 1.2021260261535645, + "step": 1060 + }, + { + "epoch": 1.729642351990301, + "grad_norm": 4.061903476715088, + "learning_rate": 1.9003029050445953e-06, + "logits/chosen": 254.00650024414062, + "logits/rejected": 254.38876342773438, + "logps/chosen": -1.2242114543914795, + "logps/rejected": -1.4163745641708374, + "loss": 1.2891, + "odds_ratio_loss": 0.6486276984214783, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.12242114543914795, + "rewards/margins": 0.019216306507587433, + "rewards/rejected": -0.14163745939731598, + "sft_loss": 1.2242114543914795, + "step": 1070 + }, + { + "epoch": 1.745807233784603, + "grad_norm": 2.371570110321045, + "learning_rate": 1.8592632585342523e-06, + "logits/chosen": 254.29745483398438, + "logits/rejected": 254.67745971679688, + "logps/chosen": -1.1612073183059692, + "logps/rejected": -1.4247183799743652, + "loss": 1.2246, + "odds_ratio_loss": 0.6341406106948853, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.11612071841955185, + "rewards/margins": 0.026351114735007286, + "rewards/rejected": -0.1424718201160431, + "sft_loss": 1.1612073183059692, + "step": 1080 + }, + { + "epoch": 1.7619721155789048, + "grad_norm": 8.819137573242188, + "learning_rate": 1.8184076029358527e-06, + "logits/chosen": 253.06661987304688, + "logits/rejected": 252.38119506835938, + "logps/chosen": -1.161278486251831, + "logps/rejected": -1.2557886838912964, + "loss": 1.2286, + "odds_ratio_loss": 0.6734786033630371, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1161278635263443, + "rewards/margins": 0.009451002813875675, + "rewards/rejected": -0.1255788505077362, + "sft_loss": 1.161278486251831, + "step": 1090 + }, + { + "epoch": 1.7781369973732066, + "grad_norm": 1.7281618118286133, + "learning_rate": 1.7777476701649318e-06, + "logits/chosen": 251.3661651611328, + "logits/rejected": 252.11477661132812, + "logps/chosen": -1.1861474514007568, + "logps/rejected": -1.3936630487442017, + "loss": 1.2518, + "odds_ratio_loss": 0.6561599373817444, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.11861474812030792, + "rewards/margins": 0.020751552656292915, + "rewards/rejected": -0.1393662989139557, + "sft_loss": 1.1861474514007568, + "step": 1100 + }, + { + "epoch": 1.7943018791675085, + "grad_norm": 3.3538103103637695, + "learning_rate": 1.7372951359341925e-06, + "logits/chosen": 253.0167236328125, + "logits/rejected": 253.53396606445312, + "logps/chosen": -1.137481451034546, + "logps/rejected": -1.2894176244735718, + "loss": 1.2055, + "odds_ratio_loss": 0.6805364489555359, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.11374815553426743, + "rewards/margins": 0.01519359927624464, + "rewards/rejected": -0.12894175946712494, + "sft_loss": 1.137481451034546, + "step": 1110 + }, + { + "epoch": 1.8104667609618104, + "grad_norm": 3.6225833892822266, + "learning_rate": 1.6970616164007547e-06, + "logits/chosen": 252.6470489501953, + "logits/rejected": 252.9353485107422, + "logps/chosen": -1.1084340810775757, + "logps/rejected": -1.3413022756576538, + "loss": 1.1728, + "odds_ratio_loss": 0.6432042717933655, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11084340512752533, + "rewards/margins": 0.02328682318329811, + "rewards/rejected": -0.13413023948669434, + "sft_loss": 1.1084340810775757, + "step": 1120 + }, + { + "epoch": 1.8266316427561122, + "grad_norm": 4.332692623138428, + "learning_rate": 1.6570586648305276e-06, + "logits/chosen": 253.6255645751953, + "logits/rejected": 253.76217651367188, + "logps/chosen": -1.1925103664398193, + "logps/rejected": -1.4342319965362549, + "loss": 1.2579, + "odds_ratio_loss": 0.6541949510574341, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.11925105005502701, + "rewards/margins": 0.024172160774469376, + "rewards/rejected": -0.14342321455478668, + "sft_loss": 1.1925103664398193, + "step": 1130 + }, + { + "epoch": 1.842796524550414, + "grad_norm": 3.238105535507202, + "learning_rate": 1.6172977682806151e-06, + "logits/chosen": 253.7568817138672, + "logits/rejected": 254.8863525390625, + "logps/chosen": -1.2200841903686523, + "logps/rejected": -1.4592787027359009, + "loss": 1.2837, + "odds_ratio_loss": 0.6365170478820801, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.12200842052698135, + "rewards/margins": 0.023919429630041122, + "rewards/rejected": -0.14592786133289337, + "sft_loss": 1.2200841903686523, + "step": 1140 + }, + { + "epoch": 1.858961406344716, + "grad_norm": 2.5219290256500244, + "learning_rate": 1.5777903443007586e-06, + "logits/chosen": 253.631103515625, + "logits/rejected": 253.82504272460938, + "logps/chosen": -1.235215425491333, + "logps/rejected": -1.4535129070281982, + "loss": 1.3023, + "odds_ratio_loss": 0.6708552241325378, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.12352155148983002, + "rewards/margins": 0.021829739212989807, + "rewards/rejected": -0.14535130560398102, + "sft_loss": 1.235215425491333, + "step": 1150 + }, + { + "epoch": 1.8751262881390178, + "grad_norm": 3.190958261489868, + "learning_rate": 1.5385477376547226e-06, + "logits/chosen": 255.1521759033203, + "logits/rejected": 255.2525634765625, + "logps/chosen": -1.229001760482788, + "logps/rejected": -1.4793845415115356, + "loss": 1.2891, + "odds_ratio_loss": 0.601111888885498, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.12290020287036896, + "rewards/margins": 0.02503824792802334, + "rewards/rejected": -0.14793843030929565, + "sft_loss": 1.229001760482788, + "step": 1160 + }, + { + "epoch": 1.89129116993332, + "grad_norm": 2.217510461807251, + "learning_rate": 1.4995812170625845e-06, + "logits/chosen": 253.1751251220703, + "logits/rejected": 253.72238159179688, + "logps/chosen": -1.2252581119537354, + "logps/rejected": -1.5921032428741455, + "loss": 1.2851, + "odds_ratio_loss": 0.598137617111206, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.12252581119537354, + "rewards/margins": 0.036684513092041016, + "rewards/rejected": -0.15921030938625336, + "sft_loss": 1.2252581119537354, + "step": 1170 + }, + { + "epoch": 1.9074560517276218, + "grad_norm": 3.0452287197113037, + "learning_rate": 1.4609019719648666e-06, + "logits/chosen": 254.07901000976562, + "logits/rejected": 254.59957885742188, + "logps/chosen": -1.2207356691360474, + "logps/rejected": -1.4706141948699951, + "loss": 1.2826, + "odds_ratio_loss": 0.6183902025222778, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.12207356840372086, + "rewards/margins": 0.024987850338220596, + "rewards/rejected": -0.14706142246723175, + "sft_loss": 1.2207356691360474, + "step": 1180 + }, + { + "epoch": 1.9236209335219236, + "grad_norm": 4.679479122161865, + "learning_rate": 1.42252110930943e-06, + "logits/chosen": 252.7305450439453, + "logits/rejected": 252.6374969482422, + "logps/chosen": -1.064835786819458, + "logps/rejected": -1.2910759449005127, + "loss": 1.1283, + "odds_ratio_loss": 0.6346200704574585, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1064835786819458, + "rewards/margins": 0.02262401580810547, + "rewards/rejected": -0.12910759449005127, + "sft_loss": 1.064835786819458, + "step": 1190 + }, + { + "epoch": 1.9397858153162255, + "grad_norm": 3.286461353302002, + "learning_rate": 1.3844496503620493e-06, + "logits/chosen": 253.310302734375, + "logits/rejected": 253.27023315429688, + "logps/chosen": -1.2112998962402344, + "logps/rejected": -1.3967139720916748, + "loss": 1.2737, + "odds_ratio_loss": 0.6242542862892151, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.12112998962402344, + "rewards/margins": 0.01854141615331173, + "rewards/rejected": -0.1396714150905609, + "sft_loss": 1.2112998962402344, + "step": 1200 + }, + { + "epoch": 1.9559506971105274, + "grad_norm": 2.8077545166015625, + "learning_rate": 1.3466985275416081e-06, + "logits/chosen": 254.2769775390625, + "logits/rejected": 254.47360229492188, + "logps/chosen": -1.2563018798828125, + "logps/rejected": -1.4514508247375488, + "loss": 1.3239, + "odds_ratio_loss": 0.6757391691207886, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1256301999092102, + "rewards/margins": 0.019514882937073708, + "rewards/rejected": -0.14514507353305817, + "sft_loss": 1.2563018798828125, + "step": 1210 + }, + { + "epoch": 1.9721155789048292, + "grad_norm": 2.275397777557373, + "learning_rate": 1.309278581280791e-06, + "logits/chosen": 253.5750274658203, + "logits/rejected": 253.99935913085938, + "logps/chosen": -1.1356334686279297, + "logps/rejected": -1.4314597845077515, + "loss": 1.1934, + "odds_ratio_loss": 0.5772610902786255, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11356334388256073, + "rewards/margins": 0.029582645744085312, + "rewards/rejected": -0.14314597845077515, + "sft_loss": 1.1356334686279297, + "step": 1220 + }, + { + "epoch": 1.9882804606991311, + "grad_norm": 1.454276204109192, + "learning_rate": 1.272200556913199e-06, + "logits/chosen": 254.544677734375, + "logits/rejected": 254.67251586914062, + "logps/chosen": -1.1884077787399292, + "logps/rejected": -1.395115613937378, + "loss": 1.2599, + "odds_ratio_loss": 0.7147720456123352, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1188407689332962, + "rewards/margins": 0.020670795813202858, + "rewards/rejected": -0.1395115852355957, + "sft_loss": 1.1884077787399292, + "step": 1230 + }, + { + "epoch": 2.004445342493433, + "grad_norm": 3.6475422382354736, + "learning_rate": 1.2354751015877698e-06, + "logits/chosen": 252.74777221679688, + "logits/rejected": 253.66641235351562, + "logps/chosen": -1.1167339086532593, + "logps/rejected": -1.4450454711914062, + "loss": 1.1791, + "odds_ratio_loss": 0.6237870454788208, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.11167339980602264, + "rewards/margins": 0.03283114731311798, + "rewards/rejected": -0.14450454711914062, + "sft_loss": 1.1167339086532593, + "step": 1240 + }, + { + "epoch": 2.020610224287735, + "grad_norm": 3.52698016166687, + "learning_rate": 1.1991127612113945e-06, + "logits/chosen": 254.6741943359375, + "logits/rejected": 254.9825897216797, + "logps/chosen": -1.1792643070220947, + "logps/rejected": -1.4326034784317017, + "loss": 1.2387, + "odds_ratio_loss": 0.5942111611366272, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.11792641878128052, + "rewards/margins": 0.02533392235636711, + "rewards/rejected": -0.14326035976409912, + "sft_loss": 1.1792643070220947, + "step": 1250 + }, + { + "epoch": 2.036775106082037, + "grad_norm": 3.579160690307617, + "learning_rate": 1.1631239774206035e-06, + "logits/chosen": 253.5153350830078, + "logits/rejected": 253.659912109375, + "logps/chosen": -1.1673438549041748, + "logps/rejected": -1.4458467960357666, + "loss": 1.2314, + "odds_ratio_loss": 0.6410170793533325, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.11673440039157867, + "rewards/margins": 0.027850273996591568, + "rewards/rejected": -0.14458468556404114, + "sft_loss": 1.1673438549041748, + "step": 1260 + }, + { + "epoch": 2.052939987876339, + "grad_norm": 3.0812463760375977, + "learning_rate": 1.1275190845831978e-06, + "logits/chosen": 254.5985870361328, + "logits/rejected": 254.2525177001953, + "logps/chosen": -1.1342524290084839, + "logps/rejected": -1.3948237895965576, + "loss": 1.1925, + "odds_ratio_loss": 0.5824798345565796, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11342523247003555, + "rewards/margins": 0.026057133451104164, + "rewards/rejected": -0.13948237895965576, + "sft_loss": 1.1342524290084839, + "step": 1270 + }, + { + "epoch": 2.0691048696706407, + "grad_norm": 2.4549710750579834, + "learning_rate": 1.0923083068306778e-06, + "logits/chosen": 254.7982635498047, + "logits/rejected": 255.1488494873047, + "logps/chosen": -1.1482160091400146, + "logps/rejected": -1.4850049018859863, + "loss": 1.2055, + "odds_ratio_loss": 0.5724589824676514, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.11482159048318863, + "rewards/margins": 0.033678896725177765, + "rewards/rejected": -0.1485004872083664, + "sft_loss": 1.1482160091400146, + "step": 1280 + }, + { + "epoch": 2.0852697514649425, + "grad_norm": 1.778605580329895, + "learning_rate": 1.0575017551223348e-06, + "logits/chosen": 253.03524780273438, + "logits/rejected": 253.53366088867188, + "logps/chosen": -1.087461233139038, + "logps/rejected": -1.321656584739685, + "loss": 1.1524, + "odds_ratio_loss": 0.6496065855026245, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.10874611139297485, + "rewards/margins": 0.023419544100761414, + "rewards/rejected": -0.13216565549373627, + "sft_loss": 1.087461233139038, + "step": 1290 + }, + { + "epoch": 2.1014346332592444, + "grad_norm": 1.522445797920227, + "learning_rate": 1.023109424341833e-06, + "logits/chosen": 254.5054168701172, + "logits/rejected": 255.06100463867188, + "logps/chosen": -1.2142359018325806, + "logps/rejected": -1.448194146156311, + "loss": 1.2781, + "odds_ratio_loss": 0.6386287808418274, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12142357975244522, + "rewards/margins": 0.023395827040076256, + "rewards/rejected": -0.14481940865516663, + "sft_loss": 1.2142359018325806, + "step": 1300 + }, + { + "epoch": 2.1175995150535463, + "grad_norm": 2.577580690383911, + "learning_rate": 9.891411904271273e-07, + "logits/chosen": 254.14779663085938, + "logits/rejected": 254.121826171875, + "logps/chosen": -1.100303292274475, + "logps/rejected": -1.3276548385620117, + "loss": 1.1632, + "odds_ratio_loss": 0.628852128982544, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11003033071756363, + "rewards/margins": 0.02273516170680523, + "rewards/rejected": -0.1327655017375946, + "sft_loss": 1.100303292274475, + "step": 1310 + }, + { + "epoch": 2.133764396847848, + "grad_norm": 1.5676363706588745, + "learning_rate": 9.556068075345363e-07, + "logits/chosen": 255.0603485107422, + "logits/rejected": 255.1541748046875, + "logps/chosen": -1.1494947671890259, + "logps/rejected": -1.3283547163009644, + "loss": 1.2112, + "odds_ratio_loss": 0.6172733306884766, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.11494947969913483, + "rewards/margins": 0.017885997891426086, + "rewards/rejected": -0.1328354775905609, + "sft_loss": 1.1494947671890259, + "step": 1320 + }, + { + "epoch": 2.14992927864215, + "grad_norm": 1.964956521987915, + "learning_rate": 9.225159052377838e-07, + "logits/chosen": 254.16183471679688, + "logits/rejected": 254.3532257080078, + "logps/chosen": -1.1823852062225342, + "logps/rejected": -1.4285701513290405, + "loss": 1.2468, + "odds_ratio_loss": 0.6443654894828796, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.11823852360248566, + "rewards/margins": 0.024618491530418396, + "rewards/rejected": -0.14285701513290405, + "sft_loss": 1.1823852062225342, + "step": 1330 + }, + { + "epoch": 2.166094160436452, + "grad_norm": 4.320827484130859, + "learning_rate": 8.898779857628184e-07, + "logits/chosen": 253.94775390625, + "logits/rejected": 253.83328247070312, + "logps/chosen": -1.0813744068145752, + "logps/rejected": -1.289052963256836, + "loss": 1.1442, + "odds_ratio_loss": 0.6285432577133179, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.10813745111227036, + "rewards/margins": 0.020767847076058388, + "rewards/rejected": -0.1289052963256836, + "sft_loss": 1.0813744068145752, + "step": 1340 + }, + { + "epoch": 2.1822590422307537, + "grad_norm": 1.9166721105575562, + "learning_rate": 8.577024212591975e-07, + "logits/chosen": 255.42715454101562, + "logits/rejected": 255.6570281982422, + "logps/chosen": -1.2112232446670532, + "logps/rejected": -1.4022705554962158, + "loss": 1.2754, + "odds_ratio_loss": 0.6421025991439819, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.1211223155260086, + "rewards/margins": 0.019104719161987305, + "rewards/rejected": -0.1402270495891571, + "sft_loss": 1.2112232446670532, + "step": 1350 + }, + { + "epoch": 2.1984239240250556, + "grad_norm": 2.231593370437622, + "learning_rate": 8.259984511088276e-07, + "logits/chosen": 252.95217895507812, + "logits/rejected": 253.24972534179688, + "logps/chosen": -1.1978521347045898, + "logps/rejected": -1.4085915088653564, + "loss": 1.2643, + "odds_ratio_loss": 0.6643570065498352, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11978521198034286, + "rewards/margins": 0.021073944866657257, + "rewards/rejected": -0.14085917174816132, + "sft_loss": 1.1978521347045898, + "step": 1360 + }, + { + "epoch": 2.2145888058193575, + "grad_norm": 1.891438364982605, + "learning_rate": 7.947751792728237e-07, + "logits/chosen": 252.89163208007812, + "logits/rejected": 252.9368133544922, + "logps/chosen": -1.1386100053787231, + "logps/rejected": -1.3931357860565186, + "loss": 1.2001, + "odds_ratio_loss": 0.6149393320083618, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.11386100947856903, + "rewards/margins": 0.02545258030295372, + "rewards/rejected": -0.13931360840797424, + "sft_loss": 1.1386100053787231, + "step": 1370 + }, + { + "epoch": 2.2307536876136593, + "grad_norm": 11.893668174743652, + "learning_rate": 7.640415716772626e-07, + "logits/chosen": 254.87893676757812, + "logits/rejected": 254.9901123046875, + "logps/chosen": -1.2301312685012817, + "logps/rejected": -1.4715359210968018, + "loss": 1.2969, + "odds_ratio_loss": 0.6676316857337952, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12301311641931534, + "rewards/margins": 0.02414046786725521, + "rewards/rejected": -0.1471536010503769, + "sft_loss": 1.2301312685012817, + "step": 1380 + }, + { + "epoch": 2.246918569407961, + "grad_norm": 1.648759365081787, + "learning_rate": 7.338064536385722e-07, + "logits/chosen": 253.27536010742188, + "logits/rejected": 253.39450073242188, + "logps/chosen": -1.172890543937683, + "logps/rejected": -1.4573774337768555, + "loss": 1.2306, + "odds_ratio_loss": 0.5775946974754333, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11728904396295547, + "rewards/margins": 0.02844870649278164, + "rewards/rejected": -0.14573773741722107, + "sft_loss": 1.172890543937683, + "step": 1390 + }, + { + "epoch": 2.263083451202263, + "grad_norm": 2.644590377807617, + "learning_rate": 7.040785073292883e-07, + "logits/chosen": 254.41006469726562, + "logits/rejected": 254.5663299560547, + "logps/chosen": -1.243436336517334, + "logps/rejected": -1.455594778060913, + "loss": 1.3115, + "odds_ratio_loss": 0.6802859902381897, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.12434364855289459, + "rewards/margins": 0.021215861663222313, + "rewards/rejected": -0.14555948972702026, + "sft_loss": 1.243436336517334, + "step": 1400 + }, + { + "epoch": 2.279248332996565, + "grad_norm": 3.2420878410339355, + "learning_rate": 6.748662692849297e-07, + "logits/chosen": 253.18417358398438, + "logits/rejected": 254.11279296875, + "logps/chosen": -1.1471028327941895, + "logps/rejected": -1.5119264125823975, + "loss": 1.2055, + "odds_ratio_loss": 0.5840214490890503, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11471028625965118, + "rewards/margins": 0.03648235648870468, + "rewards/rejected": -0.15119265019893646, + "sft_loss": 1.1471028327941895, + "step": 1410 + }, + { + "epoch": 2.295413214790867, + "grad_norm": 4.394900798797607, + "learning_rate": 6.46178127952686e-07, + "logits/chosen": 254.48062133789062, + "logits/rejected": 254.9115753173828, + "logps/chosen": -1.1684550046920776, + "logps/rejected": -1.38850998878479, + "loss": 1.2286, + "odds_ratio_loss": 0.601581871509552, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1168455109000206, + "rewards/margins": 0.022005509585142136, + "rewards/rejected": -0.13885101675987244, + "sft_loss": 1.1684550046920776, + "step": 1420 + }, + { + "epoch": 2.3115780965851687, + "grad_norm": 2.162309169769287, + "learning_rate": 6.180223212826289e-07, + "logits/chosen": 253.58633422851562, + "logits/rejected": 253.8734588623047, + "logps/chosen": -1.1496318578720093, + "logps/rejected": -1.364654779434204, + "loss": 1.213, + "odds_ratio_loss": 0.633824348449707, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.11496319621801376, + "rewards/margins": 0.021502288058400154, + "rewards/rejected": -0.13646547496318817, + "sft_loss": 1.1496318578720093, + "step": 1430 + }, + { + "epoch": 2.3277429783794705, + "grad_norm": 1.522935152053833, + "learning_rate": 5.904069343621443e-07, + "logits/chosen": 255.19082641601562, + "logits/rejected": 255.11474609375, + "logps/chosen": -1.1330249309539795, + "logps/rejected": -1.386264443397522, + "loss": 1.195, + "odds_ratio_loss": 0.6193984746932983, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.11330248415470123, + "rewards/margins": 0.025323981419205666, + "rewards/rejected": -0.13862647116184235, + "sft_loss": 1.1330249309539795, + "step": 1440 + }, + { + "epoch": 2.3439078601737724, + "grad_norm": 2.982042074203491, + "learning_rate": 5.633398970942544e-07, + "logits/chosen": 254.9903564453125, + "logits/rejected": 255.1248321533203, + "logps/chosen": -1.1471365690231323, + "logps/rejected": -1.3323371410369873, + "loss": 1.2137, + "odds_ratio_loss": 0.6657688617706299, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.114713653922081, + "rewards/margins": 0.01852005161345005, + "rewards/rejected": -0.1332337111234665, + "sft_loss": 1.1471365690231323, + "step": 1450 + }, + { + "epoch": 2.3600727419680743, + "grad_norm": 3.2461607456207275, + "learning_rate": 5.368289819205069e-07, + "logits/chosen": 254.27847290039062, + "logits/rejected": 255.0779571533203, + "logps/chosen": -1.11297607421875, + "logps/rejected": -1.3122992515563965, + "loss": 1.1805, + "odds_ratio_loss": 0.6752298474311829, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.1112975925207138, + "rewards/margins": 0.019932324066758156, + "rewards/rejected": -0.1312299221754074, + "sft_loss": 1.11297607421875, + "step": 1460 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 2.7223591804504395, + "learning_rate": 5.108818015890785e-07, + "logits/chosen": 255.47216796875, + "logits/rejected": 255.6534423828125, + "logps/chosen": -1.2367959022521973, + "logps/rejected": -1.4369171857833862, + "loss": 1.3029, + "odds_ratio_loss": 0.6609222888946533, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.12367959320545197, + "rewards/margins": 0.020012129098176956, + "rewards/rejected": -0.14369171857833862, + "sft_loss": 1.2367959022521973, + "step": 1470 + }, + { + "epoch": 2.392402505556678, + "grad_norm": 2.912327527999878, + "learning_rate": 4.855058069687291e-07, + "logits/chosen": 253.0759735107422, + "logits/rejected": 253.6752471923828, + "logps/chosen": -1.111169695854187, + "logps/rejected": -1.4328919649124146, + "loss": 1.1697, + "odds_ratio_loss": 0.5851289629936218, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.11111698299646378, + "rewards/margins": 0.03217221051454544, + "rewards/rejected": -0.1432892084121704, + "sft_loss": 1.111169695854187, + "step": 1480 + }, + { + "epoch": 2.40856738735098, + "grad_norm": 2.995020627975464, + "learning_rate": 4.607082849092523e-07, + "logits/chosen": 253.9425811767578, + "logits/rejected": 254.0526580810547, + "logps/chosen": -1.2607060670852661, + "logps/rejected": -1.4026780128479004, + "loss": 1.3291, + "odds_ratio_loss": 0.6835006475448608, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12607058882713318, + "rewards/margins": 0.014197212643921375, + "rewards/rejected": -0.14026781916618347, + "sft_loss": 1.2607060670852661, + "step": 1490 + }, + { + "epoch": 2.4247322691452817, + "grad_norm": 3.760835886001587, + "learning_rate": 4.3649635614901405e-07, + "logits/chosen": 254.07601928710938, + "logits/rejected": 254.50985717773438, + "logps/chosen": -1.1233417987823486, + "logps/rejected": -1.2859314680099487, + "loss": 1.1874, + "odds_ratio_loss": 0.6403074860572815, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.11233416944742203, + "rewards/margins": 0.01625899039208889, + "rewards/rejected": -0.12859316170215607, + "sft_loss": 1.1233417987823486, + "step": 1500 + }, + { + "epoch": 2.4247322691452817, + "eval_logits/chosen": 253.6037139892578, + "eval_logits/rejected": 253.95994567871094, + "eval_logps/chosen": -1.1982638835906982, + "eval_logps/rejected": -1.4377323389053345, + "eval_loss": 1.265723466873169, + "eval_odds_ratio_loss": 0.6745957732200623, + "eval_rewards/accuracies": 0.5699999928474426, + "eval_rewards/chosen": -0.11982638388872147, + "eval_rewards/margins": 0.023946860805153847, + "eval_rewards/rejected": -0.14377322793006897, + "eval_runtime": 221.0804, + "eval_samples_per_second": 4.976, + "eval_sft_loss": 1.1982638835906982, + "eval_steps_per_second": 2.488, + "step": 1500 + }, + { + "epoch": 2.4408971509395836, + "grad_norm": 2.074381113052368, + "learning_rate": 4.128769732701973e-07, + "logits/chosen": 254.7397918701172, + "logits/rejected": 254.7943572998047, + "logps/chosen": -1.1791332960128784, + "logps/rejected": -1.4214551448822021, + "loss": 1.2445, + "odds_ratio_loss": 0.6540807485580444, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.11791334301233292, + "rewards/margins": 0.02423218823969364, + "rewards/rejected": -0.14214551448822021, + "sft_loss": 1.1791332960128784, + "step": 1510 + }, + { + "epoch": 2.4570620327338855, + "grad_norm": 3.3741965293884277, + "learning_rate": 3.8985691870233046e-07, + "logits/chosen": 254.44534301757812, + "logits/rejected": 254.93307495117188, + "logps/chosen": -1.209789514541626, + "logps/rejected": -1.4953523874282837, + "loss": 1.2744, + "odds_ratio_loss": 0.645904004573822, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1209789514541626, + "rewards/margins": 0.02855629101395607, + "rewards/rejected": -0.14953525364398956, + "sft_loss": 1.209789514541626, + "step": 1520 + }, + { + "epoch": 2.4732269145281873, + "grad_norm": 4.19984245300293, + "learning_rate": 3.6744280277467904e-07, + "logits/chosen": 253.19186401367188, + "logits/rejected": 253.959228515625, + "logps/chosen": -1.1898527145385742, + "logps/rejected": -1.418869972229004, + "loss": 1.2551, + "odds_ratio_loss": 0.6521813273429871, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.11898528039455414, + "rewards/margins": 0.022901728749275208, + "rewards/rejected": -0.14188699424266815, + "sft_loss": 1.1898527145385742, + "step": 1530 + }, + { + "epoch": 2.489391796322489, + "grad_norm": 3.145732879638672, + "learning_rate": 3.456410618180503e-07, + "logits/chosen": 252.92282104492188, + "logits/rejected": 253.6022491455078, + "logps/chosen": -1.0681793689727783, + "logps/rejected": -1.446299433708191, + "loss": 1.1295, + "odds_ratio_loss": 0.6130428910255432, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.10681793838739395, + "rewards/margins": 0.0378120057284832, + "rewards/rejected": -0.14462995529174805, + "sft_loss": 1.0681793689727783, + "step": 1540 + }, + { + "epoch": 2.5055566781167915, + "grad_norm": 2.4530389308929443, + "learning_rate": 3.244579563165753e-07, + "logits/chosen": 252.8817138671875, + "logits/rejected": 253.0416259765625, + "logps/chosen": -1.1208980083465576, + "logps/rejected": -1.4661897420883179, + "loss": 1.1805, + "odds_ratio_loss": 0.5958081483840942, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.11208979785442352, + "rewards/margins": 0.034529171884059906, + "rewards/rejected": -0.14661899209022522, + "sft_loss": 1.1208980083465576, + "step": 1550 + }, + { + "epoch": 2.521721559911093, + "grad_norm": 1.9151691198349, + "learning_rate": 3.038995691099697e-07, + "logits/chosen": 252.8979949951172, + "logits/rejected": 253.34262084960938, + "logps/chosen": -1.2310686111450195, + "logps/rejected": -1.493896484375, + "loss": 1.2954, + "odds_ratio_loss": 0.6432778239250183, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.12310687452554703, + "rewards/margins": 0.02628278359770775, + "rewards/rejected": -0.14938965439796448, + "sft_loss": 1.2310686111450195, + "step": 1560 + }, + { + "epoch": 2.5378864417053952, + "grad_norm": 4.0270304679870605, + "learning_rate": 2.839718036468192e-07, + "logits/chosen": 255.1189422607422, + "logits/rejected": 255.6085662841797, + "logps/chosen": -1.2376972436904907, + "logps/rejected": -1.4336402416229248, + "loss": 1.306, + "odds_ratio_loss": 0.6828280091285706, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.12376973778009415, + "rewards/margins": 0.019594285637140274, + "rewards/rejected": -0.14336401224136353, + "sft_loss": 1.2376972436904907, + "step": 1570 + }, + { + "epoch": 2.5540513234996967, + "grad_norm": 2.8464980125427246, + "learning_rate": 2.646803822893723e-07, + "logits/chosen": 254.5944366455078, + "logits/rejected": 254.6045379638672, + "logps/chosen": -1.1911519765853882, + "logps/rejected": -1.4335906505584717, + "loss": 1.255, + "odds_ratio_loss": 0.6387141346931458, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1191151887178421, + "rewards/margins": 0.024243878200650215, + "rewards/rejected": -0.14335909485816956, + "sft_loss": 1.1911519765853882, + "step": 1580 + }, + { + "epoch": 2.570216205293999, + "grad_norm": 2.3468587398529053, + "learning_rate": 2.460308446703341e-07, + "logits/chosen": 255.0273895263672, + "logits/rejected": 255.33963012695312, + "logps/chosen": -1.1678217649459839, + "logps/rejected": -1.3280802965164185, + "loss": 1.2319, + "odds_ratio_loss": 0.6405949592590332, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.11678217351436615, + "rewards/margins": 0.016025854274630547, + "rewards/rejected": -0.13280804455280304, + "sft_loss": 1.1678217649459839, + "step": 1590 + }, + { + "epoch": 2.5863810870883004, + "grad_norm": 2.52286434173584, + "learning_rate": 2.2802854610213143e-07, + "logits/chosen": 253.905517578125, + "logits/rejected": 254.2148895263672, + "logps/chosen": -1.0991406440734863, + "logps/rejected": -1.5185635089874268, + "loss": 1.1551, + "odds_ratio_loss": 0.5599113702774048, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.10991404950618744, + "rewards/margins": 0.041942298412323, + "rewards/rejected": -0.15185634791851044, + "sft_loss": 1.0991406440734863, + "step": 1600 + }, + { + "epoch": 2.6025459688826027, + "grad_norm": 6.791391849517822, + "learning_rate": 2.106786560391072e-07, + "logits/chosen": 253.763671875, + "logits/rejected": 253.8026580810547, + "logps/chosen": -1.2003083229064941, + "logps/rejected": -1.4066941738128662, + "loss": 1.2644, + "odds_ratio_loss": 0.6404808163642883, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.12003083527088165, + "rewards/margins": 0.020638594403862953, + "rewards/rejected": -0.14066943526268005, + "sft_loss": 1.2003083229064941, + "step": 1610 + }, + { + "epoch": 2.6187108506769046, + "grad_norm": 2.421247720718384, + "learning_rate": 1.9398615659308255e-07, + "logits/chosen": 254.5044403076172, + "logits/rejected": 255.14804077148438, + "logps/chosen": -1.145989179611206, + "logps/rejected": -1.2900917530059814, + "loss": 1.2124, + "odds_ratio_loss": 0.663682222366333, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.11459891498088837, + "rewards/margins": 0.014410244300961494, + "rewards/rejected": -0.1290091574192047, + "sft_loss": 1.145989179611206, + "step": 1620 + }, + { + "epoch": 2.6348757324712064, + "grad_norm": 2.5792062282562256, + "learning_rate": 1.7795584110272184e-07, + "logits/chosen": 254.8577117919922, + "logits/rejected": 254.72903442382812, + "logps/chosen": -1.1733181476593018, + "logps/rejected": -1.3872268199920654, + "loss": 1.2372, + "odds_ratio_loss": 0.6392764449119568, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11733181774616241, + "rewards/margins": 0.021390849724411964, + "rewards/rejected": -0.13872265815734863, + "sft_loss": 1.1733181476593018, + "step": 1630 + }, + { + "epoch": 2.6510406142655083, + "grad_norm": 4.67177152633667, + "learning_rate": 1.6259231275709636e-07, + "logits/chosen": 254.6771697998047, + "logits/rejected": 254.81906127929688, + "logps/chosen": -1.1654198169708252, + "logps/rejected": -1.331291913986206, + "loss": 1.2329, + "odds_ratio_loss": 0.6746650338172913, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.11654196679592133, + "rewards/margins": 0.016587218269705772, + "rewards/rejected": -0.13312919437885284, + "sft_loss": 1.1654198169708252, + "step": 1640 + }, + { + "epoch": 2.66720549605981, + "grad_norm": 2.415234327316284, + "learning_rate": 1.478999832738548e-07, + "logits/chosen": 253.84658813476562, + "logits/rejected": 254.2846221923828, + "logps/chosen": -1.1557317972183228, + "logps/rejected": -1.4208415746688843, + "loss": 1.2199, + "odds_ratio_loss": 0.6416669487953186, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.11557319015264511, + "rewards/margins": 0.026510965079069138, + "rewards/rejected": -0.14208415150642395, + "sft_loss": 1.1557317972183228, + "step": 1650 + }, + { + "epoch": 2.683370377854112, + "grad_norm": 2.4805846214294434, + "learning_rate": 1.338830716323769e-07, + "logits/chosen": 253.04605102539062, + "logits/rejected": 253.3199005126953, + "logps/chosen": -1.1211105585098267, + "logps/rejected": -1.3149446249008179, + "loss": 1.1846, + "odds_ratio_loss": 0.6351101994514465, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.11211104691028595, + "rewards/margins": 0.019383419305086136, + "rewards/rejected": -0.13149447739124298, + "sft_loss": 1.1211105585098267, + "step": 1660 + }, + { + "epoch": 2.699535259648414, + "grad_norm": 4.832085609436035, + "learning_rate": 1.205456028622723e-07, + "logits/chosen": 254.3978729248047, + "logits/rejected": 254.5135498046875, + "logps/chosen": -1.0987465381622314, + "logps/rejected": -1.4824903011322021, + "loss": 1.157, + "odds_ratio_loss": 0.582126259803772, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1098746508359909, + "rewards/margins": 0.038374386727809906, + "rewards/rejected": -0.1482490599155426, + "sft_loss": 1.0987465381622314, + "step": 1670 + }, + { + "epoch": 2.7157001414427158, + "grad_norm": 1.8406291007995605, + "learning_rate": 1.0789140688756805e-07, + "logits/chosen": 254.9768524169922, + "logits/rejected": 255.4145965576172, + "logps/chosen": -1.1487500667572021, + "logps/rejected": -1.416771650314331, + "loss": 1.2068, + "odds_ratio_loss": 0.5807241201400757, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.11487500369548798, + "rewards/margins": 0.026802152395248413, + "rewards/rejected": -0.14167717099189758, + "sft_loss": 1.1487500667572021, + "step": 1680 + }, + { + "epoch": 2.7318650232370176, + "grad_norm": 9.125133514404297, + "learning_rate": 9.592411742693098e-08, + "logits/chosen": 253.77490234375, + "logits/rejected": 253.99368286132812, + "logps/chosen": -1.2172834873199463, + "logps/rejected": -1.371734857559204, + "loss": 1.2881, + "odds_ratio_loss": 0.7082632780075073, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.12172834575176239, + "rewards/margins": 0.015445133671164513, + "rewards/rejected": -0.13717348873615265, + "sft_loss": 1.2172834873199463, + "step": 1690 + }, + { + "epoch": 2.7480299050313195, + "grad_norm": 1.7780921459197998, + "learning_rate": 8.464717095022168e-08, + "logits/chosen": 251.9024658203125, + "logits/rejected": 253.18222045898438, + "logps/chosen": -1.1575608253479004, + "logps/rejected": -1.4255679845809937, + "loss": 1.2219, + "odds_ratio_loss": 0.6432427167892456, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.115756094455719, + "rewards/margins": 0.026800716295838356, + "rewards/rejected": -0.1425568014383316, + "sft_loss": 1.1575608253479004, + "step": 1700 + }, + { + "epoch": 2.7641947868256214, + "grad_norm": 2.707679271697998, + "learning_rate": 7.406380569169841e-08, + "logits/chosen": 254.63046264648438, + "logits/rejected": 255.2324676513672, + "logps/chosen": -1.21084725856781, + "logps/rejected": -1.339179277420044, + "loss": 1.278, + "odds_ratio_loss": 0.6712638139724731, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.12108473479747772, + "rewards/margins": 0.012833192944526672, + "rewards/rejected": -0.1339179426431656, + "sft_loss": 1.21084725856781, + "step": 1710 + }, + { + "epoch": 2.7803596686199232, + "grad_norm": 9.321467399597168, + "learning_rate": 6.417706072013808e-08, + "logits/chosen": 255.14419555664062, + "logits/rejected": 255.5975341796875, + "logps/chosen": -1.1439273357391357, + "logps/rejected": -1.3444888591766357, + "loss": 1.2084, + "odds_ratio_loss": 0.644874095916748, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.1143927350640297, + "rewards/margins": 0.020056165754795074, + "rewards/rejected": -0.13444891571998596, + "sft_loss": 1.1439273357391357, + "step": 1720 + }, + { + "epoch": 2.796524550414225, + "grad_norm": 4.651902675628662, + "learning_rate": 5.498977506615294e-08, + "logits/chosen": 254.2846221923828, + "logits/rejected": 255.0900421142578, + "logps/chosen": -1.2078436613082886, + "logps/rejected": -1.3565863370895386, + "loss": 1.2795, + "odds_ratio_loss": 0.7164067029953003, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.12078437954187393, + "rewards/margins": 0.014874264597892761, + "rewards/rejected": -0.1356586515903473, + "sft_loss": 1.2078436613082886, + "step": 1730 + }, + { + "epoch": 2.812689432208527, + "grad_norm": 2.109281301498413, + "learning_rate": 4.6504586906947756e-08, + "logits/chosen": 255.70327758789062, + "logits/rejected": 255.75326538085938, + "logps/chosen": -1.204192876815796, + "logps/rejected": -1.3763076066970825, + "loss": 1.2661, + "odds_ratio_loss": 0.6194810271263123, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.12041930109262466, + "rewards/margins": 0.01721145212650299, + "rewards/rejected": -0.13763076066970825, + "sft_loss": 1.204192876815796, + "step": 1740 + }, + { + "epoch": 2.828854314002829, + "grad_norm": 11.28996467590332, + "learning_rate": 3.8723932808754914e-08, + "logits/chosen": 254.12826538085938, + "logits/rejected": 254.1260528564453, + "logps/chosen": -1.2869278192520142, + "logps/rejected": -1.4129821062088013, + "loss": 1.3563, + "odds_ratio_loss": 0.6939128637313843, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.12869277596473694, + "rewards/margins": 0.012605440802872181, + "rewards/rejected": -0.14129820466041565, + "sft_loss": 1.2869278192520142, + "step": 1750 + }, + { + "epoch": 2.8450191957971307, + "grad_norm": 4.297213077545166, + "learning_rate": 3.1650047027158014e-08, + "logits/chosen": 254.026123046875, + "logits/rejected": 254.0368194580078, + "logps/chosen": -1.1498368978500366, + "logps/rejected": -1.3505172729492188, + "loss": 1.212, + "odds_ratio_loss": 0.6218072175979614, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.11498367786407471, + "rewards/margins": 0.020068055018782616, + "rewards/rejected": -0.13505175709724426, + "sft_loss": 1.1498368978500366, + "step": 1760 + }, + { + "epoch": 2.8611840775914326, + "grad_norm": 3.5106639862060547, + "learning_rate": 2.5284960865517848e-08, + "logits/chosen": 253.19677734375, + "logits/rejected": 253.49380493164062, + "logps/chosen": -1.0654290914535522, + "logps/rejected": -1.4208238124847412, + "loss": 1.1222, + "odds_ratio_loss": 0.5680567026138306, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1065429076552391, + "rewards/margins": 0.035539474338293076, + "rewards/rejected": -0.14208237826824188, + "sft_loss": 1.0654290914535522, + "step": 1770 + }, + { + "epoch": 2.8773489593857344, + "grad_norm": 1.9635688066482544, + "learning_rate": 1.9630502091670388e-08, + "logits/chosen": 254.31576538085938, + "logits/rejected": 254.73184204101562, + "logps/chosen": -1.16495680809021, + "logps/rejected": -1.438716173171997, + "loss": 1.2222, + "odds_ratio_loss": 0.5726233720779419, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11649568378925323, + "rewards/margins": 0.027375921607017517, + "rewards/rejected": -0.14387162029743195, + "sft_loss": 1.16495680809021, + "step": 1780 + }, + { + "epoch": 2.8935138411800363, + "grad_norm": 7.4126458168029785, + "learning_rate": 1.4688294413074677e-08, + "logits/chosen": 253.7817840576172, + "logits/rejected": 254.4253692626953, + "logps/chosen": -1.081469178199768, + "logps/rejected": -1.3936357498168945, + "loss": 1.1414, + "odds_ratio_loss": 0.5996376872062683, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.10814690589904785, + "rewards/margins": 0.03121664747595787, + "rewards/rejected": -0.13936355710029602, + "sft_loss": 1.081469178199768, + "step": 1790 + }, + { + "epoch": 2.909678722974338, + "grad_norm": 1.8516889810562134, + "learning_rate": 1.0459757010556626e-08, + "logits/chosen": 252.57345581054688, + "logits/rejected": 252.3190155029297, + "logps/chosen": -1.173514723777771, + "logps/rejected": -1.303662657737732, + "loss": 1.2401, + "odds_ratio_loss": 0.6658231019973755, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1173514723777771, + "rewards/margins": 0.013014810159802437, + "rewards/rejected": -0.13036629557609558, + "sft_loss": 1.173514723777771, + "step": 1800 + }, + { + "epoch": 2.92584360476864, + "grad_norm": 7.74896764755249, + "learning_rate": 6.94610413078306e-09, + "logits/chosen": 253.01602172851562, + "logits/rejected": 253.71841430664062, + "logps/chosen": -1.2020865678787231, + "logps/rejected": -1.5033478736877441, + "loss": 1.2671, + "odds_ratio_loss": 0.6497219204902649, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12020864337682724, + "rewards/margins": 0.030126124620437622, + "rewards/rejected": -0.15033479034900665, + "sft_loss": 1.2020865678787231, + "step": 1810 + }, + { + "epoch": 2.942008486562942, + "grad_norm": 2.0265376567840576, + "learning_rate": 4.14834473758563e-09, + "logits/chosen": 252.50656127929688, + "logits/rejected": 252.5145721435547, + "logps/chosen": -1.0974245071411133, + "logps/rejected": -1.4048916101455688, + "loss": 1.1558, + "odds_ratio_loss": 0.5838974714279175, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.10974244773387909, + "rewards/margins": 0.030746713280677795, + "rewards/rejected": -0.14048916101455688, + "sft_loss": 1.0974245071411133, + "step": 1820 + }, + { + "epoch": 2.9581733683572438, + "grad_norm": 2.291332721710205, + "learning_rate": 2.067282222230349e-09, + "logits/chosen": 254.31192016601562, + "logits/rejected": 254.5823974609375, + "logps/chosen": -1.1228351593017578, + "logps/rejected": -1.4580986499786377, + "loss": 1.1841, + "odds_ratio_loss": 0.6124657392501831, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.11228351294994354, + "rewards/margins": 0.03352636098861694, + "rewards/rejected": -0.1458098590373993, + "sft_loss": 1.1228351593017578, + "step": 1830 + }, + { + "epoch": 2.9743382501515456, + "grad_norm": 8.377201080322266, + "learning_rate": 7.035141727212979e-10, + "logits/chosen": 252.5790252685547, + "logits/rejected": 253.33993530273438, + "logps/chosen": -1.062239646911621, + "logps/rejected": -1.3180148601531982, + "loss": 1.1223, + "odds_ratio_loss": 0.6001896858215332, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.10622396320104599, + "rewards/margins": 0.025577524676918983, + "rewards/rejected": -0.13180148601531982, + "sft_loss": 1.062239646911621, + "step": 1840 + }, + { + "epoch": 2.9905031319458475, + "grad_norm": 5.447793006896973, + "learning_rate": 5.743220219761592e-11, + "logits/chosen": 254.24691772460938, + "logits/rejected": 254.78369140625, + "logps/chosen": -1.195462942123413, + "logps/rejected": -1.4066945314407349, + "loss": 1.2627, + "odds_ratio_loss": 0.6723325252532959, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1195463091135025, + "rewards/margins": 0.021123168990015984, + "rewards/rejected": -0.14066946506500244, + "sft_loss": 1.195462942123413, + "step": 1850 + }, + { + "epoch": 2.9969690846635686, + "step": 1854, + "total_flos": 2.1935611788745114e+18, + "train_loss": 1.3469306265266197, + "train_runtime": 24131.5713, + "train_samples_per_second": 1.231, + "train_steps_per_second": 0.077 + } + ], + "logging_steps": 10, + "max_steps": 1854, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 2.1935611788745114e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}