diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7110 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 1, + "global_step": 472, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01694915254237288, + "grad_norm": 39.081620832935286, + "learning_rate": 1.0416666666666666e-08, + "logits/chosen": -0.04004784673452377, + "logits/rejected": -0.012884330004453659, + "logps/chosen": -24.14839744567871, + "logps/rejected": -35.14466094970703, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.03389830508474576, + "grad_norm": 41.95997929051618, + "learning_rate": 2.083333333333333e-08, + "logits/chosen": 0.18785351514816284, + "logits/rejected": 0.21833035349845886, + "logps/chosen": -31.55377197265625, + "logps/rejected": -35.9189567565918, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.05084745762711865, + "grad_norm": 41.574477134990545, + "learning_rate": 3.125e-08, + "logits/chosen": -0.13298606872558594, + "logits/rejected": -0.12034030258655548, + "logps/chosen": -27.085824966430664, + "logps/rejected": -44.451595306396484, + "loss": 0.6789, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.043108198791742325, + "rewards/margins": 0.03870103508234024, + "rewards/rejected": 0.004407165572047234, + "step": 3 + }, + { + "epoch": 0.06779661016949153, + "grad_norm": 38.12229749762995, + "learning_rate": 4.166666666666666e-08, + "logits/chosen": -0.02340121753513813, + "logits/rejected": 0.04097435995936394, + "logps/chosen": -26.125139236450195, + "logps/rejected": -34.786293029785156, + "loss": 0.7018, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.005571034736931324, + "rewards/margins": -0.0023282519541680813, + "rewards/rejected": 0.007899284362792969, + "step": 4 + }, + { + "epoch": 0.0847457627118644, + "grad_norm": 43.98516972909633, + "learning_rate": 5.208333333333333e-08, + "logits/chosen": -0.07847192883491516, + "logits/rejected": -0.08863978832960129, + "logps/chosen": -28.029014587402344, + "logps/rejected": -24.517436981201172, + "loss": 0.6959, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.022370003163814545, + "rewards/margins": 0.06014883145689964, + "rewards/rejected": -0.0377788320183754, + "step": 5 + }, + { + "epoch": 0.1016949152542373, + "grad_norm": 37.8616646433652, + "learning_rate": 6.25e-08, + "logits/chosen": 0.01001177728176117, + "logits/rejected": 0.03767494484782219, + "logps/chosen": -34.69060134887695, + "logps/rejected": -34.56515884399414, + "loss": 0.6964, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0483599528670311, + "rewards/margins": -0.05064802244305611, + "rewards/rejected": 0.002288064919412136, + "step": 6 + }, + { + "epoch": 0.11864406779661017, + "grad_norm": 39.48592290044396, + "learning_rate": 7.291666666666667e-08, + "logits/chosen": 0.09730193018913269, + "logits/rejected": 0.12533338367938995, + "logps/chosen": -26.894184112548828, + "logps/rejected": -29.685768127441406, + "loss": 0.6871, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.014862039126455784, + "rewards/margins": 0.04402291774749756, + "rewards/rejected": -0.05888495221734047, + "step": 7 + }, + { + "epoch": 0.13559322033898305, + "grad_norm": 41.45250718986053, + "learning_rate": 8.333333333333333e-08, + "logits/chosen": -0.07943608611822128, + "logits/rejected": -0.05526775121688843, + "logps/chosen": -23.665637969970703, + "logps/rejected": -35.581138610839844, + "loss": 0.7069, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0016717063263058662, + "rewards/margins": -0.02722988836467266, + "rewards/rejected": 0.028901590034365654, + "step": 8 + }, + { + "epoch": 0.15254237288135594, + "grad_norm": 36.01634420144333, + "learning_rate": 9.375e-08, + "logits/chosen": -0.0029595959931612015, + "logits/rejected": 0.01232635322958231, + "logps/chosen": -30.279748916625977, + "logps/rejected": -24.777137756347656, + "loss": 0.6932, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.024993158876895905, + "rewards/margins": 0.01761629246175289, + "rewards/rejected": -0.04260944947600365, + "step": 9 + }, + { + "epoch": 0.1694915254237288, + "grad_norm": 39.56478667920128, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": 0.18664813041687012, + "logits/rejected": 0.15227466821670532, + "logps/chosen": -33.973602294921875, + "logps/rejected": -33.727115631103516, + "loss": 0.7043, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0049431659281253815, + "rewards/margins": -0.026568636298179626, + "rewards/rejected": 0.031511805951595306, + "step": 10 + }, + { + "epoch": 0.1864406779661017, + "grad_norm": 39.96998647964932, + "learning_rate": 1.1458333333333332e-07, + "logits/chosen": 0.22770923376083374, + "logits/rejected": 0.2530755400657654, + "logps/chosen": -25.40655517578125, + "logps/rejected": -39.74527359008789, + "loss": 0.6944, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05294986814260483, + "rewards/margins": -0.01789700984954834, + "rewards/rejected": -0.035052862018346786, + "step": 11 + }, + { + "epoch": 0.2033898305084746, + "grad_norm": 41.34319202972142, + "learning_rate": 1.25e-07, + "logits/chosen": 0.05755678564310074, + "logits/rejected": 0.05909465625882149, + "logps/chosen": -23.82120704650879, + "logps/rejected": -29.727937698364258, + "loss": 0.6877, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.028387153521180153, + "rewards/margins": 0.04089733213186264, + "rewards/rejected": -0.012510182335972786, + "step": 12 + }, + { + "epoch": 0.22033898305084745, + "grad_norm": 39.52994008664552, + "learning_rate": 1.3541666666666666e-07, + "logits/chosen": 0.010963734239339828, + "logits/rejected": -0.006987990811467171, + "logps/chosen": -23.91936683654785, + "logps/rejected": -30.996225357055664, + "loss": 0.6894, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.013032305985689163, + "rewards/margins": -0.01489502377808094, + "rewards/rejected": 0.001862717792391777, + "step": 13 + }, + { + "epoch": 0.23728813559322035, + "grad_norm": 43.22009535631131, + "learning_rate": 1.4583333333333335e-07, + "logits/chosen": 0.1792532503604889, + "logits/rejected": 0.23038198053836823, + "logps/chosen": -38.606624603271484, + "logps/rejected": -52.0256462097168, + "loss": 0.6851, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.030375886708498, + "rewards/margins": 0.07139457017183304, + "rewards/rejected": -0.04101867973804474, + "step": 14 + }, + { + "epoch": 0.2542372881355932, + "grad_norm": 38.55173749063397, + "learning_rate": 1.5624999999999999e-07, + "logits/chosen": -0.00039753085002303123, + "logits/rejected": 0.006743618752807379, + "logps/chosen": -20.85459327697754, + "logps/rejected": -31.867145538330078, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.015277398750185966, + "rewards/margins": -0.01538792997598648, + "rewards/rejected": 0.00011053076013922691, + "step": 15 + }, + { + "epoch": 0.2711864406779661, + "grad_norm": 36.132422216008756, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -0.036632318049669266, + "logits/rejected": -0.05143912881612778, + "logps/chosen": -25.975902557373047, + "logps/rejected": -30.601673126220703, + "loss": 0.6783, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.025256335735321045, + "rewards/margins": -0.011055359616875648, + "rewards/rejected": -0.014200975187122822, + "step": 16 + }, + { + "epoch": 0.288135593220339, + "grad_norm": 38.93415568334601, + "learning_rate": 1.7708333333333334e-07, + "logits/chosen": -0.03795609995722771, + "logits/rejected": -0.04627775773406029, + "logps/chosen": -22.88838768005371, + "logps/rejected": -28.53569984436035, + "loss": 0.6799, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0017479183152318, + "rewards/margins": 0.0676286369562149, + "rewards/rejected": -0.06588071584701538, + "step": 17 + }, + { + "epoch": 0.3050847457627119, + "grad_norm": 37.216443506833954, + "learning_rate": 1.875e-07, + "logits/chosen": 0.12987589836120605, + "logits/rejected": 0.16591012477874756, + "logps/chosen": -20.29220962524414, + "logps/rejected": -27.848968505859375, + "loss": 0.6856, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02715757116675377, + "rewards/margins": 0.0035054399631917477, + "rewards/rejected": -0.030663013458251953, + "step": 18 + }, + { + "epoch": 0.3220338983050847, + "grad_norm": 36.09119961798322, + "learning_rate": 1.9791666666666664e-07, + "logits/chosen": 0.11148576438426971, + "logits/rejected": 0.1186145693063736, + "logps/chosen": -19.455955505371094, + "logps/rejected": -30.798999786376953, + "loss": 0.6764, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03620731830596924, + "rewards/margins": 0.08103629946708679, + "rewards/rejected": -0.044828981161117554, + "step": 19 + }, + { + "epoch": 0.3389830508474576, + "grad_norm": 34.52699754862708, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -0.040645819157361984, + "logits/rejected": -0.09117074310779572, + "logps/chosen": -30.70236587524414, + "logps/rejected": -31.846435546875, + "loss": 0.6725, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.025728441774845123, + "rewards/margins": 0.05640077590942383, + "rewards/rejected": -0.030672335997223854, + "step": 20 + }, + { + "epoch": 0.3559322033898305, + "grad_norm": 38.450864425486444, + "learning_rate": 2.1875e-07, + "logits/chosen": -0.04155284911394119, + "logits/rejected": -0.08195465058088303, + "logps/chosen": -24.620819091796875, + "logps/rejected": -35.44722366333008, + "loss": 0.6753, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007430051453411579, + "rewards/margins": 0.049906615167856216, + "rewards/rejected": -0.04247656092047691, + "step": 21 + }, + { + "epoch": 0.3728813559322034, + "grad_norm": 38.60964633502043, + "learning_rate": 2.2916666666666663e-07, + "logits/chosen": 0.037601783871650696, + "logits/rejected": 0.051545850932598114, + "logps/chosen": -20.464923858642578, + "logps/rejected": -25.813556671142578, + "loss": 0.656, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0019244614522904158, + "rewards/margins": 0.10572130233049393, + "rewards/rejected": -0.10379683971405029, + "step": 22 + }, + { + "epoch": 0.3898305084745763, + "grad_norm": 37.53125515825806, + "learning_rate": 2.3958333333333335e-07, + "logits/chosen": -0.04523741453886032, + "logits/rejected": -0.08811002969741821, + "logps/chosen": -26.055984497070312, + "logps/rejected": -25.679134368896484, + "loss": 0.6379, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.054784782230854034, + "rewards/margins": 0.07096240669488907, + "rewards/rejected": -0.1257471889257431, + "step": 23 + }, + { + "epoch": 0.4067796610169492, + "grad_norm": 40.26892670789944, + "learning_rate": 2.5e-07, + "logits/chosen": -0.08595943450927734, + "logits/rejected": -0.09404819458723068, + "logps/chosen": -30.186988830566406, + "logps/rejected": -33.44403076171875, + "loss": 0.6393, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04100564867258072, + "rewards/margins": 0.07639746367931366, + "rewards/rejected": -0.11740311980247498, + "step": 24 + }, + { + "epoch": 0.423728813559322, + "grad_norm": 47.582895505174676, + "learning_rate": 2.604166666666667e-07, + "logits/chosen": 0.002766113728284836, + "logits/rejected": 0.002811681479215622, + "logps/chosen": -35.549591064453125, + "logps/rejected": -32.83184051513672, + "loss": 0.6482, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07030116766691208, + "rewards/margins": 0.06389589607715607, + "rewards/rejected": -0.13419707119464874, + "step": 25 + }, + { + "epoch": 0.4406779661016949, + "grad_norm": 34.075640070522816, + "learning_rate": 2.708333333333333e-07, + "logits/chosen": -0.02534855529665947, + "logits/rejected": -0.011001847684383392, + "logps/chosen": -22.414587020874023, + "logps/rejected": -28.95859146118164, + "loss": 0.621, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008834121748805046, + "rewards/margins": 0.2195996344089508, + "rewards/rejected": -0.228433758020401, + "step": 26 + }, + { + "epoch": 0.4576271186440678, + "grad_norm": 34.47879927670914, + "learning_rate": 2.8125e-07, + "logits/chosen": 0.0005891900509595871, + "logits/rejected": -0.04569123312830925, + "logps/chosen": -27.095754623413086, + "logps/rejected": -34.3789176940918, + "loss": 0.622, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.00932928267866373, + "rewards/margins": 0.28314852714538574, + "rewards/rejected": -0.2924777865409851, + "step": 27 + }, + { + "epoch": 0.4745762711864407, + "grad_norm": 33.381546864263576, + "learning_rate": 2.916666666666667e-07, + "logits/chosen": -0.03613307327032089, + "logits/rejected": -0.07326073944568634, + "logps/chosen": -20.990463256835938, + "logps/rejected": -26.562923431396484, + "loss": 0.6157, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00729251466691494, + "rewards/margins": 0.15824466943740845, + "rewards/rejected": -0.16553716361522675, + "step": 28 + }, + { + "epoch": 0.4915254237288136, + "grad_norm": 39.396295244537285, + "learning_rate": 3.020833333333333e-07, + "logits/chosen": 0.06360377371311188, + "logits/rejected": 0.0748274177312851, + "logps/chosen": -23.62378692626953, + "logps/rejected": -31.0860595703125, + "loss": 0.6277, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.017558498308062553, + "rewards/margins": 0.13148798048496246, + "rewards/rejected": -0.14904648065567017, + "step": 29 + }, + { + "epoch": 0.5084745762711864, + "grad_norm": 35.102940131398256, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": 0.06532293558120728, + "logits/rejected": 0.06247016414999962, + "logps/chosen": -26.590116500854492, + "logps/rejected": -34.515804290771484, + "loss": 0.5964, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0014454489573836327, + "rewards/margins": 0.5317557454109192, + "rewards/rejected": -0.5332012176513672, + "step": 30 + }, + { + "epoch": 0.5254237288135594, + "grad_norm": 34.11889459677525, + "learning_rate": 3.2291666666666666e-07, + "logits/chosen": 0.09973854571580887, + "logits/rejected": 0.1072133332490921, + "logps/chosen": -25.892887115478516, + "logps/rejected": -32.363502502441406, + "loss": 0.5721, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.006193090230226517, + "rewards/margins": 0.30112165212631226, + "rewards/rejected": -0.29492852091789246, + "step": 31 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 32.761918192518266, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": 0.037455491721630096, + "logits/rejected": -0.05081958696246147, + "logps/chosen": -33.243309020996094, + "logps/rejected": -35.219573974609375, + "loss": 0.5398, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.022554118186235428, + "rewards/margins": 0.5276864767074585, + "rewards/rejected": -0.5502405166625977, + "step": 32 + }, + { + "epoch": 0.559322033898305, + "grad_norm": 30.482548567561853, + "learning_rate": 3.4375e-07, + "logits/chosen": 0.041740238666534424, + "logits/rejected": 0.10962522029876709, + "logps/chosen": -24.476438522338867, + "logps/rejected": -38.58897399902344, + "loss": 0.5268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.003558039665222168, + "rewards/margins": 0.8583400249481201, + "rewards/rejected": -0.8618981838226318, + "step": 33 + }, + { + "epoch": 0.576271186440678, + "grad_norm": 31.30582576136025, + "learning_rate": 3.541666666666667e-07, + "logits/chosen": 0.002660442143678665, + "logits/rejected": 0.017039887607097626, + "logps/chosen": -27.219778060913086, + "logps/rejected": -33.36122131347656, + "loss": 0.5383, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02310222014784813, + "rewards/margins": 0.5289267301559448, + "rewards/rejected": -0.5520289540290833, + "step": 34 + }, + { + "epoch": 0.5932203389830508, + "grad_norm": 32.89490941791439, + "learning_rate": 3.645833333333333e-07, + "logits/chosen": 0.03442692011594772, + "logits/rejected": 0.06397214531898499, + "logps/chosen": -20.274240493774414, + "logps/rejected": -44.2073974609375, + "loss": 0.5019, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0024875528179109097, + "rewards/margins": 1.1822435855865479, + "rewards/rejected": -1.1797560453414917, + "step": 35 + }, + { + "epoch": 0.6101694915254238, + "grad_norm": 31.03945146034194, + "learning_rate": 3.75e-07, + "logits/chosen": -0.0311665628105402, + "logits/rejected": -0.02556237392127514, + "logps/chosen": -22.00820541381836, + "logps/rejected": -27.99129295349121, + "loss": 0.5159, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.02089952491223812, + "rewards/margins": 0.5325387716293335, + "rewards/rejected": -0.5534383058547974, + "step": 36 + }, + { + "epoch": 0.6271186440677966, + "grad_norm": 29.670396668547138, + "learning_rate": 3.8541666666666665e-07, + "logits/chosen": 0.0932985171675682, + "logits/rejected": 0.08139631897211075, + "logps/chosen": -26.00881576538086, + "logps/rejected": -29.33023452758789, + "loss": 0.4997, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10078494995832443, + "rewards/margins": 0.5040473341941833, + "rewards/rejected": -0.6048322916030884, + "step": 37 + }, + { + "epoch": 0.6440677966101694, + "grad_norm": 33.08140356711789, + "learning_rate": 3.958333333333333e-07, + "logits/chosen": -0.01641334407031536, + "logits/rejected": -0.005850490182638168, + "logps/chosen": -28.798660278320312, + "logps/rejected": -50.10844421386719, + "loss": 0.5076, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.050310466438531876, + "rewards/margins": 0.6731055974960327, + "rewards/rejected": -0.7234160304069519, + "step": 38 + }, + { + "epoch": 0.6610169491525424, + "grad_norm": 34.20951880392297, + "learning_rate": 4.0625e-07, + "logits/chosen": -0.1090591624379158, + "logits/rejected": -0.12284770607948303, + "logps/chosen": -33.75372314453125, + "logps/rejected": -42.935585021972656, + "loss": 0.5746, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08654585480690002, + "rewards/margins": 1.2381523847579956, + "rewards/rejected": -1.3246984481811523, + "step": 39 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 30.690269873517938, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -0.0014614351093769073, + "logits/rejected": 0.08014758676290512, + "logps/chosen": -25.105735778808594, + "logps/rejected": -36.967323303222656, + "loss": 0.5205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09284328669309616, + "rewards/margins": 0.7982729077339172, + "rewards/rejected": -0.8911161422729492, + "step": 40 + }, + { + "epoch": 0.6949152542372882, + "grad_norm": 30.04204632805336, + "learning_rate": 4.270833333333333e-07, + "logits/chosen": 0.053642358630895615, + "logits/rejected": 0.044470448046922684, + "logps/chosen": -24.64603042602539, + "logps/rejected": -41.87240219116211, + "loss": 0.4837, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.10667266696691513, + "rewards/margins": 1.2056063413619995, + "rewards/rejected": -1.3122789859771729, + "step": 41 + }, + { + "epoch": 0.711864406779661, + "grad_norm": 31.020878298393608, + "learning_rate": 4.375e-07, + "logits/chosen": 0.019134098663926125, + "logits/rejected": 0.01840081252157688, + "logps/chosen": -23.039093017578125, + "logps/rejected": -33.015777587890625, + "loss": 0.4991, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04813046008348465, + "rewards/margins": 0.9387863874435425, + "rewards/rejected": -0.9869168996810913, + "step": 42 + }, + { + "epoch": 0.7288135593220338, + "grad_norm": 32.27135984427571, + "learning_rate": 4.479166666666667e-07, + "logits/chosen": 0.008926652371883392, + "logits/rejected": -0.005259339697659016, + "logps/chosen": -42.513465881347656, + "logps/rejected": -36.392086029052734, + "loss": 0.4953, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.16403140127658844, + "rewards/margins": 0.2477284073829651, + "rewards/rejected": -0.4117598235607147, + "step": 43 + }, + { + "epoch": 0.7457627118644068, + "grad_norm": 31.70747032110601, + "learning_rate": 4.5833333333333327e-07, + "logits/chosen": 0.08293592184782028, + "logits/rejected": 0.14042136073112488, + "logps/chosen": -27.64384651184082, + "logps/rejected": -43.646812438964844, + "loss": 0.5279, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.002585211768746376, + "rewards/margins": 0.8831788897514343, + "rewards/rejected": -0.8805936574935913, + "step": 44 + }, + { + "epoch": 0.7627118644067796, + "grad_norm": 32.92194369706788, + "learning_rate": 4.6874999999999996e-07, + "logits/chosen": 0.0978875681757927, + "logits/rejected": 0.07510063052177429, + "logps/chosen": -25.6392822265625, + "logps/rejected": -43.59218215942383, + "loss": 0.4975, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05219133943319321, + "rewards/margins": 1.5015443563461304, + "rewards/rejected": -1.553735613822937, + "step": 45 + }, + { + "epoch": 0.7796610169491526, + "grad_norm": 31.09847853088202, + "learning_rate": 4.791666666666667e-07, + "logits/chosen": 0.05425513535737991, + "logits/rejected": 0.060507796704769135, + "logps/chosen": -31.77846908569336, + "logps/rejected": -39.067787170410156, + "loss": 0.4798, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.18950122594833374, + "rewards/margins": 0.7816174626350403, + "rewards/rejected": -0.971118688583374, + "step": 46 + }, + { + "epoch": 0.7966101694915254, + "grad_norm": 31.87817139649752, + "learning_rate": 4.895833333333333e-07, + "logits/chosen": 0.06690789759159088, + "logits/rejected": 0.06767144054174423, + "logps/chosen": -29.99129867553711, + "logps/rejected": -34.969505310058594, + "loss": 0.4447, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09975402057170868, + "rewards/margins": 0.3266308903694153, + "rewards/rejected": -0.42638492584228516, + "step": 47 + }, + { + "epoch": 0.8135593220338984, + "grad_norm": 26.865616424406536, + "learning_rate": 5e-07, + "logits/chosen": -0.09880068153142929, + "logits/rejected": -0.10087430477142334, + "logps/chosen": -28.3320369720459, + "logps/rejected": -43.12381362915039, + "loss": 0.3955, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10569320619106293, + "rewards/margins": 1.8550941944122314, + "rewards/rejected": -1.960787296295166, + "step": 48 + }, + { + "epoch": 0.8305084745762712, + "grad_norm": 25.76308856645317, + "learning_rate": 4.999931375995349e-07, + "logits/chosen": -0.12734848260879517, + "logits/rejected": -0.11239587515592575, + "logps/chosen": -23.94550132751465, + "logps/rejected": -32.49237823486328, + "loss": 0.4445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11973586678504944, + "rewards/margins": 0.8172601461410522, + "rewards/rejected": -0.9369959831237793, + "step": 49 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 30.25637466477477, + "learning_rate": 4.999725507748798e-07, + "logits/chosen": -0.015037477016448975, + "logits/rejected": -0.009709347039461136, + "logps/chosen": -25.780975341796875, + "logps/rejected": -41.78852462768555, + "loss": 0.4786, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08354266732931137, + "rewards/margins": 1.2830588817596436, + "rewards/rejected": -1.3666014671325684, + "step": 50 + }, + { + "epoch": 0.864406779661017, + "grad_norm": 33.41966487787268, + "learning_rate": 4.99938240656235e-07, + "logits/chosen": 0.04738205671310425, + "logits/rejected": 0.07401569187641144, + "logps/chosen": -26.12303924560547, + "logps/rejected": -49.93025207519531, + "loss": 0.4347, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.14142322540283203, + "rewards/margins": 0.954620361328125, + "rewards/rejected": -1.096043586730957, + "step": 51 + }, + { + "epoch": 0.8813559322033898, + "grad_norm": 27.061236838799616, + "learning_rate": 4.998902091271985e-07, + "logits/chosen": -0.06941650807857513, + "logits/rejected": -0.05763792619109154, + "logps/chosen": -23.328826904296875, + "logps/rejected": -35.76228713989258, + "loss": 0.389, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26782581210136414, + "rewards/margins": 0.8718900680541992, + "rewards/rejected": -1.1397159099578857, + "step": 52 + }, + { + "epoch": 0.8983050847457628, + "grad_norm": 27.553317644610285, + "learning_rate": 4.998284588246634e-07, + "logits/chosen": -0.03946888446807861, + "logits/rejected": -0.03690715879201889, + "logps/chosen": -28.930063247680664, + "logps/rejected": -32.62754440307617, + "loss": 0.4152, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32019758224487305, + "rewards/margins": 1.2472233772277832, + "rewards/rejected": -1.5674208402633667, + "step": 53 + }, + { + "epoch": 0.9152542372881356, + "grad_norm": 29.763675864173276, + "learning_rate": 4.997529931386719e-07, + "logits/chosen": -0.17749209702014923, + "logits/rejected": -0.16170337796211243, + "logps/chosen": -30.868289947509766, + "logps/rejected": -32.478729248046875, + "loss": 0.4555, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2578313648700714, + "rewards/margins": 0.5673401355743408, + "rewards/rejected": -0.8251715898513794, + "step": 54 + }, + { + "epoch": 0.9322033898305084, + "grad_norm": 33.13736711358155, + "learning_rate": 4.996638162122302e-07, + "logits/chosen": -0.06908832490444183, + "logits/rejected": -0.05076206475496292, + "logps/chosen": -30.415069580078125, + "logps/rejected": -35.18532180786133, + "loss": 0.4454, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.15613248944282532, + "rewards/margins": 1.203932523727417, + "rewards/rejected": -1.36006498336792, + "step": 55 + }, + { + "epoch": 0.9491525423728814, + "grad_norm": 43.11633871701129, + "learning_rate": 4.995609329410804e-07, + "logits/chosen": -0.008376002311706543, + "logits/rejected": 0.001994941383600235, + "logps/chosen": -20.613399505615234, + "logps/rejected": -35.50030517578125, + "loss": 0.4126, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1881721019744873, + "rewards/margins": 1.925746202468872, + "rewards/rejected": -2.1139183044433594, + "step": 56 + }, + { + "epoch": 0.9661016949152542, + "grad_norm": 24.797060027751225, + "learning_rate": 4.994443489734322e-07, + "logits/chosen": -0.015878597274422646, + "logits/rejected": 0.03222089633345604, + "logps/chosen": -26.61467742919922, + "logps/rejected": -43.46265411376953, + "loss": 0.3777, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.20103216171264648, + "rewards/margins": 2.2574026584625244, + "rewards/rejected": -2.458434820175171, + "step": 57 + }, + { + "epoch": 0.9830508474576272, + "grad_norm": 31.271746187315504, + "learning_rate": 4.993140707096525e-07, + "logits/chosen": -0.010781673714518547, + "logits/rejected": 0.019774336367845535, + "logps/chosen": -32.57569885253906, + "logps/rejected": -40.327457427978516, + "loss": 0.3741, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2783823311328888, + "rewards/margins": 1.6766613721847534, + "rewards/rejected": -1.9550437927246094, + "step": 58 + }, + { + "epoch": 1.0, + "grad_norm": 24.272642085140525, + "learning_rate": 4.991701053019145e-07, + "logits/chosen": -0.01512301154434681, + "logits/rejected": -0.009732574224472046, + "logps/chosen": -26.456878662109375, + "logps/rejected": -43.373043060302734, + "loss": 0.3705, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21655352413654327, + "rewards/margins": 1.63704514503479, + "rewards/rejected": -1.8535985946655273, + "step": 59 + }, + { + "epoch": 1.0169491525423728, + "grad_norm": 21.176773022731307, + "learning_rate": 4.990124606538042e-07, + "logits/chosen": -0.06877182424068451, + "logits/rejected": -0.03728486970067024, + "logps/chosen": -18.644493103027344, + "logps/rejected": -34.91282272338867, + "loss": 0.2702, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03750162199139595, + "rewards/margins": 2.20272159576416, + "rewards/rejected": -2.165220022201538, + "step": 60 + }, + { + "epoch": 1.0338983050847457, + "grad_norm": 22.165507363954195, + "learning_rate": 4.988411454198874e-07, + "logits/chosen": 0.04961461201310158, + "logits/rejected": 0.038518860936164856, + "logps/chosen": -26.093852996826172, + "logps/rejected": -32.088096618652344, + "loss": 0.3406, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02193521521985531, + "rewards/margins": 0.8063233494758606, + "rewards/rejected": -0.7843881249427795, + "step": 61 + }, + { + "epoch": 1.0508474576271187, + "grad_norm": 23.06392685939665, + "learning_rate": 4.98656169005234e-07, + "logits/chosen": 0.16032031178474426, + "logits/rejected": 0.11802197992801666, + "logps/chosen": -28.6109676361084, + "logps/rejected": -37.80739974975586, + "loss": 0.2784, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.09771008789539337, + "rewards/margins": 2.0157761573791504, + "rewards/rejected": -1.9180662631988525, + "step": 62 + }, + { + "epoch": 1.0677966101694916, + "grad_norm": 19.615263753046836, + "learning_rate": 4.984575415649018e-07, + "logits/chosen": -0.06321832537651062, + "logits/rejected": -0.0122019462287426, + "logps/chosen": -26.929264068603516, + "logps/rejected": -45.03318405151367, + "loss": 0.2581, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.20472079515457153, + "rewards/margins": 2.59661602973938, + "rewards/rejected": -2.8013365268707275, + "step": 63 + }, + { + "epoch": 1.0847457627118644, + "grad_norm": 18.50797643625125, + "learning_rate": 4.982452740033792e-07, + "logits/chosen": -0.06859354674816132, + "logits/rejected": -0.07365603744983673, + "logps/chosen": -26.131860733032227, + "logps/rejected": -34.671546936035156, + "loss": 0.2422, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.026859302073717117, + "rewards/margins": 2.035529375076294, + "rewards/rejected": -2.0623886585235596, + "step": 64 + }, + { + "epoch": 1.1016949152542372, + "grad_norm": 20.414520001604362, + "learning_rate": 4.980193779739863e-07, + "logits/chosen": 0.009079991839826107, + "logits/rejected": -0.0031675295904278755, + "logps/chosen": -29.644994735717773, + "logps/rejected": -45.55342102050781, + "loss": 0.2681, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13062885403633118, + "rewards/margins": 2.6173148155212402, + "rewards/rejected": -2.747943878173828, + "step": 65 + }, + { + "epoch": 1.11864406779661, + "grad_norm": 20.9255946117037, + "learning_rate": 4.977798658782351e-07, + "logits/chosen": -0.08888844400644302, + "logits/rejected": -0.0911368578672409, + "logps/chosen": -26.463741302490234, + "logps/rejected": -41.51061248779297, + "loss": 0.2946, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03352803736925125, + "rewards/margins": 1.772619605064392, + "rewards/rejected": -1.806147575378418, + "step": 66 + }, + { + "epoch": 1.1355932203389831, + "grad_norm": 21.302356946411365, + "learning_rate": 4.975267508651491e-07, + "logits/chosen": -0.028940977528691292, + "logits/rejected": 0.0028336727991700172, + "logps/chosen": -25.707382202148438, + "logps/rejected": -30.72091293334961, + "loss": 0.2749, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02922699600458145, + "rewards/margins": 1.9206253290176392, + "rewards/rejected": -1.9498521089553833, + "step": 67 + }, + { + "epoch": 1.152542372881356, + "grad_norm": 19.180516279847765, + "learning_rate": 4.97260046830541e-07, + "logits/chosen": -0.1452866494655609, + "logits/rejected": -0.038837701082229614, + "logps/chosen": -20.76878547668457, + "logps/rejected": -42.36342239379883, + "loss": 0.2481, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08528683334589005, + "rewards/margins": 2.6560869216918945, + "rewards/rejected": -2.5708000659942627, + "step": 68 + }, + { + "epoch": 1.1694915254237288, + "grad_norm": 21.190018630764428, + "learning_rate": 4.969797684162497e-07, + "logits/chosen": -0.12156227976083755, + "logits/rejected": -0.0709511935710907, + "logps/chosen": -22.62305450439453, + "logps/rejected": -36.76183319091797, + "loss": 0.2828, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10709138959646225, + "rewards/margins": 2.4480578899383545, + "rewards/rejected": -2.3409664630889893, + "step": 69 + }, + { + "epoch": 1.1864406779661016, + "grad_norm": 17.29217666731802, + "learning_rate": 4.966859310093372e-07, + "logits/chosen": 0.007492711767554283, + "logits/rejected": 0.019001876935362816, + "logps/chosen": -27.733966827392578, + "logps/rejected": -40.42127227783203, + "loss": 0.2438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1447516232728958, + "rewards/margins": 2.107698678970337, + "rewards/rejected": -2.252450466156006, + "step": 70 + }, + { + "epoch": 1.2033898305084745, + "grad_norm": 25.122032977225658, + "learning_rate": 4.96378550741243e-07, + "logits/chosen": -0.057199642062187195, + "logits/rejected": -0.06447561085224152, + "logps/chosen": -27.951690673828125, + "logps/rejected": -37.76457977294922, + "loss": 0.2896, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13775676488876343, + "rewards/margins": 1.7086197137832642, + "rewards/rejected": -1.8463765382766724, + "step": 71 + }, + { + "epoch": 1.2203389830508475, + "grad_norm": 17.44185897051635, + "learning_rate": 4.960576444868992e-07, + "logits/chosen": -0.03605864569544792, + "logits/rejected": -0.08552936464548111, + "logps/chosen": -26.663238525390625, + "logps/rejected": -49.157798767089844, + "loss": 0.2207, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12792766094207764, + "rewards/margins": 3.0712804794311523, + "rewards/rejected": -3.1992080211639404, + "step": 72 + }, + { + "epoch": 1.2372881355932204, + "grad_norm": 23.51722551646514, + "learning_rate": 4.957232298638035e-07, + "logits/chosen": -0.14576715230941772, + "logits/rejected": -0.1281927525997162, + "logps/chosen": -26.146411895751953, + "logps/rejected": -39.19955825805664, + "loss": 0.2843, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11095957458019257, + "rewards/margins": 2.2008328437805176, + "rewards/rejected": -2.3117926120758057, + "step": 73 + }, + { + "epoch": 1.2542372881355932, + "grad_norm": 17.504748122629483, + "learning_rate": 4.953753252310525e-07, + "logits/chosen": -0.10337841510772705, + "logits/rejected": -0.11298589408397675, + "logps/chosen": -26.215497970581055, + "logps/rejected": -36.04429244995117, + "loss": 0.2075, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.19672133028507233, + "rewards/margins": 1.8031116724014282, + "rewards/rejected": -1.9998328685760498, + "step": 74 + }, + { + "epoch": 1.271186440677966, + "grad_norm": 20.652812065700907, + "learning_rate": 4.950139496883334e-07, + "logits/chosen": 0.06242116168141365, + "logits/rejected": 0.06666561216115952, + "logps/chosen": -23.245695114135742, + "logps/rejected": -31.755294799804688, + "loss": 0.2429, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.064823217689991, + "rewards/margins": 2.3041014671325684, + "rewards/rejected": -2.2392783164978027, + "step": 75 + }, + { + "epoch": 1.288135593220339, + "grad_norm": 23.08981113112083, + "learning_rate": 4.94639123074876e-07, + "logits/chosen": -0.0955105572938919, + "logits/rejected": -0.06442946940660477, + "logps/chosen": -23.934703826904297, + "logps/rejected": -35.5153694152832, + "loss": 0.2569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10075034201145172, + "rewards/margins": 2.1841156482696533, + "rewards/rejected": -2.2848658561706543, + "step": 76 + }, + { + "epoch": 1.305084745762712, + "grad_norm": 21.41973590257042, + "learning_rate": 4.942508659683626e-07, + "logits/chosen": -0.04648435115814209, + "logits/rejected": -0.013210049830377102, + "logps/chosen": -32.94620132446289, + "logps/rejected": -53.122039794921875, + "loss": 0.269, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09716464579105377, + "rewards/margins": 3.2333667278289795, + "rewards/rejected": -3.1362016201019287, + "step": 77 + }, + { + "epoch": 1.3220338983050848, + "grad_norm": 22.84510019593904, + "learning_rate": 4.938491996837994e-07, + "logits/chosen": -0.005726225674152374, + "logits/rejected": -0.0035298746079206467, + "logps/chosen": -21.76548957824707, + "logps/rejected": -39.55729293823242, + "loss": 0.2568, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0918298214673996, + "rewards/margins": 2.4565834999084473, + "rewards/rejected": -2.3647537231445312, + "step": 78 + }, + { + "epoch": 1.3389830508474576, + "grad_norm": 17.384288528010632, + "learning_rate": 4.934341462723454e-07, + "logits/chosen": -0.14137157797813416, + "logits/rejected": -0.1316397786140442, + "logps/chosen": -20.925193786621094, + "logps/rejected": -36.4559211730957, + "loss": 0.2113, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10143867135047913, + "rewards/margins": 2.8934736251831055, + "rewards/rejected": -2.7920351028442383, + "step": 79 + }, + { + "epoch": 1.3559322033898304, + "grad_norm": 20.990326447186, + "learning_rate": 4.930057285201027e-07, + "logits/chosen": -0.09045147150754929, + "logits/rejected": -0.08031099289655685, + "logps/chosen": -21.96762466430664, + "logps/rejected": -36.81184387207031, + "loss": 0.2569, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19079303741455078, + "rewards/margins": 2.315279483795166, + "rewards/rejected": -2.506072759628296, + "step": 80 + }, + { + "epoch": 1.3728813559322033, + "grad_norm": 18.098050286729354, + "learning_rate": 4.925639699468645e-07, + "logits/chosen": -0.08457757532596588, + "logits/rejected": -0.07319922745227814, + "logps/chosen": -21.135604858398438, + "logps/rejected": -33.960086822509766, + "loss": 0.1857, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.054994210600852966, + "rewards/margins": 2.582826852798462, + "rewards/rejected": -2.5278327465057373, + "step": 81 + }, + { + "epoch": 1.3898305084745763, + "grad_norm": 18.355783625838907, + "learning_rate": 4.921088948048246e-07, + "logits/chosen": 0.0004070308059453964, + "logits/rejected": 0.010508737526834011, + "logps/chosen": -19.553733825683594, + "logps/rejected": -24.943431854248047, + "loss": 0.2258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1966938078403473, + "rewards/margins": 2.039564609527588, + "rewards/rejected": -1.8428709506988525, + "step": 82 + }, + { + "epoch": 1.4067796610169492, + "grad_norm": 18.59890208951988, + "learning_rate": 4.916405280772462e-07, + "logits/chosen": 0.061064671725034714, + "logits/rejected": 0.04233198240399361, + "logps/chosen": -31.1833553314209, + "logps/rejected": -37.992191314697266, + "loss": 0.2471, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24824562668800354, + "rewards/margins": 2.010815143585205, + "rewards/rejected": -2.259060859680176, + "step": 83 + }, + { + "epoch": 1.423728813559322, + "grad_norm": 18.608818270077023, + "learning_rate": 4.911588954770896e-07, + "logits/chosen": 0.006485683843493462, + "logits/rejected": 0.017345350235700607, + "logps/chosen": -23.56964683532715, + "logps/rejected": -33.626216888427734, + "loss": 0.2325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10479970276355743, + "rewards/margins": 2.1866378784179688, + "rewards/rejected": -2.2914376258850098, + "step": 84 + }, + { + "epoch": 1.4406779661016949, + "grad_norm": 27.860656554762212, + "learning_rate": 4.906640234456011e-07, + "logits/chosen": -0.10989750176668167, + "logits/rejected": -0.08497381210327148, + "logps/chosen": -20.454971313476562, + "logps/rejected": -33.20934295654297, + "loss": 0.2399, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07277680188417435, + "rewards/margins": 2.7808988094329834, + "rewards/rejected": -2.7081220149993896, + "step": 85 + }, + { + "epoch": 1.457627118644068, + "grad_norm": 17.529622871109098, + "learning_rate": 4.90155939150861e-07, + "logits/chosen": -0.01597762666642666, + "logits/rejected": -0.02296941541135311, + "logps/chosen": -25.70912742614746, + "logps/rejected": -41.43511199951172, + "loss": 0.1949, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09175632894039154, + "rewards/margins": 3.4984822273254395, + "rewards/rejected": -3.590238571166992, + "step": 86 + }, + { + "epoch": 1.4745762711864407, + "grad_norm": 19.778355379129565, + "learning_rate": 4.896346704862927e-07, + "logits/chosen": -0.00542130321264267, + "logits/rejected": -0.00442717969417572, + "logps/chosen": -25.11708641052246, + "logps/rejected": -38.2928581237793, + "loss": 0.2137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42681190371513367, + "rewards/margins": 2.821324348449707, + "rewards/rejected": -3.248136281967163, + "step": 87 + }, + { + "epoch": 1.4915254237288136, + "grad_norm": 20.237221371085674, + "learning_rate": 4.891002460691305e-07, + "logits/chosen": -0.12523381412029266, + "logits/rejected": -0.12707139551639557, + "logps/chosen": -28.615737915039062, + "logps/rejected": -44.548152923583984, + "loss": 0.2198, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43669962882995605, + "rewards/margins": 3.5562210083007812, + "rewards/rejected": -3.992920160293579, + "step": 88 + }, + { + "epoch": 1.5084745762711864, + "grad_norm": 31.896672790729536, + "learning_rate": 4.885526952388497e-07, + "logits/chosen": -0.15658609569072723, + "logits/rejected": -0.15329544246196747, + "logps/chosen": -26.822874069213867, + "logps/rejected": -40.6098747253418, + "loss": 0.2059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17557695508003235, + "rewards/margins": 3.500253200531006, + "rewards/rejected": -3.675830364227295, + "step": 89 + }, + { + "epoch": 1.5254237288135593, + "grad_norm": 19.488400567309405, + "learning_rate": 4.879920480555549e-07, + "logits/chosen": -0.08191860467195511, + "logits/rejected": -0.008589975535869598, + "logps/chosen": -31.191484451293945, + "logps/rejected": -51.83546829223633, + "loss": 0.2254, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1294003576040268, + "rewards/margins": 2.9513542652130127, + "rewards/rejected": -3.080754518508911, + "step": 90 + }, + { + "epoch": 1.542372881355932, + "grad_norm": 20.01485074144144, + "learning_rate": 4.874183352983297e-07, + "logits/chosen": -0.022624505683779716, + "logits/rejected": -0.03187233582139015, + "logps/chosen": -24.933706283569336, + "logps/rejected": -31.99811363220215, + "loss": 0.2481, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09345364570617676, + "rewards/margins": 2.6890523433685303, + "rewards/rejected": -2.5955986976623535, + "step": 91 + }, + { + "epoch": 1.559322033898305, + "grad_norm": 17.44552952468708, + "learning_rate": 4.868315884635478e-07, + "logits/chosen": -0.13437671959400177, + "logits/rejected": -0.09966325759887695, + "logps/chosen": -28.581546783447266, + "logps/rejected": -40.725303649902344, + "loss": 0.1702, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3545893132686615, + "rewards/margins": 2.1073248386383057, + "rewards/rejected": -2.4619140625, + "step": 92 + }, + { + "epoch": 1.576271186440678, + "grad_norm": 25.932478195676993, + "learning_rate": 4.862318397631433e-07, + "logits/chosen": -0.04836834594607353, + "logits/rejected": -0.06467059254646301, + "logps/chosen": -24.941530227661133, + "logps/rejected": -38.25274658203125, + "loss": 0.252, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04554582014679909, + "rewards/margins": 2.8091211318969727, + "rewards/rejected": -2.8546671867370605, + "step": 93 + }, + { + "epoch": 1.5932203389830508, + "grad_norm": 17.31152835419153, + "learning_rate": 4.856191221228422e-07, + "logits/chosen": -0.14374472200870514, + "logits/rejected": -0.1499704271554947, + "logps/chosen": -25.189186096191406, + "logps/rejected": -48.39046859741211, + "loss": 0.2548, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014746442437171936, + "rewards/margins": 3.3370161056518555, + "rewards/rejected": -3.351762533187866, + "step": 94 + }, + { + "epoch": 1.6101694915254239, + "grad_norm": 21.553200648682367, + "learning_rate": 4.84993469180355e-07, + "logits/chosen": -0.25248920917510986, + "logits/rejected": -0.1786680817604065, + "logps/chosen": -21.31267547607422, + "logps/rejected": -40.57464599609375, + "loss": 0.1897, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06702820956707001, + "rewards/margins": 3.686950206756592, + "rewards/rejected": -3.619922399520874, + "step": 95 + }, + { + "epoch": 1.6271186440677967, + "grad_norm": 16.618810404954317, + "learning_rate": 4.843549152835302e-07, + "logits/chosen": -0.17732582986354828, + "logits/rejected": -0.15217895805835724, + "logps/chosen": -29.09910774230957, + "logps/rejected": -38.864524841308594, + "loss": 0.1892, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05224495008587837, + "rewards/margins": 2.6117098331451416, + "rewards/rejected": -2.559464693069458, + "step": 96 + }, + { + "epoch": 1.6440677966101696, + "grad_norm": 16.96144669030696, + "learning_rate": 4.837034954884681e-07, + "logits/chosen": -0.13769695162773132, + "logits/rejected": -0.09738799184560776, + "logps/chosen": -16.64884376525879, + "logps/rejected": -34.0985107421875, + "loss": 0.2166, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0295465886592865, + "rewards/margins": 3.098619222640991, + "rewards/rejected": -3.0690724849700928, + "step": 97 + }, + { + "epoch": 1.6610169491525424, + "grad_norm": 17.80864093537469, + "learning_rate": 4.83039245557597e-07, + "logits/chosen": -0.016016261652112007, + "logits/rejected": -0.05212865397334099, + "logps/chosen": -26.810836791992188, + "logps/rejected": -38.81320571899414, + "loss": 0.1875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07362563908100128, + "rewards/margins": 2.9003326892852783, + "rewards/rejected": -2.9739584922790527, + "step": 98 + }, + { + "epoch": 1.6779661016949152, + "grad_norm": 20.332172117010963, + "learning_rate": 4.823622019577088e-07, + "logits/chosen": -0.22029350697994232, + "logits/rejected": -0.1754826307296753, + "logps/chosen": -24.44580841064453, + "logps/rejected": -31.48262596130371, + "loss": 0.2123, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.004874859936535358, + "rewards/margins": 2.326341152191162, + "rewards/rejected": -2.3312156200408936, + "step": 99 + }, + { + "epoch": 1.694915254237288, + "grad_norm": 20.940720757392302, + "learning_rate": 4.816724018579583e-07, + "logits/chosen": -0.08975666761398315, + "logits/rejected": -0.03957574442028999, + "logps/chosen": -36.57925796508789, + "logps/rejected": -41.47373962402344, + "loss": 0.2237, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10056591033935547, + "rewards/margins": 3.393941879272461, + "rewards/rejected": -3.2933762073516846, + "step": 100 + }, + { + "epoch": 1.711864406779661, + "grad_norm": 17.000783102847848, + "learning_rate": 4.809698831278217e-07, + "logits/chosen": -0.09356296807527542, + "logits/rejected": -0.09570194780826569, + "logps/chosen": -25.839569091796875, + "logps/rejected": -42.873077392578125, + "loss": 0.1959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1941157877445221, + "rewards/margins": 3.0593459606170654, + "rewards/rejected": -3.2534618377685547, + "step": 101 + }, + { + "epoch": 1.7288135593220337, + "grad_norm": 26.43033048122211, + "learning_rate": 4.802546843350177e-07, + "logits/chosen": -0.03907548263669014, + "logits/rejected": -0.0613831952214241, + "logps/chosen": -25.94208335876465, + "logps/rejected": -34.799400329589844, + "loss": 0.257, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1075030267238617, + "rewards/margins": 2.6531782150268555, + "rewards/rejected": -2.545675277709961, + "step": 102 + }, + { + "epoch": 1.7457627118644068, + "grad_norm": 19.25248915197079, + "learning_rate": 4.795268447433906e-07, + "logits/chosen": -0.23271867632865906, + "logits/rejected": -0.2442181557416916, + "logps/chosen": -21.609224319458008, + "logps/rejected": -39.6169319152832, + "loss": 0.1843, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4244083762168884, + "rewards/margins": 3.7757644653320312, + "rewards/rejected": -4.2001729011535645, + "step": 103 + }, + { + "epoch": 1.7627118644067796, + "grad_norm": 21.40405538405152, + "learning_rate": 4.787864043107546e-07, + "logits/chosen": -0.10186932981014252, + "logits/rejected": -0.10761649906635284, + "logps/chosen": -24.1138858795166, + "logps/rejected": -23.169330596923828, + "loss": 0.2512, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08395804464817047, + "rewards/margins": 0.9992507696151733, + "rewards/rejected": -0.9152926802635193, + "step": 104 + }, + { + "epoch": 1.7796610169491527, + "grad_norm": 20.65970281462911, + "learning_rate": 4.780334036866996e-07, + "logits/chosen": -0.1446046382188797, + "logits/rejected": -0.16783642768859863, + "logps/chosen": -29.0926513671875, + "logps/rejected": -47.739131927490234, + "loss": 0.1819, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4550026059150696, + "rewards/margins": 3.175567865371704, + "rewards/rejected": -3.630570650100708, + "step": 105 + }, + { + "epoch": 1.7966101694915255, + "grad_norm": 16.55598459027438, + "learning_rate": 4.772678842103605e-07, + "logits/chosen": -0.06549476087093353, + "logits/rejected": -0.04416227340698242, + "logps/chosen": -25.375438690185547, + "logps/rejected": -39.032981872558594, + "loss": 0.138, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.16781294345855713, + "rewards/margins": 3.484158992767334, + "rewards/rejected": -3.6519718170166016, + "step": 106 + }, + { + "epoch": 1.8135593220338984, + "grad_norm": 16.11829115416798, + "learning_rate": 4.764898879081467e-07, + "logits/chosen": -0.05152374878525734, + "logits/rejected": -0.07160673290491104, + "logps/chosen": -23.518722534179688, + "logps/rejected": -43.82634735107422, + "loss": 0.1763, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2105274647474289, + "rewards/margins": 3.0297629833221436, + "rewards/rejected": -2.819235324859619, + "step": 107 + }, + { + "epoch": 1.8305084745762712, + "grad_norm": 18.544747915953614, + "learning_rate": 4.7569945749143586e-07, + "logits/chosen": -0.00994398258626461, + "logits/rejected": 0.006802310235798359, + "logps/chosen": -23.792747497558594, + "logps/rejected": -47.211280822753906, + "loss": 0.2052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3990446925163269, + "rewards/margins": 3.575429916381836, + "rewards/rejected": -3.9744746685028076, + "step": 108 + }, + { + "epoch": 1.847457627118644, + "grad_norm": 15.674768365246683, + "learning_rate": 4.748966363542285e-07, + "logits/chosen": -0.10318706929683685, + "logits/rejected": -0.04973382502794266, + "logps/chosen": -20.84232521057129, + "logps/rejected": -39.88136672973633, + "loss": 0.1698, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17250564694404602, + "rewards/margins": 3.216583251953125, + "rewards/rejected": -3.0440773963928223, + "step": 109 + }, + { + "epoch": 1.8644067796610169, + "grad_norm": 16.323100274211107, + "learning_rate": 4.7408146857076563e-07, + "logits/chosen": 0.08578380197286606, + "logits/rejected": 0.04284593090415001, + "logps/chosen": -37.73735809326172, + "logps/rejected": -38.75680923461914, + "loss": 0.1792, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.11772266030311584, + "rewards/margins": 2.448854446411133, + "rewards/rejected": -2.33113169670105, + "step": 110 + }, + { + "epoch": 1.8813559322033897, + "grad_norm": 16.578710310200407, + "learning_rate": 4.732539988931096e-07, + "logits/chosen": -0.26771169900894165, + "logits/rejected": -0.26380079984664917, + "logps/chosen": -23.918312072753906, + "logps/rejected": -43.63589096069336, + "loss": 0.1382, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2503085136413574, + "rewards/margins": 3.4693069458007812, + "rewards/rejected": -3.7196154594421387, + "step": 111 + }, + { + "epoch": 1.8983050847457628, + "grad_norm": 19.979288606666017, + "learning_rate": 4.7241427274868683e-07, + "logits/chosen": -0.048879463225603104, + "logits/rejected": 0.00943760946393013, + "logps/chosen": -24.316715240478516, + "logps/rejected": -42.57545471191406, + "loss": 0.2025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01973732002079487, + "rewards/margins": 3.4818313121795654, + "rewards/rejected": -3.5015687942504883, + "step": 112 + }, + { + "epoch": 1.9152542372881356, + "grad_norm": 16.77919383034577, + "learning_rate": 4.7156233623779383e-07, + "logits/chosen": -0.017183750867843628, + "logits/rejected": -0.02489522099494934, + "logps/chosen": -30.669607162475586, + "logps/rejected": -35.61785125732422, + "loss": 0.171, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1518259346485138, + "rewards/margins": 2.721503973007202, + "rewards/rejected": -2.8733298778533936, + "step": 113 + }, + { + "epoch": 1.9322033898305084, + "grad_norm": 25.510192937611073, + "learning_rate": 4.7069823613106687e-07, + "logits/chosen": -0.25519174337387085, + "logits/rejected": -0.21938219666481018, + "logps/chosen": -32.64997100830078, + "logps/rejected": -46.399112701416016, + "loss": 0.198, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.36455288529396057, + "rewards/margins": 3.620523452758789, + "rewards/rejected": -3.985076904296875, + "step": 114 + }, + { + "epoch": 1.9491525423728815, + "grad_norm": 21.709479844123084, + "learning_rate": 4.698220198669136e-07, + "logits/chosen": -0.15014870464801788, + "logits/rejected": -0.14446985721588135, + "logps/chosen": -23.829439163208008, + "logps/rejected": -37.09071350097656, + "loss": 0.2222, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.201849102973938, + "rewards/margins": 3.0588748455047607, + "rewards/rejected": -3.26072359085083, + "step": 115 + }, + { + "epoch": 1.9661016949152543, + "grad_norm": 20.84348155110451, + "learning_rate": 4.6893373554890917e-07, + "logits/chosen": -0.1855657547712326, + "logits/rejected": -0.1457989662885666, + "logps/chosen": -30.961164474487305, + "logps/rejected": -47.25037384033203, + "loss": 0.217, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3446941375732422, + "rewards/margins": 3.6179933547973633, + "rewards/rejected": -3.9626879692077637, + "step": 116 + }, + { + "epoch": 1.9830508474576272, + "grad_norm": 14.188597523254197, + "learning_rate": 4.6803343194315546e-07, + "logits/chosen": -0.09809039533138275, + "logits/rejected": -0.060599129647016525, + "logps/chosen": -29.427833557128906, + "logps/rejected": -46.29072952270508, + "loss": 0.1172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34794139862060547, + "rewards/margins": 3.890174388885498, + "rewards/rejected": -4.2381157875061035, + "step": 117 + }, + { + "epoch": 2.0, + "grad_norm": 14.21262907810025, + "learning_rate": 4.6712115847560353e-07, + "logits/chosen": -0.0804528221487999, + "logits/rejected": -0.0880361869931221, + "logps/chosen": -22.719079971313477, + "logps/rejected": -47.828243255615234, + "loss": 0.1696, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2510998249053955, + "rewards/margins": 4.143679618835449, + "rewards/rejected": -3.8925797939300537, + "step": 118 + }, + { + "epoch": 2.016949152542373, + "grad_norm": 7.256194218627331, + "learning_rate": 4.661969652293402e-07, + "logits/chosen": -0.057237230241298676, + "logits/rejected": -0.03790592402219772, + "logps/chosen": -21.60989761352539, + "logps/rejected": -43.51523208618164, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11654786765575409, + "rewards/margins": 3.8127760887145996, + "rewards/rejected": -3.69622802734375, + "step": 119 + }, + { + "epoch": 2.0338983050847457, + "grad_norm": 8.74634777891102, + "learning_rate": 4.652609029418388e-07, + "logits/chosen": 0.03335125744342804, + "logits/rejected": 0.031772270798683167, + "logps/chosen": -21.453704833984375, + "logps/rejected": -40.3062858581543, + "loss": 0.0893, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10782061517238617, + "rewards/margins": 4.060611248016357, + "rewards/rejected": -3.9527902603149414, + "step": 120 + }, + { + "epoch": 2.0508474576271185, + "grad_norm": 8.313099929127045, + "learning_rate": 4.6431302300217366e-07, + "logits/chosen": -0.20796310901641846, + "logits/rejected": -0.18069806694984436, + "logps/chosen": -27.584365844726562, + "logps/rejected": -37.579673767089844, + "loss": 0.0917, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3969431519508362, + "rewards/margins": 3.233177900314331, + "rewards/rejected": -2.8362350463867188, + "step": 121 + }, + { + "epoch": 2.0677966101694913, + "grad_norm": 10.855796103467934, + "learning_rate": 4.633533774481987e-07, + "logits/chosen": -0.07592164725065231, + "logits/rejected": -0.0696810930967331, + "logps/chosen": -27.249908447265625, + "logps/rejected": -45.94511413574219, + "loss": 0.0845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021982379257678986, + "rewards/margins": 4.154269695281982, + "rewards/rejected": -4.176252365112305, + "step": 122 + }, + { + "epoch": 2.084745762711864, + "grad_norm": 7.255720151076396, + "learning_rate": 4.623820189636905e-07, + "logits/chosen": -0.19116753339767456, + "logits/rejected": -0.1705985963344574, + "logps/chosen": -26.491065979003906, + "logps/rejected": -50.236698150634766, + "loss": 0.0909, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.14240173995494843, + "rewards/margins": 4.492888927459717, + "rewards/rejected": -4.350486755371094, + "step": 123 + }, + { + "epoch": 2.1016949152542375, + "grad_norm": 8.03074731997706, + "learning_rate": 4.613990008754565e-07, + "logits/chosen": -0.12923955917358398, + "logits/rejected": -0.14741843938827515, + "logps/chosen": -28.261474609375, + "logps/rejected": -36.72936248779297, + "loss": 0.1005, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6439403295516968, + "rewards/margins": 3.5893638134002686, + "rewards/rejected": -2.9454240798950195, + "step": 124 + }, + { + "epoch": 2.1186440677966103, + "grad_norm": 7.19275728016155, + "learning_rate": 4.60404377150407e-07, + "logits/chosen": -0.09195713698863983, + "logits/rejected": -0.042211033403873444, + "logps/chosen": -23.310510635375977, + "logps/rejected": -41.93342590332031, + "loss": 0.0925, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0040088072419166565, + "rewards/margins": 3.2483134269714355, + "rewards/rejected": -3.2443044185638428, + "step": 125 + }, + { + "epoch": 2.135593220338983, + "grad_norm": 7.466339863674321, + "learning_rate": 4.593982023925925e-07, + "logits/chosen": -0.07431389391422272, + "logits/rejected": -0.06840626150369644, + "logps/chosen": -25.431446075439453, + "logps/rejected": -39.0665168762207, + "loss": 0.09, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14491936564445496, + "rewards/margins": 3.5672109127044678, + "rewards/rejected": -3.4222917556762695, + "step": 126 + }, + { + "epoch": 2.152542372881356, + "grad_norm": 8.19688100505555, + "learning_rate": 4.58380531840206e-07, + "logits/chosen": -0.120096854865551, + "logits/rejected": -0.10113926976919174, + "logps/chosen": -26.030086517333984, + "logps/rejected": -37.91970443725586, + "loss": 0.0953, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21827784180641174, + "rewards/margins": 4.060682773590088, + "rewards/rejected": -3.842404842376709, + "step": 127 + }, + { + "epoch": 2.169491525423729, + "grad_norm": 9.892790899219712, + "learning_rate": 4.5735142136255045e-07, + "logits/chosen": -0.23804128170013428, + "logits/rejected": -0.23227332532405853, + "logps/chosen": -27.41203498840332, + "logps/rejected": -49.19248962402344, + "loss": 0.0933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12989288568496704, + "rewards/margins": 4.297806262969971, + "rewards/rejected": -4.427699565887451, + "step": 128 + }, + { + "epoch": 2.1864406779661016, + "grad_norm": 6.571853125948924, + "learning_rate": 4.5631092745697164e-07, + "logits/chosen": -0.00046368176117539406, + "logits/rejected": 0.014133242890238762, + "logps/chosen": -25.415313720703125, + "logps/rejected": -41.508079528808594, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3947104215621948, + "rewards/margins": 4.418177127838135, + "rewards/rejected": -4.023467063903809, + "step": 129 + }, + { + "epoch": 2.2033898305084745, + "grad_norm": 7.081057065438042, + "learning_rate": 4.5525910724575645e-07, + "logits/chosen": -0.20635852217674255, + "logits/rejected": -0.1863619089126587, + "logps/chosen": -27.593435287475586, + "logps/rejected": -50.18062210083008, + "loss": 0.0915, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3062703311443329, + "rewards/margins": 4.946234703063965, + "rewards/rejected": -4.639964580535889, + "step": 130 + }, + { + "epoch": 2.2203389830508473, + "grad_norm": 6.94722893216983, + "learning_rate": 4.54196018472997e-07, + "logits/chosen": -0.1825593113899231, + "logits/rejected": -0.18460941314697266, + "logps/chosen": -25.40302276611328, + "logps/rejected": -57.28022003173828, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3234606981277466, + "rewards/margins": 5.864286422729492, + "rewards/rejected": -6.187747001647949, + "step": 131 + }, + { + "epoch": 2.23728813559322, + "grad_norm": 6.703220344523385, + "learning_rate": 4.5312171950142033e-07, + "logits/chosen": -0.1518273502588272, + "logits/rejected": -0.09540899842977524, + "logps/chosen": -21.725143432617188, + "logps/rejected": -38.91670608520508, + "loss": 0.0716, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3190383315086365, + "rewards/margins": 4.065824508666992, + "rewards/rejected": -3.746786117553711, + "step": 132 + }, + { + "epoch": 2.2542372881355934, + "grad_norm": 7.318607428943175, + "learning_rate": 4.520362693091845e-07, + "logits/chosen": -0.12475726008415222, + "logits/rejected": -0.12865117192268372, + "logps/chosen": -23.161043167114258, + "logps/rejected": -36.68880081176758, + "loss": 0.0762, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.030918624252080917, + "rewards/margins": 3.1682627201080322, + "rewards/rejected": -3.1373443603515625, + "step": 133 + }, + { + "epoch": 2.2711864406779663, + "grad_norm": 6.4216049351024065, + "learning_rate": 4.5093972748664087e-07, + "logits/chosen": -0.09874700009822845, + "logits/rejected": -0.10628420114517212, + "logps/chosen": -28.58932113647461, + "logps/rejected": -47.10905075073242, + "loss": 0.055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31862539052963257, + "rewards/margins": 4.816265106201172, + "rewards/rejected": -4.4976396560668945, + "step": 134 + }, + { + "epoch": 2.288135593220339, + "grad_norm": 5.595876217706418, + "learning_rate": 4.498321542330622e-07, + "logits/chosen": -0.17151176929473877, + "logits/rejected": -0.18770024180412292, + "logps/chosen": -22.070384979248047, + "logps/rejected": -49.778038024902344, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06101692467927933, + "rewards/margins": 5.113625526428223, + "rewards/rejected": -5.052608013153076, + "step": 135 + }, + { + "epoch": 2.305084745762712, + "grad_norm": 8.583744234061204, + "learning_rate": 4.4871361035333833e-07, + "logits/chosen": -0.1267111748456955, + "logits/rejected": -0.11681263148784637, + "logps/chosen": -21.870920181274414, + "logps/rejected": -39.6839714050293, + "loss": 0.0796, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31499701738357544, + "rewards/margins": 3.777963638305664, + "rewards/rejected": -3.4629664421081543, + "step": 136 + }, + { + "epoch": 2.3220338983050848, + "grad_norm": 7.125850476151505, + "learning_rate": 4.475841572546374e-07, + "logits/chosen": -0.19854867458343506, + "logits/rejected": -0.16304975748062134, + "logps/chosen": -28.775941848754883, + "logps/rejected": -39.197044372558594, + "loss": 0.0799, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10516883432865143, + "rewards/margins": 3.753281593322754, + "rewards/rejected": -3.858450174331665, + "step": 137 + }, + { + "epoch": 2.3389830508474576, + "grad_norm": 8.162386927617444, + "learning_rate": 4.464438569430353e-07, + "logits/chosen": -0.18249069154262543, + "logits/rejected": -0.19290274381637573, + "logps/chosen": -25.261497497558594, + "logps/rejected": -37.97518539428711, + "loss": 0.0598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16362299025058746, + "rewards/margins": 3.9931089878082275, + "rewards/rejected": -3.829486131668091, + "step": 138 + }, + { + "epoch": 2.3559322033898304, + "grad_norm": 6.79825948010009, + "learning_rate": 4.452927720201112e-07, + "logits/chosen": -0.15876157581806183, + "logits/rejected": -0.15914849936962128, + "logps/chosen": -23.805156707763672, + "logps/rejected": -43.227264404296875, + "loss": 0.0702, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14774608612060547, + "rewards/margins": 4.2368483543396, + "rewards/rejected": -4.089102268218994, + "step": 139 + }, + { + "epoch": 2.3728813559322033, + "grad_norm": 5.9070394129722565, + "learning_rate": 4.441309656795106e-07, + "logits/chosen": -0.1470584124326706, + "logits/rejected": -0.12824571132659912, + "logps/chosen": -24.07137107849121, + "logps/rejected": -51.49998474121094, + "loss": 0.0575, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09202487766742706, + "rewards/margins": 4.7118940353393555, + "rewards/rejected": -4.619868755340576, + "step": 140 + }, + { + "epoch": 2.389830508474576, + "grad_norm": 6.6818032600348864, + "learning_rate": 4.429585017034766e-07, + "logits/chosen": -0.12072446942329407, + "logits/rejected": -0.1437748223543167, + "logps/chosen": -26.129920959472656, + "logps/rejected": -50.33393096923828, + "loss": 0.0723, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.024979308247566223, + "rewards/margins": 5.7934794425964355, + "rewards/rejected": -5.768500328063965, + "step": 141 + }, + { + "epoch": 2.406779661016949, + "grad_norm": 5.465459623937437, + "learning_rate": 4.417754444593478e-07, + "logits/chosen": -0.17397671937942505, + "logits/rejected": -0.18419091403484344, + "logps/chosen": -27.539466857910156, + "logps/rejected": -45.487571716308594, + "loss": 0.0487, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023346930742263794, + "rewards/margins": 4.994349479675293, + "rewards/rejected": -4.97100305557251, + "step": 142 + }, + { + "epoch": 2.423728813559322, + "grad_norm": 10.390645074466443, + "learning_rate": 4.4058185889602497e-07, + "logits/chosen": -0.22157034277915955, + "logits/rejected": -0.22870029509067535, + "logps/chosen": -16.434494018554688, + "logps/rejected": -37.32805633544922, + "loss": 0.0972, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3774448335170746, + "rewards/margins": 4.384706497192383, + "rewards/rejected": -4.007261276245117, + "step": 143 + }, + { + "epoch": 2.440677966101695, + "grad_norm": 9.44436087598635, + "learning_rate": 4.39377810540405e-07, + "logits/chosen": -0.21542900800704956, + "logits/rejected": -0.22131392359733582, + "logps/chosen": -36.0152702331543, + "logps/rejected": -38.466373443603516, + "loss": 0.1026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5316247344017029, + "rewards/margins": 2.671638250350952, + "rewards/rejected": -3.2032630443573, + "step": 144 + }, + { + "epoch": 2.457627118644068, + "grad_norm": 5.963157138060162, + "learning_rate": 4.38163365493784e-07, + "logits/chosen": -0.17747551202774048, + "logits/rejected": -0.1994229406118393, + "logps/chosen": -32.599082946777344, + "logps/rejected": -62.15748596191406, + "loss": 0.0596, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2045070230960846, + "rewards/margins": 4.970805644989014, + "rewards/rejected": -4.766298770904541, + "step": 145 + }, + { + "epoch": 2.4745762711864407, + "grad_norm": 6.998829586239467, + "learning_rate": 4.3693859042822774e-07, + "logits/chosen": -0.06130817532539368, + "logits/rejected": -0.04164750128984451, + "logps/chosen": -28.672290802001953, + "logps/rejected": -44.092681884765625, + "loss": 0.064, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5117320418357849, + "rewards/margins": 5.0716657638549805, + "rewards/rejected": -4.559933662414551, + "step": 146 + }, + { + "epoch": 2.4915254237288136, + "grad_norm": 7.186169716835621, + "learning_rate": 4.3570355258291223e-07, + "logits/chosen": -0.16528643667697906, + "logits/rejected": -0.14484813809394836, + "logps/chosen": -27.115493774414062, + "logps/rejected": -36.884578704833984, + "loss": 0.0723, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5564872026443481, + "rewards/margins": 3.266021251678467, + "rewards/rejected": -2.709534168243408, + "step": 147 + }, + { + "epoch": 2.5084745762711864, + "grad_norm": 5.1159064429292735, + "learning_rate": 4.344583197604318e-07, + "logits/chosen": -0.20358271896839142, + "logits/rejected": -0.20041170716285706, + "logps/chosen": -23.109371185302734, + "logps/rejected": -51.53319549560547, + "loss": 0.0434, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.036565251648426056, + "rewards/margins": 5.316205024719238, + "rewards/rejected": -5.279640197753906, + "step": 148 + }, + { + "epoch": 2.5254237288135593, + "grad_norm": 8.402984257771724, + "learning_rate": 4.332029603230767e-07, + "logits/chosen": -0.08776924759149551, + "logits/rejected": -0.07819744944572449, + "logps/chosen": -36.21211624145508, + "logps/rejected": -42.74664306640625, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16677923500537872, + "rewards/margins": 4.416428089141846, + "rewards/rejected": -4.583207130432129, + "step": 149 + }, + { + "epoch": 2.542372881355932, + "grad_norm": 6.450537035637719, + "learning_rate": 4.319375431890806e-07, + "logits/chosen": -0.21261297166347504, + "logits/rejected": -0.15842606127262115, + "logps/chosen": -23.646146774291992, + "logps/rejected": -36.388458251953125, + "loss": 0.0696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24000459909439087, + "rewards/margins": 5.469123363494873, + "rewards/rejected": -5.229118824005127, + "step": 150 + }, + { + "epoch": 2.559322033898305, + "grad_norm": 6.100900257526249, + "learning_rate": 4.306621378288364e-07, + "logits/chosen": -0.12006445229053497, + "logits/rejected": -0.09317637979984283, + "logps/chosen": -25.193214416503906, + "logps/rejected": -50.55509948730469, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.055319640785455704, + "rewards/margins": 4.907276153564453, + "rewards/rejected": -4.9625959396362305, + "step": 151 + }, + { + "epoch": 2.576271186440678, + "grad_norm": 5.335466869594214, + "learning_rate": 4.2937681426108275e-07, + "logits/chosen": -0.156333327293396, + "logits/rejected": -0.1703069657087326, + "logps/chosen": -25.732696533203125, + "logps/rejected": -37.75965118408203, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1128598153591156, + "rewards/margins": 3.560478448867798, + "rewards/rejected": -3.4476187229156494, + "step": 152 + }, + { + "epoch": 2.593220338983051, + "grad_norm": 6.414862486449905, + "learning_rate": 4.280816430490602e-07, + "logits/chosen": -0.14309167861938477, + "logits/rejected": -0.14619530737400055, + "logps/chosen": -23.593332290649414, + "logps/rejected": -41.5565071105957, + "loss": 0.0688, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.14477895200252533, + "rewards/margins": 4.543487071990967, + "rewards/rejected": -4.398708343505859, + "step": 153 + }, + { + "epoch": 2.610169491525424, + "grad_norm": 5.895188410626077, + "learning_rate": 4.2677669529663686e-07, + "logits/chosen": -0.1784745752811432, + "logits/rejected": -0.16759036481380463, + "logps/chosen": -22.0533390045166, + "logps/rejected": -35.54384231567383, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02956150844693184, + "rewards/margins": 4.328366756439209, + "rewards/rejected": -4.298805236816406, + "step": 154 + }, + { + "epoch": 2.6271186440677967, + "grad_norm": 5.912717779717486, + "learning_rate": 4.254620426444053e-07, + "logits/chosen": -0.15713754296302795, + "logits/rejected": -0.1796114146709442, + "logps/chosen": -25.46520233154297, + "logps/rejected": -48.37349319458008, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28247907757759094, + "rewards/margins": 5.51485013961792, + "rewards/rejected": -5.2323713302612305, + "step": 155 + }, + { + "epoch": 2.6440677966101696, + "grad_norm": 5.922436242193146, + "learning_rate": 4.2413775726574923e-07, + "logits/chosen": -0.11942790448665619, + "logits/rejected": -0.11864694207906723, + "logps/chosen": -24.162601470947266, + "logps/rejected": -47.01225280761719, + "loss": 0.0543, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3353565037250519, + "rewards/margins": 4.963751316070557, + "rewards/rejected": -5.299108505249023, + "step": 156 + }, + { + "epoch": 2.6610169491525424, + "grad_norm": 6.106867092542455, + "learning_rate": 4.228039118628815e-07, + "logits/chosen": -0.12817731499671936, + "logits/rejected": -0.09794219583272934, + "logps/chosen": -23.699031829833984, + "logps/rejected": -43.58228302001953, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08896563202142715, + "rewards/margins": 4.032917499542236, + "rewards/rejected": -4.121883392333984, + "step": 157 + }, + { + "epoch": 2.6779661016949152, + "grad_norm": 5.803302086144925, + "learning_rate": 4.214605796628526e-07, + "logits/chosen": -0.2880489230155945, + "logits/rejected": -0.23902469873428345, + "logps/chosen": -23.32792091369629, + "logps/rejected": -45.10264587402344, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43519750237464905, + "rewards/margins": 4.654225826263428, + "rewards/rejected": -5.089423656463623, + "step": 158 + }, + { + "epoch": 2.694915254237288, + "grad_norm": 5.177802734038862, + "learning_rate": 4.201078344135306e-07, + "logits/chosen": -0.24913498759269714, + "logits/rejected": -0.2534574270248413, + "logps/chosen": -24.795732498168945, + "logps/rejected": -42.07280349731445, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02173246443271637, + "rewards/margins": 4.118818283081055, + "rewards/rejected": -4.14055061340332, + "step": 159 + }, + { + "epoch": 2.711864406779661, + "grad_norm": 9.038983465853134, + "learning_rate": 4.187457503795526e-07, + "logits/chosen": -0.18585993349552155, + "logits/rejected": -0.16700756549835205, + "logps/chosen": -27.172670364379883, + "logps/rejected": -34.79685592651367, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2640396058559418, + "rewards/margins": 4.567864894866943, + "rewards/rejected": -4.303825378417969, + "step": 160 + }, + { + "epoch": 2.7288135593220337, + "grad_norm": 5.702053280294616, + "learning_rate": 4.173744023382474e-07, + "logits/chosen": -0.2842308282852173, + "logits/rejected": -0.29381710290908813, + "logps/chosen": -21.896320343017578, + "logps/rejected": -41.444732666015625, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02725343219935894, + "rewards/margins": 4.254402160644531, + "rewards/rejected": -4.227148532867432, + "step": 161 + }, + { + "epoch": 2.7457627118644066, + "grad_norm": 6.4501142174750825, + "learning_rate": 4.159938655755306e-07, + "logits/chosen": -0.1036592572927475, + "logits/rejected": -0.052220165729522705, + "logps/chosen": -26.139209747314453, + "logps/rejected": -46.38983154296875, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15247440338134766, + "rewards/margins": 5.240863800048828, + "rewards/rejected": -5.393338680267334, + "step": 162 + }, + { + "epoch": 2.7627118644067794, + "grad_norm": 5.150964666613272, + "learning_rate": 4.1460421588177094e-07, + "logits/chosen": -0.25343507528305054, + "logits/rejected": -0.24906288087368011, + "logps/chosen": -21.305830001831055, + "logps/rejected": -43.92711639404297, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2159092128276825, + "rewards/margins": 5.304495811462402, + "rewards/rejected": -5.520405292510986, + "step": 163 + }, + { + "epoch": 2.7796610169491527, + "grad_norm": 6.220860659821832, + "learning_rate": 4.1320552954763037e-07, + "logits/chosen": -0.06625357270240784, + "logits/rejected": -0.0591760016977787, + "logps/chosen": -32.38239288330078, + "logps/rejected": -39.54067611694336, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11683804541826248, + "rewards/margins": 3.713731288909912, + "rewards/rejected": -3.83056902885437, + "step": 164 + }, + { + "epoch": 2.7966101694915255, + "grad_norm": 6.305844556479963, + "learning_rate": 4.117978833598747e-07, + "logits/chosen": -0.31626027822494507, + "logits/rejected": -0.28030937910079956, + "logps/chosen": -32.548240661621094, + "logps/rejected": -42.81690979003906, + "loss": 0.0607, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21723094582557678, + "rewards/margins": 4.100663185119629, + "rewards/rejected": -3.883432388305664, + "step": 165 + }, + { + "epoch": 2.8135593220338984, + "grad_norm": 6.559589012838323, + "learning_rate": 4.1038135459715885e-07, + "logits/chosen": -0.2386135458946228, + "logits/rejected": -0.23032473027706146, + "logps/chosen": -15.93246078491211, + "logps/rejected": -36.63377380371094, + "loss": 0.0592, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09315376728773117, + "rewards/margins": 5.372439861297607, + "rewards/rejected": -5.279285907745361, + "step": 166 + }, + { + "epoch": 2.830508474576271, + "grad_norm": 8.346466429496452, + "learning_rate": 4.0895602102578373e-07, + "logits/chosen": -0.19355379045009613, + "logits/rejected": -0.2431831657886505, + "logps/chosen": -29.353004455566406, + "logps/rejected": -47.65980911254883, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30096274614334106, + "rewards/margins": 4.469476699829102, + "rewards/rejected": -4.770439624786377, + "step": 167 + }, + { + "epoch": 2.847457627118644, + "grad_norm": 7.84040587215191, + "learning_rate": 4.075219608954278e-07, + "logits/chosen": -0.0895601287484169, + "logits/rejected": -0.06131096929311752, + "logps/chosen": -21.794588088989258, + "logps/rejected": -46.49802780151367, + "loss": 0.0639, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.053712397813797, + "rewards/margins": 5.101894855499268, + "rewards/rejected": -5.155607223510742, + "step": 168 + }, + { + "epoch": 2.864406779661017, + "grad_norm": 10.599854581213274, + "learning_rate": 4.0607925293484997e-07, + "logits/chosen": -0.26595553755760193, + "logits/rejected": -0.25741392374038696, + "logps/chosen": -26.43805503845215, + "logps/rejected": -34.98290252685547, + "loss": 0.1256, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20261424779891968, + "rewards/margins": 3.2389473915100098, + "rewards/rejected": -3.441561222076416, + "step": 169 + }, + { + "epoch": 2.8813559322033897, + "grad_norm": 7.045992493613005, + "learning_rate": 4.046279763475687e-07, + "logits/chosen": -0.36673855781555176, + "logits/rejected": -0.37882646918296814, + "logps/chosen": -23.698484420776367, + "logps/rejected": -42.687042236328125, + "loss": 0.0617, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29158052802085876, + "rewards/margins": 4.799960136413574, + "rewards/rejected": -5.091540336608887, + "step": 170 + }, + { + "epoch": 2.898305084745763, + "grad_norm": 5.4596269860548645, + "learning_rate": 4.031682108075128e-07, + "logits/chosen": -0.23533686995506287, + "logits/rejected": -0.2579227685928345, + "logps/chosen": -24.494571685791016, + "logps/rejected": -50.30744552612305, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4669819474220276, + "rewards/margins": 5.18317985534668, + "rewards/rejected": -5.6501617431640625, + "step": 171 + }, + { + "epoch": 2.915254237288136, + "grad_norm": 6.6964632868094, + "learning_rate": 4.0170003645464835e-07, + "logits/chosen": -0.28077659010887146, + "logits/rejected": -0.2605874836444855, + "logps/chosen": -30.141586303710938, + "logps/rejected": -43.39360046386719, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2600446939468384, + "rewards/margins": 4.748435020446777, + "rewards/rejected": -5.008480072021484, + "step": 172 + }, + { + "epoch": 2.9322033898305087, + "grad_norm": 6.25941157775491, + "learning_rate": 4.0022353389057793e-07, + "logits/chosen": -0.18370503187179565, + "logits/rejected": -0.15738657116889954, + "logps/chosen": -28.340681076049805, + "logps/rejected": -49.75542068481445, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15175539255142212, + "rewards/margins": 4.8475141525268555, + "rewards/rejected": -4.999269485473633, + "step": 173 + }, + { + "epoch": 2.9491525423728815, + "grad_norm": 5.2666179841342755, + "learning_rate": 3.9873878417411685e-07, + "logits/chosen": -0.25363242626190186, + "logits/rejected": -0.22387123107910156, + "logps/chosen": -30.49943733215332, + "logps/rejected": -51.61265563964844, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4314861297607422, + "rewards/margins": 5.466277122497559, + "rewards/rejected": -5.897763252258301, + "step": 174 + }, + { + "epoch": 2.9661016949152543, + "grad_norm": 6.6142603605122705, + "learning_rate": 3.97245868816842e-07, + "logits/chosen": -0.18011420965194702, + "logits/rejected": -0.14474789798259735, + "logps/chosen": -22.61705207824707, + "logps/rejected": -34.74039840698242, + "loss": 0.0656, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35052689909935, + "rewards/margins": 4.783888816833496, + "rewards/rejected": -4.433361530303955, + "step": 175 + }, + { + "epoch": 2.983050847457627, + "grad_norm": 7.350936104887415, + "learning_rate": 3.95744869778618e-07, + "logits/chosen": -0.09902404993772507, + "logits/rejected": -0.08743295818567276, + "logps/chosen": -33.22180938720703, + "logps/rejected": -48.17066192626953, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3400125801563263, + "rewards/margins": 4.35988712310791, + "rewards/rejected": -4.699898719787598, + "step": 176 + }, + { + "epoch": 3.0, + "grad_norm": 6.41090986992918, + "learning_rate": 3.942358694630967e-07, + "logits/chosen": -0.3509863615036011, + "logits/rejected": -0.3755185306072235, + "logps/chosen": -24.426481246948242, + "logps/rejected": -49.73809051513672, + "loss": 0.0751, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2657313942909241, + "rewards/margins": 4.6201324462890625, + "rewards/rejected": -4.885863780975342, + "step": 177 + }, + { + "epoch": 3.016949152542373, + "grad_norm": 3.543481556246516, + "learning_rate": 3.927189507131938e-07, + "logits/chosen": -0.2855956554412842, + "logits/rejected": -0.2373581826686859, + "logps/chosen": -25.790422439575195, + "logps/rejected": -42.86233139038086, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47834354639053345, + "rewards/margins": 4.5599493980407715, + "rewards/rejected": -5.03829288482666, + "step": 178 + }, + { + "epoch": 3.0338983050847457, + "grad_norm": 4.068888114820521, + "learning_rate": 3.9119419680654083e-07, + "logits/chosen": -0.2456224113702774, + "logits/rejected": -0.23849861323833466, + "logps/chosen": -26.366769790649414, + "logps/rejected": -45.77360153198242, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29546892642974854, + "rewards/margins": 5.436995029449463, + "rewards/rejected": -5.141526222229004, + "step": 179 + }, + { + "epoch": 3.0508474576271185, + "grad_norm": 3.4882014800516408, + "learning_rate": 3.896616914509131e-07, + "logits/chosen": -0.28572219610214233, + "logits/rejected": -0.24028098583221436, + "logps/chosen": -25.306299209594727, + "logps/rejected": -41.360389709472656, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11109927296638489, + "rewards/margins": 5.036979675292969, + "rewards/rejected": -5.148078918457031, + "step": 180 + }, + { + "epoch": 3.0677966101694913, + "grad_norm": 4.061881260336592, + "learning_rate": 3.881215187796344e-07, + "logits/chosen": -0.17325271666049957, + "logits/rejected": -0.15583127737045288, + "logps/chosen": -22.642131805419922, + "logps/rejected": -49.67926025390625, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09581390023231506, + "rewards/margins": 5.447430610656738, + "rewards/rejected": -5.351616382598877, + "step": 181 + }, + { + "epoch": 3.084745762711864, + "grad_norm": 4.520714234908951, + "learning_rate": 3.865737633469579e-07, + "logits/chosen": -0.21125821769237518, + "logits/rejected": -0.16403470933437347, + "logps/chosen": -33.79856872558594, + "logps/rejected": -48.687171936035156, + "loss": 0.0492, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7709572315216064, + "rewards/margins": 5.184902191162109, + "rewards/rejected": -5.955859661102295, + "step": 182 + }, + { + "epoch": 3.1016949152542375, + "grad_norm": 4.245352342549904, + "learning_rate": 3.8501851012342444e-07, + "logits/chosen": -0.28263112902641296, + "logits/rejected": -0.24399010837078094, + "logps/chosen": -29.092899322509766, + "logps/rejected": -49.18566131591797, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2785920202732086, + "rewards/margins": 5.724462032318115, + "rewards/rejected": -6.003054141998291, + "step": 183 + }, + { + "epoch": 3.1186440677966103, + "grad_norm": 4.31037076617115, + "learning_rate": 3.834558444911977e-07, + "logits/chosen": -0.22499172389507294, + "logits/rejected": -0.2413562387228012, + "logps/chosen": -28.549692153930664, + "logps/rejected": -54.757652282714844, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07061734795570374, + "rewards/margins": 5.652264595031738, + "rewards/rejected": -5.722881317138672, + "step": 184 + }, + { + "epoch": 3.135593220338983, + "grad_norm": 4.866640213250526, + "learning_rate": 3.818858522393763e-07, + "logits/chosen": -0.14125032722949982, + "logits/rejected": -0.14179250597953796, + "logps/chosen": -22.976459503173828, + "logps/rejected": -49.11492156982422, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.040695205330848694, + "rewards/margins": 5.625366687774658, + "rewards/rejected": -5.584671974182129, + "step": 185 + }, + { + "epoch": 3.152542372881356, + "grad_norm": 4.2109878427330685, + "learning_rate": 3.8030861955928496e-07, + "logits/chosen": -0.30937284231185913, + "logits/rejected": -0.31210747361183167, + "logps/chosen": -30.636043548583984, + "logps/rejected": -59.81259536743164, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36846521496772766, + "rewards/margins": 5.769496917724609, + "rewards/rejected": -6.137962341308594, + "step": 186 + }, + { + "epoch": 3.169491525423729, + "grad_norm": 4.057066326184392, + "learning_rate": 3.787242330397418e-07, + "logits/chosen": -0.21361833810806274, + "logits/rejected": -0.18969151377677917, + "logps/chosen": -25.21249008178711, + "logps/rejected": -47.042659759521484, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09672415256500244, + "rewards/margins": 5.194358825683594, + "rewards/rejected": -5.291082859039307, + "step": 187 + }, + { + "epoch": 3.1864406779661016, + "grad_norm": 3.447901220325472, + "learning_rate": 3.7713277966230513e-07, + "logits/chosen": -0.2784624397754669, + "logits/rejected": -0.28683120012283325, + "logps/chosen": -36.1049690246582, + "logps/rejected": -57.15819549560547, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007891565561294556, + "rewards/margins": 5.449771881103516, + "rewards/rejected": -5.457663059234619, + "step": 188 + }, + { + "epoch": 3.2033898305084745, + "grad_norm": 4.442046435541958, + "learning_rate": 3.755343467964981e-07, + "logits/chosen": -0.31062349677085876, + "logits/rejected": -0.3004721999168396, + "logps/chosen": -28.58712387084961, + "logps/rejected": -64.2608413696289, + "loss": 0.0371, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4109271168708801, + "rewards/margins": 7.114888668060303, + "rewards/rejected": -7.525815010070801, + "step": 189 + }, + { + "epoch": 3.2203389830508473, + "grad_norm": 3.0719724662002896, + "learning_rate": 3.739290221950123e-07, + "logits/chosen": -0.17614498734474182, + "logits/rejected": -0.1161608174443245, + "logps/chosen": -19.90385627746582, + "logps/rejected": -48.33121871948242, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1976543366909027, + "rewards/margins": 6.546693325042725, + "rewards/rejected": -6.349039077758789, + "step": 190 + }, + { + "epoch": 3.23728813559322, + "grad_norm": 3.947699710282849, + "learning_rate": 3.723168939888901e-07, + "logits/chosen": -0.2788640558719635, + "logits/rejected": -0.2216426283121109, + "logps/chosen": -31.930301666259766, + "logps/rejected": -48.188316345214844, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2625292241573334, + "rewards/margins": 6.2579474449157715, + "rewards/rejected": -5.995418548583984, + "step": 191 + }, + { + "epoch": 3.2542372881355934, + "grad_norm": 3.948675289926565, + "learning_rate": 3.7069805068268624e-07, + "logits/chosen": -0.24821209907531738, + "logits/rejected": -0.2691497802734375, + "logps/chosen": -23.103912353515625, + "logps/rejected": -45.67485427856445, + "loss": 0.051, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5271704792976379, + "rewards/margins": 5.4129743576049805, + "rewards/rejected": -5.9401445388793945, + "step": 192 + }, + { + "epoch": 3.2711864406779663, + "grad_norm": 3.204036420155872, + "learning_rate": 3.6907258114960915e-07, + "logits/chosen": -0.20090129971504211, + "logits/rejected": -0.1883653998374939, + "logps/chosen": -21.614791870117188, + "logps/rejected": -36.44792556762695, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26388826966285706, + "rewards/margins": 5.233245372772217, + "rewards/rejected": -5.497133255004883, + "step": 193 + }, + { + "epoch": 3.288135593220339, + "grad_norm": 4.608553625728515, + "learning_rate": 3.6744057462664194e-07, + "logits/chosen": -0.22761565446853638, + "logits/rejected": -0.18411225080490112, + "logps/chosen": -33.556297302246094, + "logps/rejected": -45.10346984863281, + "loss": 0.0426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32763671875, + "rewards/margins": 5.759217262268066, + "rewards/rejected": -6.086853504180908, + "step": 194 + }, + { + "epoch": 3.305084745762712, + "grad_norm": 3.3003212602613052, + "learning_rate": 3.658021207096432e-07, + "logits/chosen": -0.26821860671043396, + "logits/rejected": -0.23487797379493713, + "logps/chosen": -26.26876449584961, + "logps/rejected": -39.17176818847656, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06157127395272255, + "rewards/margins": 4.7874369621276855, + "rewards/rejected": -4.725865364074707, + "step": 195 + }, + { + "epoch": 3.3220338983050848, + "grad_norm": 4.8557388954783915, + "learning_rate": 3.6415730934842825e-07, + "logits/chosen": -0.2502498924732208, + "logits/rejected": -0.21418914198875427, + "logps/chosen": -24.12335205078125, + "logps/rejected": -39.51020431518555, + "loss": 0.047, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3541201651096344, + "rewards/margins": 5.43333101272583, + "rewards/rejected": -5.07921028137207, + "step": 196 + }, + { + "epoch": 3.3389830508474576, + "grad_norm": 3.1710739557100025, + "learning_rate": 3.625062308418311e-07, + "logits/chosen": -0.19088196754455566, + "logits/rejected": -0.1449725329875946, + "logps/chosen": -41.92289733886719, + "logps/rejected": -52.62822341918945, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7417705059051514, + "rewards/margins": 5.704789161682129, + "rewards/rejected": -6.446559906005859, + "step": 197 + }, + { + "epoch": 3.3559322033898304, + "grad_norm": 3.8833880103526273, + "learning_rate": 3.6084897583274715e-07, + "logits/chosen": -0.33713212609291077, + "logits/rejected": -0.32788529992103577, + "logps/chosen": -18.311298370361328, + "logps/rejected": -47.206260681152344, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28597769141197205, + "rewards/margins": 5.729028701782227, + "rewards/rejected": -6.015005588531494, + "step": 198 + }, + { + "epoch": 3.3728813559322033, + "grad_norm": 3.971746818851194, + "learning_rate": 3.591856353031566e-07, + "logits/chosen": -0.388487309217453, + "logits/rejected": -0.3937668800354004, + "logps/chosen": -20.602941513061523, + "logps/rejected": -46.418514251708984, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0760723352432251, + "rewards/margins": 6.133167266845703, + "rewards/rejected": -6.209239959716797, + "step": 199 + }, + { + "epoch": 3.389830508474576, + "grad_norm": 2.669544955188557, + "learning_rate": 3.5751630056913013e-07, + "logits/chosen": -0.28054508566856384, + "logits/rejected": -0.24293102324008942, + "logps/chosen": -24.345874786376953, + "logps/rejected": -43.055397033691406, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006254836916923523, + "rewards/margins": 5.404486179351807, + "rewards/rejected": -5.410740375518799, + "step": 200 + }, + { + "epoch": 3.406779661016949, + "grad_norm": 3.472014476230378, + "learning_rate": 3.558410632758153e-07, + "logits/chosen": -0.3892117142677307, + "logits/rejected": -0.3841942548751831, + "logps/chosen": -22.507129669189453, + "logps/rejected": -45.49005126953125, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08385208249092102, + "rewards/margins": 4.855816841125488, + "rewards/rejected": -4.939668655395508, + "step": 201 + }, + { + "epoch": 3.423728813559322, + "grad_norm": 3.5814887606335124, + "learning_rate": 3.5416001539240574e-07, + "logits/chosen": -0.300984263420105, + "logits/rejected": -0.28749731183052063, + "logps/chosen": -22.618236541748047, + "logps/rejected": -54.328731536865234, + "loss": 0.0344, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42270928621292114, + "rewards/margins": 6.060704708099365, + "rewards/rejected": -6.4834136962890625, + "step": 202 + }, + { + "epoch": 3.440677966101695, + "grad_norm": 3.9783986017754, + "learning_rate": 3.5247324920709147e-07, + "logits/chosen": -0.11381550878286362, + "logits/rejected": -0.10474348813295364, + "logps/chosen": -29.523387908935547, + "logps/rejected": -44.939971923828125, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5131514668464661, + "rewards/margins": 4.85312557220459, + "rewards/rejected": -5.36627721786499, + "step": 203 + }, + { + "epoch": 3.457627118644068, + "grad_norm": 2.90882629880929, + "learning_rate": 3.5078085732199307e-07, + "logits/chosen": -0.17035694420337677, + "logits/rejected": -0.14843972027301788, + "logps/chosen": -24.29421615600586, + "logps/rejected": -47.5906982421875, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.775327742099762, + "rewards/margins": 5.236928462982178, + "rewards/rejected": -6.012256145477295, + "step": 204 + }, + { + "epoch": 3.4745762711864407, + "grad_norm": 3.5359065761216906, + "learning_rate": 3.490829326480773e-07, + "logits/chosen": -0.2077549546957016, + "logits/rejected": -0.139791339635849, + "logps/chosen": -29.458728790283203, + "logps/rejected": -46.196311950683594, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43501347303390503, + "rewards/margins": 5.490588188171387, + "rewards/rejected": -5.925601005554199, + "step": 205 + }, + { + "epoch": 3.4915254237288136, + "grad_norm": 3.5732057063389924, + "learning_rate": 3.4737956840005684e-07, + "logits/chosen": -0.24159546196460724, + "logits/rejected": -0.21804997324943542, + "logps/chosen": -22.523195266723633, + "logps/rejected": -40.27927780151367, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2203037589788437, + "rewards/margins": 4.918404579162598, + "rewards/rejected": -5.138708114624023, + "step": 206 + }, + { + "epoch": 3.5084745762711864, + "grad_norm": 3.2368948031127402, + "learning_rate": 3.4567085809127245e-07, + "logits/chosen": -0.3044562339782715, + "logits/rejected": -0.28132855892181396, + "logps/chosen": -23.9556827545166, + "logps/rejected": -54.27796173095703, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4134722352027893, + "rewards/margins": 6.515480041503906, + "rewards/rejected": -6.928952217102051, + "step": 207 + }, + { + "epoch": 3.5254237288135593, + "grad_norm": 3.9848937033562515, + "learning_rate": 3.439568955285595e-07, + "logits/chosen": -0.3248399794101715, + "logits/rejected": -0.2991315722465515, + "logps/chosen": -19.110692977905273, + "logps/rejected": -47.77824401855469, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6731768846511841, + "rewards/margins": 6.201772689819336, + "rewards/rejected": -6.874949932098389, + "step": 208 + }, + { + "epoch": 3.542372881355932, + "grad_norm": 3.1498741156916186, + "learning_rate": 3.4223777480709804e-07, + "logits/chosen": -0.3734952211380005, + "logits/rejected": -0.32552629709243774, + "logps/chosen": -18.623991012573242, + "logps/rejected": -42.553443908691406, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30939486622810364, + "rewards/margins": 5.721473217010498, + "rewards/rejected": -6.030868053436279, + "step": 209 + }, + { + "epoch": 3.559322033898305, + "grad_norm": 4.040639255967625, + "learning_rate": 3.405135903052465e-07, + "logits/chosen": -0.4112386703491211, + "logits/rejected": -0.3649882376194, + "logps/chosen": -28.818723678588867, + "logps/rejected": -44.70659637451172, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48197856545448303, + "rewards/margins": 5.537832260131836, + "rewards/rejected": -6.019810676574707, + "step": 210 + }, + { + "epoch": 3.576271186440678, + "grad_norm": 3.3478217712753966, + "learning_rate": 3.3878443667936136e-07, + "logits/chosen": -0.16748064756393433, + "logits/rejected": -0.19592073559761047, + "logps/chosen": -37.14228439331055, + "logps/rejected": -62.434722900390625, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.086951494216919, + "rewards/margins": 6.2296953201293945, + "rewards/rejected": -7.316647529602051, + "step": 211 + }, + { + "epoch": 3.593220338983051, + "grad_norm": 3.9463700359583074, + "learning_rate": 3.3705040885859967e-07, + "logits/chosen": -0.3255730867385864, + "logits/rejected": -0.27438968420028687, + "logps/chosen": -34.4691276550293, + "logps/rejected": -47.688350677490234, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7244514226913452, + "rewards/margins": 5.521853446960449, + "rewards/rejected": -6.246304988861084, + "step": 212 + }, + { + "epoch": 3.610169491525424, + "grad_norm": 3.6196960397708686, + "learning_rate": 3.3531160203970805e-07, + "logits/chosen": -0.3483354151248932, + "logits/rejected": -0.317913681268692, + "logps/chosen": -28.75990867614746, + "logps/rejected": -48.366981506347656, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.642256498336792, + "rewards/margins": 5.73888635635376, + "rewards/rejected": -6.381142616271973, + "step": 213 + }, + { + "epoch": 3.6271186440677967, + "grad_norm": 4.95065620942278, + "learning_rate": 3.3356811168179627e-07, + "logits/chosen": -0.20646288990974426, + "logits/rejected": -0.18285736441612244, + "logps/chosen": -29.683345794677734, + "logps/rejected": -42.32093811035156, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3692317008972168, + "rewards/margins": 6.064602851867676, + "rewards/rejected": -6.433835029602051, + "step": 214 + }, + { + "epoch": 3.6440677966101696, + "grad_norm": 3.3699006260035813, + "learning_rate": 3.318200335010967e-07, + "logits/chosen": -0.42737993597984314, + "logits/rejected": -0.3845828175544739, + "logps/chosen": -25.335176467895508, + "logps/rejected": -42.636924743652344, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28532662987709045, + "rewards/margins": 6.151418685913086, + "rewards/rejected": -5.866091728210449, + "step": 215 + }, + { + "epoch": 3.6610169491525424, + "grad_norm": 3.8837939121598777, + "learning_rate": 3.3006746346570935e-07, + "logits/chosen": -0.40326201915740967, + "logits/rejected": -0.40920883417129517, + "logps/chosen": -22.64775848388672, + "logps/rejected": -39.44330596923828, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18482859432697296, + "rewards/margins": 5.748718738555908, + "rewards/rejected": -5.933547019958496, + "step": 216 + }, + { + "epoch": 3.6779661016949152, + "grad_norm": 4.333458578457773, + "learning_rate": 3.2831049779033395e-07, + "logits/chosen": -0.443619042634964, + "logits/rejected": -0.41168978810310364, + "logps/chosen": -37.534263610839844, + "logps/rejected": -64.37035369873047, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5087466835975647, + "rewards/margins": 7.267013072967529, + "rewards/rejected": -7.775759696960449, + "step": 217 + }, + { + "epoch": 3.694915254237288, + "grad_norm": 4.250140275463436, + "learning_rate": 3.2654923293098666e-07, + "logits/chosen": -0.2549651861190796, + "logits/rejected": -0.1890694946050644, + "logps/chosen": -26.34837532043457, + "logps/rejected": -43.935028076171875, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7253862023353577, + "rewards/margins": 5.679473400115967, + "rewards/rejected": -6.40485954284668, + "step": 218 + }, + { + "epoch": 3.711864406779661, + "grad_norm": 3.310632766464627, + "learning_rate": 3.247837655797061e-07, + "logits/chosen": -0.25092679262161255, + "logits/rejected": -0.28778067231178284, + "logps/chosen": -24.404443740844727, + "logps/rejected": -47.01846694946289, + "loss": 0.0223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29181113839149475, + "rewards/margins": 6.665236473083496, + "rewards/rejected": -6.957046985626221, + "step": 219 + }, + { + "epoch": 3.7288135593220337, + "grad_norm": 3.1436162956199496, + "learning_rate": 3.2301419265924393e-07, + "logits/chosen": -0.4150010645389557, + "logits/rejected": -0.36361223459243774, + "logps/chosen": -24.460697174072266, + "logps/rejected": -44.857032775878906, + "loss": 0.0291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22335419058799744, + "rewards/margins": 6.003718852996826, + "rewards/rejected": -6.227072715759277, + "step": 220 + }, + { + "epoch": 3.7457627118644066, + "grad_norm": 3.729031618521559, + "learning_rate": 3.2124061131774443e-07, + "logits/chosen": -0.3509747385978699, + "logits/rejected": -0.358395516872406, + "logps/chosen": -24.089895248413086, + "logps/rejected": -52.84262466430664, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14366820454597473, + "rewards/margins": 5.806227207183838, + "rewards/rejected": -5.94989538192749, + "step": 221 + }, + { + "epoch": 3.7627118644067794, + "grad_norm": 3.3066593649570315, + "learning_rate": 3.194631189234109e-07, + "logits/chosen": -0.4065392017364502, + "logits/rejected": -0.37751972675323486, + "logps/chosen": -32.56217956542969, + "logps/rejected": -45.78569412231445, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3193157911300659, + "rewards/margins": 5.7366862297058105, + "rewards/rejected": -6.056003093719482, + "step": 222 + }, + { + "epoch": 3.7796610169491527, + "grad_norm": 2.793162644598459, + "learning_rate": 3.1768181305916063e-07, + "logits/chosen": -0.25837022066116333, + "logits/rejected": -0.22268140316009521, + "logps/chosen": -35.988895416259766, + "logps/rejected": -54.8642463684082, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6435793042182922, + "rewards/margins": 6.045925140380859, + "rewards/rejected": -6.689504623413086, + "step": 223 + }, + { + "epoch": 3.7966101694915255, + "grad_norm": 5.31296637675809, + "learning_rate": 3.158967915172669e-07, + "logits/chosen": -0.25623688101768494, + "logits/rejected": -0.2494334727525711, + "logps/chosen": -25.375301361083984, + "logps/rejected": -41.08918380737305, + "loss": 0.0465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.390929639339447, + "rewards/margins": 5.473989009857178, + "rewards/rejected": -5.864918231964111, + "step": 224 + }, + { + "epoch": 3.8135593220338984, + "grad_norm": 3.9032619129323582, + "learning_rate": 3.141081522939911e-07, + "logits/chosen": -0.31211555004119873, + "logits/rejected": -0.23420506715774536, + "logps/chosen": -35.506065368652344, + "logps/rejected": -45.37016296386719, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28194302320480347, + "rewards/margins": 5.861372947692871, + "rewards/rejected": -6.14331579208374, + "step": 225 + }, + { + "epoch": 3.830508474576271, + "grad_norm": 3.3703773992777712, + "learning_rate": 3.1231599358420233e-07, + "logits/chosen": -0.2667548954486847, + "logits/rejected": -0.237786203622818, + "logps/chosen": -25.19987678527832, + "logps/rejected": -42.388084411621094, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0639249086380005, + "rewards/margins": 5.1388325691223145, + "rewards/rejected": -6.202757835388184, + "step": 226 + }, + { + "epoch": 3.847457627118644, + "grad_norm": 3.2509261883963583, + "learning_rate": 3.105204137759867e-07, + "logits/chosen": -0.35733070969581604, + "logits/rejected": -0.29906269907951355, + "logps/chosen": -31.326122283935547, + "logps/rejected": -54.50325012207031, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14473173022270203, + "rewards/margins": 6.546075820922852, + "rewards/rejected": -6.690806865692139, + "step": 227 + }, + { + "epoch": 3.864406779661017, + "grad_norm": 4.276773716118761, + "learning_rate": 3.0872151144524594e-07, + "logits/chosen": -0.40903520584106445, + "logits/rejected": -0.42379483580589294, + "logps/chosen": -25.51406478881836, + "logps/rejected": -56.04070281982422, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5232114791870117, + "rewards/margins": 7.154451370239258, + "rewards/rejected": -7.6776628494262695, + "step": 228 + }, + { + "epoch": 3.8813559322033897, + "grad_norm": 3.0586357868954885, + "learning_rate": 3.069193853502855e-07, + "logits/chosen": -0.35119858384132385, + "logits/rejected": -0.31669121980667114, + "logps/chosen": -26.634798049926758, + "logps/rejected": -43.51852798461914, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6264432668685913, + "rewards/margins": 5.7952752113342285, + "rewards/rejected": -6.421718597412109, + "step": 229 + }, + { + "epoch": 3.898305084745763, + "grad_norm": 3.809867857045704, + "learning_rate": 3.0511413442639297e-07, + "logits/chosen": -0.3418273329734802, + "logits/rejected": -0.3366440534591675, + "logps/chosen": -26.767898559570312, + "logps/rejected": -66.91107940673828, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3621063232421875, + "rewards/margins": 8.028979301452637, + "rewards/rejected": -9.391084671020508, + "step": 230 + }, + { + "epoch": 3.915254237288136, + "grad_norm": 1.7319311965224584, + "learning_rate": 3.0330585778040675e-07, + "logits/chosen": -0.22780543565750122, + "logits/rejected": -0.1367052048444748, + "logps/chosen": -19.499248504638672, + "logps/rejected": -37.6104736328125, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3209352195262909, + "rewards/margins": 6.441976070404053, + "rewards/rejected": -6.1210408210754395, + "step": 231 + }, + { + "epoch": 3.9322033898305087, + "grad_norm": 2.919480742746747, + "learning_rate": 3.0149465468527457e-07, + "logits/chosen": -0.3633422255516052, + "logits/rejected": -0.3510938286781311, + "logps/chosen": -24.75160026550293, + "logps/rejected": -43.96453094482422, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.055519312620162964, + "rewards/margins": 6.384317874908447, + "rewards/rejected": -6.328798294067383, + "step": 232 + }, + { + "epoch": 3.9491525423728815, + "grad_norm": 2.6875831264015626, + "learning_rate": 2.9968062457460437e-07, + "logits/chosen": -0.30877232551574707, + "logits/rejected": -0.2673957049846649, + "logps/chosen": -22.01394271850586, + "logps/rejected": -46.45256042480469, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.722076416015625, + "rewards/margins": 6.105856895446777, + "rewards/rejected": -6.827932834625244, + "step": 233 + }, + { + "epoch": 3.9661016949152543, + "grad_norm": 5.291054230890989, + "learning_rate": 2.978638670372047e-07, + "logits/chosen": -0.33912044763565063, + "logits/rejected": -0.2657839357852936, + "logps/chosen": -30.723812103271484, + "logps/rejected": -52.49626159667969, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.223615050315857, + "rewards/margins": 6.520167350769043, + "rewards/rejected": -7.743781089782715, + "step": 234 + }, + { + "epoch": 3.983050847457627, + "grad_norm": 4.5082449746889495, + "learning_rate": 2.9604448181161755e-07, + "logits/chosen": -0.2287699282169342, + "logits/rejected": -0.278522789478302, + "logps/chosen": -21.338584899902344, + "logps/rejected": -43.86865234375, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26738277077674866, + "rewards/margins": 5.24444580078125, + "rewards/rejected": -5.511828422546387, + "step": 235 + }, + { + "epoch": 4.0, + "grad_norm": 2.760408994676017, + "learning_rate": 2.9422256878064324e-07, + "logits/chosen": -0.25730714201927185, + "logits/rejected": -0.24561913311481476, + "logps/chosen": -39.164676666259766, + "logps/rejected": -58.313934326171875, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.588261127471924, + "rewards/margins": 6.12579870223999, + "rewards/rejected": -8.71406078338623, + "step": 236 + }, + { + "epoch": 4.016949152542373, + "grad_norm": 2.2785410277469302, + "learning_rate": 2.923982279658564e-07, + "logits/chosen": -0.34395280480384827, + "logits/rejected": -0.23966065049171448, + "logps/chosen": -38.35492706298828, + "logps/rejected": -53.40243148803711, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2421057224273682, + "rewards/margins": 6.917564392089844, + "rewards/rejected": -8.159669876098633, + "step": 237 + }, + { + "epoch": 4.033898305084746, + "grad_norm": 3.2845417722614507, + "learning_rate": 2.90571559522115e-07, + "logits/chosen": -0.13574184477329254, + "logits/rejected": -0.11650273948907852, + "logps/chosen": -27.581148147583008, + "logps/rejected": -39.88399887084961, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3537464737892151, + "rewards/margins": 5.475383281707764, + "rewards/rejected": -5.829129695892334, + "step": 238 + }, + { + "epoch": 4.0508474576271185, + "grad_norm": 2.1631508501013315, + "learning_rate": 2.8874266373206215e-07, + "logits/chosen": -0.3121250867843628, + "logits/rejected": -0.24592992663383484, + "logps/chosen": -29.24790382385254, + "logps/rejected": -47.294334411621094, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47773995995521545, + "rewards/margins": 5.754822254180908, + "rewards/rejected": -6.2325615882873535, + "step": 239 + }, + { + "epoch": 4.067796610169491, + "grad_norm": 2.4004940122434544, + "learning_rate": 2.8691164100062034e-07, + "logits/chosen": -0.36053359508514404, + "logits/rejected": -0.34572604298591614, + "logps/chosen": -31.519865036010742, + "logps/rejected": -59.80055618286133, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.718724250793457, + "rewards/margins": 7.598670959472656, + "rewards/rejected": -8.317395210266113, + "step": 240 + }, + { + "epoch": 4.084745762711864, + "grad_norm": 2.6119447962907367, + "learning_rate": 2.8507859184947953e-07, + "logits/chosen": -0.43051332235336304, + "logits/rejected": -0.4282737076282501, + "logps/chosen": -26.50347137451172, + "logps/rejected": -52.22574234008789, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.289501428604126, + "rewards/margins": 6.555995941162109, + "rewards/rejected": -6.845498085021973, + "step": 241 + }, + { + "epoch": 4.101694915254237, + "grad_norm": 2.239976713467154, + "learning_rate": 2.8324361691157853e-07, + "logits/chosen": -0.24347716569900513, + "logits/rejected": -0.24979354441165924, + "logps/chosen": -30.006914138793945, + "logps/rejected": -59.73139190673828, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8718441128730774, + "rewards/margins": 6.680701732635498, + "rewards/rejected": -7.55254602432251, + "step": 242 + }, + { + "epoch": 4.11864406779661, + "grad_norm": 2.6364819568694497, + "learning_rate": 2.8140681692558034e-07, + "logits/chosen": -0.25327029824256897, + "logits/rejected": -0.21109545230865479, + "logps/chosen": -29.609922409057617, + "logps/rejected": -46.73149490356445, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5046648979187012, + "rewards/margins": 6.538877487182617, + "rewards/rejected": -7.043542385101318, + "step": 243 + }, + { + "epoch": 4.135593220338983, + "grad_norm": 2.1372418800599786, + "learning_rate": 2.7956829273034146e-07, + "logits/chosen": -0.13386383652687073, + "logits/rejected": -0.1250249445438385, + "logps/chosen": -26.58926773071289, + "logps/rejected": -51.22819900512695, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8299556970596313, + "rewards/margins": 6.830400466918945, + "rewards/rejected": -7.660356521606445, + "step": 244 + }, + { + "epoch": 4.1525423728813555, + "grad_norm": 2.9141398948843804, + "learning_rate": 2.7772814525937634e-07, + "logits/chosen": -0.32944080233573914, + "logits/rejected": -0.27718019485473633, + "logps/chosen": -28.87648582458496, + "logps/rejected": -48.459808349609375, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7797695994377136, + "rewards/margins": 6.1633992195129395, + "rewards/rejected": -6.943169116973877, + "step": 245 + }, + { + "epoch": 4.169491525423728, + "grad_norm": 2.497866238527004, + "learning_rate": 2.7588647553531576e-07, + "logits/chosen": -0.25429630279541016, + "logits/rejected": -0.23005954921245575, + "logps/chosen": -25.186725616455078, + "logps/rejected": -55.01511001586914, + "loss": 0.0223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38165754079818726, + "rewards/margins": 7.027002334594727, + "rewards/rejected": -7.408658981323242, + "step": 246 + }, + { + "epoch": 4.186440677966102, + "grad_norm": 1.7406944144206382, + "learning_rate": 2.7404338466436116e-07, + "logits/chosen": -0.2958889901638031, + "logits/rejected": -0.26341933012008667, + "logps/chosen": -28.1710205078125, + "logps/rejected": -50.88844299316406, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06356866657733917, + "rewards/margins": 7.469226360321045, + "rewards/rejected": -7.532794952392578, + "step": 247 + }, + { + "epoch": 4.203389830508475, + "grad_norm": 2.330847556376873, + "learning_rate": 2.721989738307337e-07, + "logits/chosen": -0.3691413104534149, + "logits/rejected": -0.35948917269706726, + "logps/chosen": -29.122577667236328, + "logps/rejected": -46.696510314941406, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09089075028896332, + "rewards/margins": 4.961187839508057, + "rewards/rejected": -5.052079200744629, + "step": 248 + }, + { + "epoch": 4.220338983050848, + "grad_norm": 2.5507390864394046, + "learning_rate": 2.7035334429111955e-07, + "logits/chosen": -0.22923773527145386, + "logits/rejected": -0.1796061396598816, + "logps/chosen": -37.402748107910156, + "logps/rejected": -61.04646682739258, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6148930191993713, + "rewards/margins": 6.983782768249512, + "rewards/rejected": -7.598675727844238, + "step": 249 + }, + { + "epoch": 4.237288135593221, + "grad_norm": 1.95547934634835, + "learning_rate": 2.685065973691107e-07, + "logits/chosen": -0.20895695686340332, + "logits/rejected": -0.2264058142900467, + "logps/chosen": -31.016735076904297, + "logps/rejected": -56.749725341796875, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8533796072006226, + "rewards/margins": 6.410269737243652, + "rewards/rejected": -7.2636494636535645, + "step": 250 + }, + { + "epoch": 4.254237288135593, + "grad_norm": 2.1411469355757973, + "learning_rate": 2.6665883444964277e-07, + "logits/chosen": -0.16789795458316803, + "logits/rejected": -0.14672429859638214, + "logps/chosen": -23.094444274902344, + "logps/rejected": -55.99787139892578, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8806984424591064, + "rewards/margins": 8.1028470993042, + "rewards/rejected": -8.983545303344727, + "step": 251 + }, + { + "epoch": 4.271186440677966, + "grad_norm": 2.372366174155855, + "learning_rate": 2.6481015697342856e-07, + "logits/chosen": -0.3404889404773712, + "logits/rejected": -0.32007667422294617, + "logps/chosen": -19.16732406616211, + "logps/rejected": -42.858253479003906, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46728387475013733, + "rewards/margins": 5.529178619384766, + "rewards/rejected": -5.996462345123291, + "step": 252 + }, + { + "epoch": 4.288135593220339, + "grad_norm": 1.958723562417606, + "learning_rate": 2.629606664313896e-07, + "logits/chosen": -0.35188454389572144, + "logits/rejected": -0.3609326481819153, + "logps/chosen": -25.61526107788086, + "logps/rejected": -50.27090072631836, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.02239191532135, + "rewards/margins": 6.072734832763672, + "rewards/rejected": -7.095126628875732, + "step": 253 + }, + { + "epoch": 4.305084745762712, + "grad_norm": 2.0112122888894115, + "learning_rate": 2.611104643590838e-07, + "logits/chosen": -0.29033514857292175, + "logits/rejected": -0.26703035831451416, + "logps/chosen": -21.255908966064453, + "logps/rejected": -53.08380126953125, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24649456143379211, + "rewards/margins": 7.01984977722168, + "rewards/rejected": -7.26634407043457, + "step": 254 + }, + { + "epoch": 4.322033898305085, + "grad_norm": 2.053603221627952, + "learning_rate": 2.592596523311317e-07, + "logits/chosen": -0.30223536491394043, + "logits/rejected": -0.2536553740501404, + "logps/chosen": -32.25640106201172, + "logps/rejected": -43.348167419433594, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06762519478797913, + "rewards/margins": 6.735665321350098, + "rewards/rejected": -6.803289890289307, + "step": 255 + }, + { + "epoch": 4.338983050847458, + "grad_norm": 2.8451217392600707, + "learning_rate": 2.5740833195563994e-07, + "logits/chosen": -0.3592408299446106, + "logits/rejected": -0.32396936416625977, + "logps/chosen": -29.65281867980957, + "logps/rejected": -46.565242767333984, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1844983100891113, + "rewards/margins": 5.651597499847412, + "rewards/rejected": -6.836095809936523, + "step": 256 + }, + { + "epoch": 4.3559322033898304, + "grad_norm": 2.4064312299996398, + "learning_rate": 2.5555660486862293e-07, + "logits/chosen": -0.3634299039840698, + "logits/rejected": -0.3088497817516327, + "logps/chosen": -28.253725051879883, + "logps/rejected": -48.81061553955078, + "loss": 0.021, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4266662895679474, + "rewards/margins": 6.367308616638184, + "rewards/rejected": -6.793975353240967, + "step": 257 + }, + { + "epoch": 4.372881355932203, + "grad_norm": 2.146631041454485, + "learning_rate": 2.5370457272842315e-07, + "logits/chosen": -0.24686959385871887, + "logits/rejected": -0.18535006046295166, + "logps/chosen": -33.190582275390625, + "logps/rejected": -48.917503356933594, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8520054817199707, + "rewards/margins": 5.909256935119629, + "rewards/rejected": -6.761262893676758, + "step": 258 + }, + { + "epoch": 4.389830508474576, + "grad_norm": 2.0738992157558642, + "learning_rate": 2.5185233721013053e-07, + "logits/chosen": -0.359385222196579, + "logits/rejected": -0.357438325881958, + "logps/chosen": -24.519697189331055, + "logps/rejected": -44.44859313964844, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16271838545799255, + "rewards/margins": 6.145666122436523, + "rewards/rejected": -6.308384418487549, + "step": 259 + }, + { + "epoch": 4.406779661016949, + "grad_norm": 3.2910145632235572, + "learning_rate": 2.5e-07, + "logits/chosen": -0.11309901624917984, + "logits/rejected": -0.11735934764146805, + "logps/chosen": -24.769031524658203, + "logps/rejected": -52.153263092041016, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5375908017158508, + "rewards/margins": 6.348197937011719, + "rewards/rejected": -6.885788917541504, + "step": 260 + }, + { + "epoch": 4.423728813559322, + "grad_norm": 1.9904353477375836, + "learning_rate": 2.4814766278986944e-07, + "logits/chosen": -0.3224155604839325, + "logits/rejected": -0.2858419716358185, + "logps/chosen": -29.066646575927734, + "logps/rejected": -63.335533142089844, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6143862009048462, + "rewards/margins": 7.916276931762695, + "rewards/rejected": -8.530662536621094, + "step": 261 + }, + { + "epoch": 4.440677966101695, + "grad_norm": 2.8941606742565, + "learning_rate": 2.462954272715768e-07, + "logits/chosen": -0.450508177280426, + "logits/rejected": -0.4239945411682129, + "logps/chosen": -35.900840759277344, + "logps/rejected": -45.3778190612793, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7406729459762573, + "rewards/margins": 4.7124552726745605, + "rewards/rejected": -6.453128337860107, + "step": 262 + }, + { + "epoch": 4.4576271186440675, + "grad_norm": 2.38515146244392, + "learning_rate": 2.4444339513137716e-07, + "logits/chosen": -0.38119906187057495, + "logits/rejected": -0.36609771847724915, + "logps/chosen": -30.711692810058594, + "logps/rejected": -60.621646881103516, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.59708571434021, + "rewards/margins": 8.203582763671875, + "rewards/rejected": -8.800668716430664, + "step": 263 + }, + { + "epoch": 4.47457627118644, + "grad_norm": 1.9234760349513347, + "learning_rate": 2.4259166804436003e-07, + "logits/chosen": -0.3686653971672058, + "logits/rejected": -0.31526994705200195, + "logps/chosen": -32.2381706237793, + "logps/rejected": -53.97626876831055, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1651465892791748, + "rewards/margins": 6.137485027313232, + "rewards/rejected": -7.302631855010986, + "step": 264 + }, + { + "epoch": 4.491525423728813, + "grad_norm": 2.611964451389311, + "learning_rate": 2.4074034766886826e-07, + "logits/chosen": -0.3324103355407715, + "logits/rejected": -0.26400357484817505, + "logps/chosen": -23.483598709106445, + "logps/rejected": -49.0655632019043, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20981693267822266, + "rewards/margins": 7.410755157470703, + "rewards/rejected": -7.620572090148926, + "step": 265 + }, + { + "epoch": 4.508474576271187, + "grad_norm": 2.9683559733463056, + "learning_rate": 2.3888953564091616e-07, + "logits/chosen": -0.39179760217666626, + "logits/rejected": -0.38096728920936584, + "logps/chosen": -31.189739227294922, + "logps/rejected": -53.24143600463867, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5632021427154541, + "rewards/margins": 6.741451263427734, + "rewards/rejected": -7.304653167724609, + "step": 266 + }, + { + "epoch": 4.52542372881356, + "grad_norm": 2.385454067550593, + "learning_rate": 2.3703933356861044e-07, + "logits/chosen": -0.41365846991539, + "logits/rejected": -0.41495996713638306, + "logps/chosen": -29.90151596069336, + "logps/rejected": -53.238502502441406, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4617680311203003, + "rewards/margins": 7.544755458831787, + "rewards/rejected": -9.006523132324219, + "step": 267 + }, + { + "epoch": 4.5423728813559325, + "grad_norm": 2.2006915125969946, + "learning_rate": 2.3518984302657144e-07, + "logits/chosen": -0.27264100313186646, + "logits/rejected": -0.29000911116600037, + "logps/chosen": -22.446334838867188, + "logps/rejected": -57.679481506347656, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5886446833610535, + "rewards/margins": 7.639779090881348, + "rewards/rejected": -8.228424072265625, + "step": 268 + }, + { + "epoch": 4.559322033898305, + "grad_norm": 2.150067392598777, + "learning_rate": 2.333411655503572e-07, + "logits/chosen": -0.2162581980228424, + "logits/rejected": -0.16464056074619293, + "logps/chosen": -29.03925323486328, + "logps/rejected": -61.93821716308594, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7494891881942749, + "rewards/margins": 8.04469108581543, + "rewards/rejected": -8.794179916381836, + "step": 269 + }, + { + "epoch": 4.576271186440678, + "grad_norm": 3.2103132884907355, + "learning_rate": 2.3149340263088927e-07, + "logits/chosen": -0.4069588780403137, + "logits/rejected": -0.39735129475593567, + "logps/chosen": -25.142169952392578, + "logps/rejected": -54.4061279296875, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7782204747200012, + "rewards/margins": 7.401907920837402, + "rewards/rejected": -8.18012809753418, + "step": 270 + }, + { + "epoch": 4.593220338983051, + "grad_norm": 3.4106686634255814, + "learning_rate": 2.296466557088805e-07, + "logits/chosen": -0.4093379080295563, + "logits/rejected": -0.3818233013153076, + "logps/chosen": -24.618453979492188, + "logps/rejected": -53.87172317504883, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7368046045303345, + "rewards/margins": 8.036300659179688, + "rewards/rejected": -8.773106575012207, + "step": 271 + }, + { + "epoch": 4.610169491525424, + "grad_norm": 2.227751631839648, + "learning_rate": 2.278010261692663e-07, + "logits/chosen": -0.3430100679397583, + "logits/rejected": -0.32270756363868713, + "logps/chosen": -27.739946365356445, + "logps/rejected": -50.70249938964844, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3157005310058594, + "rewards/margins": 6.844423294067383, + "rewards/rejected": -8.160122871398926, + "step": 272 + }, + { + "epoch": 4.627118644067797, + "grad_norm": 2.718467637449855, + "learning_rate": 2.2595661533563887e-07, + "logits/chosen": -0.39202579855918884, + "logits/rejected": -0.37344199419021606, + "logps/chosen": -28.954833984375, + "logps/rejected": -52.06825256347656, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1071019172668457, + "rewards/margins": 5.494106769561768, + "rewards/rejected": -6.6012091636657715, + "step": 273 + }, + { + "epoch": 4.6440677966101696, + "grad_norm": 1.8979857813927623, + "learning_rate": 2.2411352446468424e-07, + "logits/chosen": -0.2902525067329407, + "logits/rejected": -0.2769823372364044, + "logps/chosen": -21.65315055847168, + "logps/rejected": -53.80813980102539, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.367279589176178, + "rewards/margins": 7.097145080566406, + "rewards/rejected": -7.464425086975098, + "step": 274 + }, + { + "epoch": 4.661016949152542, + "grad_norm": 1.9652537606332783, + "learning_rate": 2.2227185474062374e-07, + "logits/chosen": -0.3663102984428406, + "logits/rejected": -0.3732694983482361, + "logps/chosen": -25.794607162475586, + "logps/rejected": -52.91725540161133, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9790402054786682, + "rewards/margins": 6.477062702178955, + "rewards/rejected": -7.4561028480529785, + "step": 275 + }, + { + "epoch": 4.677966101694915, + "grad_norm": 2.3633232838068854, + "learning_rate": 2.2043170726965857e-07, + "logits/chosen": -0.3861359655857086, + "logits/rejected": -0.33153507113456726, + "logps/chosen": -26.395111083984375, + "logps/rejected": -46.5584716796875, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21754157543182373, + "rewards/margins": 6.467673301696777, + "rewards/rejected": -6.685215473175049, + "step": 276 + }, + { + "epoch": 4.694915254237288, + "grad_norm": 2.1756252476018925, + "learning_rate": 2.1859318307441966e-07, + "logits/chosen": -0.3655955493450165, + "logits/rejected": -0.2858305871486664, + "logps/chosen": -31.3674373626709, + "logps/rejected": -55.38779067993164, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8195254802703857, + "rewards/margins": 7.775407314300537, + "rewards/rejected": -8.594932556152344, + "step": 277 + }, + { + "epoch": 4.711864406779661, + "grad_norm": 2.241164329559457, + "learning_rate": 2.1675638308842142e-07, + "logits/chosen": -0.32866764068603516, + "logits/rejected": -0.3286994397640228, + "logps/chosen": -23.2701358795166, + "logps/rejected": -50.79416275024414, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15657079219818115, + "rewards/margins": 7.880356788635254, + "rewards/rejected": -7.723785877227783, + "step": 278 + }, + { + "epoch": 4.728813559322034, + "grad_norm": 2.2629759157234983, + "learning_rate": 2.149214081505205e-07, + "logits/chosen": -0.36036401987075806, + "logits/rejected": -0.29053574800491333, + "logps/chosen": -28.8673095703125, + "logps/rejected": -45.81161880493164, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12538844347000122, + "rewards/margins": 7.48973274230957, + "rewards/rejected": -7.615121841430664, + "step": 279 + }, + { + "epoch": 4.745762711864407, + "grad_norm": 2.9812316443120133, + "learning_rate": 2.1308835899937972e-07, + "logits/chosen": -0.4776584506034851, + "logits/rejected": -0.4330436587333679, + "logps/chosen": -26.34911346435547, + "logps/rejected": -46.94022750854492, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6425644159317017, + "rewards/margins": 6.7085676193237305, + "rewards/rejected": -7.351131916046143, + "step": 280 + }, + { + "epoch": 4.762711864406779, + "grad_norm": 3.0770144105013757, + "learning_rate": 2.112573362679379e-07, + "logits/chosen": -0.3524860143661499, + "logits/rejected": -0.35296574234962463, + "logps/chosen": -36.912437438964844, + "logps/rejected": -64.40142059326172, + "loss": 0.0278, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4731464087963104, + "rewards/margins": 7.376462936401367, + "rewards/rejected": -7.849608421325684, + "step": 281 + }, + { + "epoch": 4.779661016949152, + "grad_norm": 3.0797571681448845, + "learning_rate": 2.09428440477885e-07, + "logits/chosen": -0.5038030743598938, + "logits/rejected": -0.3990883231163025, + "logps/chosen": -24.284114837646484, + "logps/rejected": -54.10459518432617, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30106982588768005, + "rewards/margins": 9.012360572814941, + "rewards/rejected": -9.313429832458496, + "step": 282 + }, + { + "epoch": 4.796610169491525, + "grad_norm": 1.96025124354275, + "learning_rate": 2.0760177203414366e-07, + "logits/chosen": -0.46829330921173096, + "logits/rejected": -0.42585426568984985, + "logps/chosen": -30.78460693359375, + "logps/rejected": -44.711978912353516, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8381346464157104, + "rewards/margins": 7.145539283752441, + "rewards/rejected": -7.983673572540283, + "step": 283 + }, + { + "epoch": 4.813559322033898, + "grad_norm": 2.5513885163665013, + "learning_rate": 2.0577743121935682e-07, + "logits/chosen": -0.30383074283599854, + "logits/rejected": -0.2893516719341278, + "logps/chosen": -24.012680053710938, + "logps/rejected": -55.98198318481445, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7660890221595764, + "rewards/margins": 6.679078578948975, + "rewards/rejected": -7.445167064666748, + "step": 284 + }, + { + "epoch": 4.830508474576272, + "grad_norm": 2.2427849156465443, + "learning_rate": 2.0395551818838243e-07, + "logits/chosen": -0.3513972759246826, + "logits/rejected": -0.36794793605804443, + "logps/chosen": -35.454872131347656, + "logps/rejected": -58.40122985839844, + "loss": 0.0223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9875959753990173, + "rewards/margins": 7.528386116027832, + "rewards/rejected": -8.515982627868652, + "step": 285 + }, + { + "epoch": 4.847457627118644, + "grad_norm": 2.178682023578529, + "learning_rate": 2.021361329627953e-07, + "logits/chosen": -0.348906934261322, + "logits/rejected": -0.2906019985675812, + "logps/chosen": -21.8374080657959, + "logps/rejected": -54.439029693603516, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.657446026802063, + "rewards/margins": 7.482587814331055, + "rewards/rejected": -8.140033721923828, + "step": 286 + }, + { + "epoch": 4.864406779661017, + "grad_norm": 2.548002935250282, + "learning_rate": 2.003193754253957e-07, + "logits/chosen": -0.3012135624885559, + "logits/rejected": -0.2940428555011749, + "logps/chosen": -28.2236385345459, + "logps/rejected": -47.59425354003906, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8078848123550415, + "rewards/margins": 6.637413024902344, + "rewards/rejected": -7.445297718048096, + "step": 287 + }, + { + "epoch": 4.88135593220339, + "grad_norm": 2.659959937090622, + "learning_rate": 1.9850534531472544e-07, + "logits/chosen": -0.3548402488231659, + "logits/rejected": -0.3299209475517273, + "logps/chosen": -26.74940299987793, + "logps/rejected": -50.08027648925781, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9076037406921387, + "rewards/margins": 6.909872531890869, + "rewards/rejected": -7.817476272583008, + "step": 288 + }, + { + "epoch": 4.898305084745763, + "grad_norm": 2.1597951514795297, + "learning_rate": 1.966941422195933e-07, + "logits/chosen": -0.3348950147628784, + "logits/rejected": -0.3803963363170624, + "logps/chosen": -27.04452133178711, + "logps/rejected": -57.45584487915039, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.858380913734436, + "rewards/margins": 7.073944091796875, + "rewards/rejected": -7.9323248863220215, + "step": 289 + }, + { + "epoch": 4.915254237288136, + "grad_norm": 2.8616875849096095, + "learning_rate": 1.94885865573607e-07, + "logits/chosen": -0.42029163241386414, + "logits/rejected": -0.40385907888412476, + "logps/chosen": -21.713485717773438, + "logps/rejected": -51.253334045410156, + "loss": 0.039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.67279452085495, + "rewards/margins": 7.22087287902832, + "rewards/rejected": -7.893667221069336, + "step": 290 + }, + { + "epoch": 4.932203389830509, + "grad_norm": 2.4652867757867347, + "learning_rate": 1.930806146497146e-07, + "logits/chosen": -0.3921091556549072, + "logits/rejected": -0.37878188490867615, + "logps/chosen": -24.921491622924805, + "logps/rejected": -50.15573501586914, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5007016658782959, + "rewards/margins": 6.720961093902588, + "rewards/rejected": -7.221663475036621, + "step": 291 + }, + { + "epoch": 4.9491525423728815, + "grad_norm": 2.8758915083893832, + "learning_rate": 1.912784885547541e-07, + "logits/chosen": -0.257066547870636, + "logits/rejected": -0.24492767453193665, + "logps/chosen": -28.24458885192871, + "logps/rejected": -53.056297302246094, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5785134434700012, + "rewards/margins": 5.2356109619140625, + "rewards/rejected": -5.814124584197998, + "step": 292 + }, + { + "epoch": 4.966101694915254, + "grad_norm": 2.5031202245992956, + "learning_rate": 1.8947958622401328e-07, + "logits/chosen": -0.3068751394748688, + "logits/rejected": -0.321804940700531, + "logps/chosen": -25.078857421875, + "logps/rejected": -51.03053283691406, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7383342385292053, + "rewards/margins": 6.418414115905762, + "rewards/rejected": -7.156747817993164, + "step": 293 + }, + { + "epoch": 4.983050847457627, + "grad_norm": 2.270555425985558, + "learning_rate": 1.876840064157976e-07, + "logits/chosen": -0.3506714403629303, + "logits/rejected": -0.35707730054855347, + "logps/chosen": -26.248760223388672, + "logps/rejected": -51.72494888305664, + "loss": 0.0189, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8412652611732483, + "rewards/margins": 6.9783101081848145, + "rewards/rejected": -7.819576263427734, + "step": 294 + }, + { + "epoch": 5.0, + "grad_norm": 2.0981316143274804, + "learning_rate": 1.858918477060089e-07, + "logits/chosen": -0.3620571792125702, + "logits/rejected": -0.31848618388175964, + "logps/chosen": -24.052921295166016, + "logps/rejected": -48.84006881713867, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8626826405525208, + "rewards/margins": 6.730321407318115, + "rewards/rejected": -7.5930047035217285, + "step": 295 + }, + { + "epoch": 5.016949152542373, + "grad_norm": 1.7908906466642667, + "learning_rate": 1.8410320848273313e-07, + "logits/chosen": -0.40287381410598755, + "logits/rejected": -0.383707195520401, + "logps/chosen": -21.60245704650879, + "logps/rejected": -49.50798797607422, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0992738008499146, + "rewards/margins": 6.644189834594727, + "rewards/rejected": -7.743463516235352, + "step": 296 + }, + { + "epoch": 5.033898305084746, + "grad_norm": 1.8189459242932866, + "learning_rate": 1.8231818694083938e-07, + "logits/chosen": -0.2570793330669403, + "logits/rejected": -0.19739127159118652, + "logps/chosen": -38.83268356323242, + "logps/rejected": -66.01898956298828, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5877269506454468, + "rewards/margins": 8.778990745544434, + "rewards/rejected": -10.366718292236328, + "step": 297 + }, + { + "epoch": 5.0508474576271185, + "grad_norm": 2.1580633851808253, + "learning_rate": 1.8053688107658905e-07, + "logits/chosen": -0.40840768814086914, + "logits/rejected": -0.375863254070282, + "logps/chosen": -23.65566062927246, + "logps/rejected": -42.537147521972656, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23549441993236542, + "rewards/margins": 6.186650276184082, + "rewards/rejected": -6.422145366668701, + "step": 298 + }, + { + "epoch": 5.067796610169491, + "grad_norm": 2.1748249256852206, + "learning_rate": 1.787593886822556e-07, + "logits/chosen": -0.23409932851791382, + "logits/rejected": -0.25459229946136475, + "logps/chosen": -24.52175521850586, + "logps/rejected": -57.98483657836914, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7716866135597229, + "rewards/margins": 7.725409030914307, + "rewards/rejected": -8.497096061706543, + "step": 299 + }, + { + "epoch": 5.084745762711864, + "grad_norm": 1.8537224482358896, + "learning_rate": 1.7698580734075607e-07, + "logits/chosen": -0.2868376672267914, + "logits/rejected": -0.22661691904067993, + "logps/chosen": -28.01044273376465, + "logps/rejected": -51.4971923828125, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0641169548034668, + "rewards/margins": 6.613253593444824, + "rewards/rejected": -7.677370071411133, + "step": 300 + }, + { + "epoch": 5.101694915254237, + "grad_norm": 1.2874213290715422, + "learning_rate": 1.7521623442029388e-07, + "logits/chosen": -0.24358531832695007, + "logits/rejected": -0.23622053861618042, + "logps/chosen": -23.474660873413086, + "logps/rejected": -57.48131561279297, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7229022979736328, + "rewards/margins": 7.076157569885254, + "rewards/rejected": -7.799059867858887, + "step": 301 + }, + { + "epoch": 5.11864406779661, + "grad_norm": 2.1339366626101572, + "learning_rate": 1.7345076706901326e-07, + "logits/chosen": -0.3415279984474182, + "logits/rejected": -0.3399394154548645, + "logps/chosen": -32.244102478027344, + "logps/rejected": -61.445579528808594, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.47477126121521, + "rewards/margins": 7.632542133331299, + "rewards/rejected": -9.10731315612793, + "step": 302 + }, + { + "epoch": 5.135593220338983, + "grad_norm": 1.9591512485372344, + "learning_rate": 1.7168950220966614e-07, + "logits/chosen": -0.2298712432384491, + "logits/rejected": -0.22750090062618256, + "logps/chosen": -29.4824161529541, + "logps/rejected": -53.65066146850586, + "loss": 0.0235, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0305908918380737, + "rewards/margins": 6.653254985809326, + "rewards/rejected": -7.683846473693848, + "step": 303 + }, + { + "epoch": 5.1525423728813555, + "grad_norm": 2.5880324875437477, + "learning_rate": 1.6993253653429062e-07, + "logits/chosen": -0.3975529372692108, + "logits/rejected": -0.3743340075016022, + "logps/chosen": -34.480491638183594, + "logps/rejected": -58.636436462402344, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9836530685424805, + "rewards/margins": 6.951813697814941, + "rewards/rejected": -8.935466766357422, + "step": 304 + }, + { + "epoch": 5.169491525423728, + "grad_norm": 2.067798884368433, + "learning_rate": 1.681799664989033e-07, + "logits/chosen": -0.2536097764968872, + "logits/rejected": -0.23771128058433533, + "logps/chosen": -23.92715072631836, + "logps/rejected": -39.59846878051758, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.060520462691783905, + "rewards/margins": 5.955630302429199, + "rewards/rejected": -5.8951096534729, + "step": 305 + }, + { + "epoch": 5.186440677966102, + "grad_norm": 1.848984932148484, + "learning_rate": 1.6643188831820374e-07, + "logits/chosen": -0.31347960233688354, + "logits/rejected": -0.33331871032714844, + "logps/chosen": -27.515384674072266, + "logps/rejected": -56.68106460571289, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7880456447601318, + "rewards/margins": 8.233735084533691, + "rewards/rejected": -10.021780967712402, + "step": 306 + }, + { + "epoch": 5.203389830508475, + "grad_norm": 1.73302837343722, + "learning_rate": 1.6468839796029198e-07, + "logits/chosen": -0.4207502603530884, + "logits/rejected": -0.44503217935562134, + "logps/chosen": -32.272743225097656, + "logps/rejected": -66.51268005371094, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1082534790039062, + "rewards/margins": 7.719623565673828, + "rewards/rejected": -8.827877044677734, + "step": 307 + }, + { + "epoch": 5.220338983050848, + "grad_norm": 1.8347981610660942, + "learning_rate": 1.6294959114140033e-07, + "logits/chosen": -0.48544037342071533, + "logits/rejected": -0.4978610873222351, + "logps/chosen": -29.223657608032227, + "logps/rejected": -50.626705169677734, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6578260660171509, + "rewards/margins": 6.7553253173828125, + "rewards/rejected": -7.413151264190674, + "step": 308 + }, + { + "epoch": 5.237288135593221, + "grad_norm": 1.7157007652326588, + "learning_rate": 1.6121556332063861e-07, + "logits/chosen": -0.3168514370918274, + "logits/rejected": -0.2793565094470978, + "logps/chosen": -35.01051712036133, + "logps/rejected": -47.876895904541016, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6407256722450256, + "rewards/margins": 6.404169082641602, + "rewards/rejected": -7.044894218444824, + "step": 309 + }, + { + "epoch": 5.254237288135593, + "grad_norm": 2.1874539268273816, + "learning_rate": 1.5948640969475345e-07, + "logits/chosen": -0.3500838875770569, + "logits/rejected": -0.3151024580001831, + "logps/chosen": -22.813232421875, + "logps/rejected": -44.60185241699219, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16966784000396729, + "rewards/margins": 7.1504058837890625, + "rewards/rejected": -7.32007360458374, + "step": 310 + }, + { + "epoch": 5.271186440677966, + "grad_norm": 1.5893639618489923, + "learning_rate": 1.5776222519290204e-07, + "logits/chosen": -0.5237964987754822, + "logits/rejected": -0.5302670001983643, + "logps/chosen": -25.384904861450195, + "logps/rejected": -52.52655792236328, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9161776304244995, + "rewards/margins": 7.243150234222412, + "rewards/rejected": -8.15932846069336, + "step": 311 + }, + { + "epoch": 5.288135593220339, + "grad_norm": 1.507891171137426, + "learning_rate": 1.560431044714405e-07, + "logits/chosen": -0.388788104057312, + "logits/rejected": -0.3403037488460541, + "logps/chosen": -34.243717193603516, + "logps/rejected": -60.99458312988281, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.348089575767517, + "rewards/margins": 7.640527248382568, + "rewards/rejected": -8.988616943359375, + "step": 312 + }, + { + "epoch": 5.305084745762712, + "grad_norm": 2.10463616748223, + "learning_rate": 1.5432914190872756e-07, + "logits/chosen": -0.3692334294319153, + "logits/rejected": -0.349362313747406, + "logps/chosen": -26.829898834228516, + "logps/rejected": -47.90415954589844, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8458276987075806, + "rewards/margins": 6.770244121551514, + "rewards/rejected": -7.6160712242126465, + "step": 313 + }, + { + "epoch": 5.322033898305085, + "grad_norm": 1.926573035403958, + "learning_rate": 1.5262043159994314e-07, + "logits/chosen": -0.44576406478881836, + "logits/rejected": -0.39015570282936096, + "logps/chosen": -24.399137496948242, + "logps/rejected": -62.678646087646484, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23113268613815308, + "rewards/margins": 9.731943130493164, + "rewards/rejected": -9.963075637817383, + "step": 314 + }, + { + "epoch": 5.338983050847458, + "grad_norm": 1.7795060387621737, + "learning_rate": 1.5091706735192266e-07, + "logits/chosen": -0.3505421280860901, + "logits/rejected": -0.3113071322441101, + "logps/chosen": -22.754703521728516, + "logps/rejected": -58.468963623046875, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9263325929641724, + "rewards/margins": 7.451290130615234, + "rewards/rejected": -8.377622604370117, + "step": 315 + }, + { + "epoch": 5.3559322033898304, + "grad_norm": 2.145087625077026, + "learning_rate": 1.4921914267800699e-07, + "logits/chosen": -0.3622016906738281, + "logits/rejected": -0.3543117642402649, + "logps/chosen": -20.036022186279297, + "logps/rejected": -38.42483901977539, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31162628531455994, + "rewards/margins": 5.150010108947754, + "rewards/rejected": -5.461635589599609, + "step": 316 + }, + { + "epoch": 5.372881355932203, + "grad_norm": 1.7053004335113204, + "learning_rate": 1.4752675079290848e-07, + "logits/chosen": -0.31497931480407715, + "logits/rejected": -0.2895013391971588, + "logps/chosen": -28.917264938354492, + "logps/rejected": -43.556121826171875, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9403586983680725, + "rewards/margins": 5.686085224151611, + "rewards/rejected": -6.626444339752197, + "step": 317 + }, + { + "epoch": 5.389830508474576, + "grad_norm": 1.6608947087640378, + "learning_rate": 1.458399846075942e-07, + "logits/chosen": -0.5058786273002625, + "logits/rejected": -0.47814008593559265, + "logps/chosen": -31.875675201416016, + "logps/rejected": -60.682525634765625, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1719341278076172, + "rewards/margins": 7.2327094078063965, + "rewards/rejected": -8.404644012451172, + "step": 318 + }, + { + "epoch": 5.406779661016949, + "grad_norm": 2.159091958032586, + "learning_rate": 1.441589367241846e-07, + "logits/chosen": -0.3478569984436035, + "logits/rejected": -0.3360307216644287, + "logps/chosen": -25.11379051208496, + "logps/rejected": -47.88860321044922, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6650610566139221, + "rewards/margins": 6.3550872802734375, + "rewards/rejected": -7.020147323608398, + "step": 319 + }, + { + "epoch": 5.423728813559322, + "grad_norm": 2.576395194299276, + "learning_rate": 1.4248369943086995e-07, + "logits/chosen": -0.41911399364471436, + "logits/rejected": -0.3637450933456421, + "logps/chosen": -29.785608291625977, + "logps/rejected": -50.008182525634766, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31600964069366455, + "rewards/margins": 7.043180465698242, + "rewards/rejected": -7.359189510345459, + "step": 320 + }, + { + "epoch": 5.440677966101695, + "grad_norm": 2.1653315983567416, + "learning_rate": 1.4081436469684337e-07, + "logits/chosen": -0.32830509543418884, + "logits/rejected": -0.317745566368103, + "logps/chosen": -25.206449508666992, + "logps/rejected": -49.792205810546875, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6629531383514404, + "rewards/margins": 7.332546234130859, + "rewards/rejected": -7.995500087738037, + "step": 321 + }, + { + "epoch": 5.4576271186440675, + "grad_norm": 1.619909996410463, + "learning_rate": 1.3915102416725286e-07, + "logits/chosen": -0.43633776903152466, + "logits/rejected": -0.4285232424736023, + "logps/chosen": -21.11972427368164, + "logps/rejected": -50.980804443359375, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.572965145111084, + "rewards/margins": 6.161455154418945, + "rewards/rejected": -6.734420299530029, + "step": 322 + }, + { + "epoch": 5.47457627118644, + "grad_norm": 2.1885009314879538, + "learning_rate": 1.3749376915816885e-07, + "logits/chosen": -0.21762433648109436, + "logits/rejected": -0.196787029504776, + "logps/chosen": -35.26130676269531, + "logps/rejected": -56.66743087768555, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8203563690185547, + "rewards/margins": 7.474347114562988, + "rewards/rejected": -9.294703483581543, + "step": 323 + }, + { + "epoch": 5.491525423728813, + "grad_norm": 1.9365005337913619, + "learning_rate": 1.3584269065157172e-07, + "logits/chosen": -0.27862459421157837, + "logits/rejected": -0.2110404521226883, + "logps/chosen": -35.968971252441406, + "logps/rejected": -56.928218841552734, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3680837154388428, + "rewards/margins": 6.986885070800781, + "rewards/rejected": -8.354969024658203, + "step": 324 + }, + { + "epoch": 5.508474576271187, + "grad_norm": 1.3422635792385325, + "learning_rate": 1.341978792903568e-07, + "logits/chosen": -0.30388015508651733, + "logits/rejected": -0.2732846736907959, + "logps/chosen": -23.869976043701172, + "logps/rejected": -52.69965362548828, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31776559352874756, + "rewards/margins": 8.570083618164062, + "rewards/rejected": -8.887847900390625, + "step": 325 + }, + { + "epoch": 5.52542372881356, + "grad_norm": 2.3813246068255487, + "learning_rate": 1.3255942537335804e-07, + "logits/chosen": -0.33688196539878845, + "logits/rejected": -0.35382434725761414, + "logps/chosen": -28.49911117553711, + "logps/rejected": -51.572757720947266, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8108580112457275, + "rewards/margins": 6.868773937225342, + "rewards/rejected": -7.67963171005249, + "step": 326 + }, + { + "epoch": 5.5423728813559325, + "grad_norm": 1.4536548361254025, + "learning_rate": 1.3092741885039085e-07, + "logits/chosen": -0.2705250084400177, + "logits/rejected": -0.2894834876060486, + "logps/chosen": -27.802425384521484, + "logps/rejected": -66.8345718383789, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.008652925491333, + "rewards/margins": 8.34438419342041, + "rewards/rejected": -9.35303783416748, + "step": 327 + }, + { + "epoch": 5.559322033898305, + "grad_norm": 2.049468728531298, + "learning_rate": 1.2930194931731382e-07, + "logits/chosen": -0.36835363507270813, + "logits/rejected": -0.3584752380847931, + "logps/chosen": -20.919490814208984, + "logps/rejected": -39.850074768066406, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48556971549987793, + "rewards/margins": 6.541074752807617, + "rewards/rejected": -7.026644229888916, + "step": 328 + }, + { + "epoch": 5.576271186440678, + "grad_norm": 1.5752341980459406, + "learning_rate": 1.2768310601110993e-07, + "logits/chosen": -0.4180675148963928, + "logits/rejected": -0.4410182535648346, + "logps/chosen": -25.881986618041992, + "logps/rejected": -69.80418395996094, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.903774082660675, + "rewards/margins": 9.744110107421875, + "rewards/rejected": -10.647883415222168, + "step": 329 + }, + { + "epoch": 5.593220338983051, + "grad_norm": 1.7653806800087801, + "learning_rate": 1.260709778049877e-07, + "logits/chosen": -0.29894641041755676, + "logits/rejected": -0.300833523273468, + "logps/chosen": -26.03153419494629, + "logps/rejected": -47.6400260925293, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7818778157234192, + "rewards/margins": 6.886005878448486, + "rewards/rejected": -7.667883396148682, + "step": 330 + }, + { + "epoch": 5.610169491525424, + "grad_norm": 1.4297534909157374, + "learning_rate": 1.2446565320350182e-07, + "logits/chosen": -0.3907126188278198, + "logits/rejected": -0.37021511793136597, + "logps/chosen": -21.903635025024414, + "logps/rejected": -48.463523864746094, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.698778510093689, + "rewards/margins": 6.993403434753418, + "rewards/rejected": -7.692181587219238, + "step": 331 + }, + { + "epoch": 5.627118644067797, + "grad_norm": 1.9822675391192361, + "learning_rate": 1.2286722033769492e-07, + "logits/chosen": -0.4067448675632477, + "logits/rejected": -0.3597560524940491, + "logps/chosen": -27.199350357055664, + "logps/rejected": -53.19655227661133, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44194698333740234, + "rewards/margins": 7.5092453956604, + "rewards/rejected": -7.9511919021606445, + "step": 332 + }, + { + "epoch": 5.6440677966101696, + "grad_norm": 1.6861714590542527, + "learning_rate": 1.2127576696025826e-07, + "logits/chosen": -0.38976797461509705, + "logits/rejected": -0.3696633577346802, + "logps/chosen": -30.58667755126953, + "logps/rejected": -71.94692993164062, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8064645528793335, + "rewards/margins": 10.191534042358398, + "rewards/rejected": -10.99799919128418, + "step": 333 + }, + { + "epoch": 5.661016949152542, + "grad_norm": 2.050031332323167, + "learning_rate": 1.19691380440715e-07, + "logits/chosen": -0.3898102045059204, + "logits/rejected": -0.37484288215637207, + "logps/chosen": -28.617263793945312, + "logps/rejected": -48.9388427734375, + "loss": 0.0184, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1024081707000732, + "rewards/margins": 5.788519382476807, + "rewards/rejected": -6.890927314758301, + "step": 334 + }, + { + "epoch": 5.677966101694915, + "grad_norm": 1.4885965824230383, + "learning_rate": 1.1811414776062365e-07, + "logits/chosen": -0.21893128752708435, + "logits/rejected": -0.17550604045391083, + "logps/chosen": -31.78797149658203, + "logps/rejected": -53.7734260559082, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8101913928985596, + "rewards/margins": 7.902094841003418, + "rewards/rejected": -8.712285995483398, + "step": 335 + }, + { + "epoch": 5.694915254237288, + "grad_norm": 1.8676903528380577, + "learning_rate": 1.1654415550880242e-07, + "logits/chosen": -0.4299631118774414, + "logits/rejected": -0.4651949405670166, + "logps/chosen": -24.422216415405273, + "logps/rejected": -49.618309020996094, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24530749022960663, + "rewards/margins": 8.013923645019531, + "rewards/rejected": -8.259231567382812, + "step": 336 + }, + { + "epoch": 5.711864406779661, + "grad_norm": 2.0067876768226243, + "learning_rate": 1.1498148987657549e-07, + "logits/chosen": -0.290162056684494, + "logits/rejected": -0.2921581566333771, + "logps/chosen": -29.040874481201172, + "logps/rejected": -60.61643981933594, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3281574249267578, + "rewards/margins": 8.345926284790039, + "rewards/rejected": -9.674084663391113, + "step": 337 + }, + { + "epoch": 5.728813559322034, + "grad_norm": 1.6926593631605538, + "learning_rate": 1.1342623665304207e-07, + "logits/chosen": -0.39946579933166504, + "logits/rejected": -0.3756706416606903, + "logps/chosen": -26.8501033782959, + "logps/rejected": -57.20337677001953, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4086235761642456, + "rewards/margins": 7.640737056732178, + "rewards/rejected": -9.049360275268555, + "step": 338 + }, + { + "epoch": 5.745762711864407, + "grad_norm": 1.933017411699114, + "learning_rate": 1.1187848122036562e-07, + "logits/chosen": -0.38379529118537903, + "logits/rejected": -0.35069793462753296, + "logps/chosen": -27.219024658203125, + "logps/rejected": -45.80855941772461, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0740251541137695, + "rewards/margins": 6.810902118682861, + "rewards/rejected": -7.884926795959473, + "step": 339 + }, + { + "epoch": 5.762711864406779, + "grad_norm": 2.68284847566437, + "learning_rate": 1.1033830854908691e-07, + "logits/chosen": -0.463611364364624, + "logits/rejected": -0.46968621015548706, + "logps/chosen": -23.01727294921875, + "logps/rejected": -51.12052917480469, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40349888801574707, + "rewards/margins": 7.609687328338623, + "rewards/rejected": -8.013185501098633, + "step": 340 + }, + { + "epoch": 5.779661016949152, + "grad_norm": 1.830575531381985, + "learning_rate": 1.0880580319345919e-07, + "logits/chosen": -0.4400818645954132, + "logits/rejected": -0.36093467473983765, + "logps/chosen": -29.340173721313477, + "logps/rejected": -51.93349075317383, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12024500221014023, + "rewards/margins": 8.31672191619873, + "rewards/rejected": -8.436967849731445, + "step": 341 + }, + { + "epoch": 5.796610169491525, + "grad_norm": 2.1072568448760323, + "learning_rate": 1.0728104928680623e-07, + "logits/chosen": -0.4102250039577484, + "logits/rejected": -0.40293923020362854, + "logps/chosen": -24.35076904296875, + "logps/rejected": -49.26411437988281, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5383625030517578, + "rewards/margins": 6.970728874206543, + "rewards/rejected": -8.5090913772583, + "step": 342 + }, + { + "epoch": 5.813559322033898, + "grad_norm": 1.5832557378178098, + "learning_rate": 1.0576413053690326e-07, + "logits/chosen": -0.3550926446914673, + "logits/rejected": -0.33369180560112, + "logps/chosen": -23.023447036743164, + "logps/rejected": -51.34148406982422, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41946250200271606, + "rewards/margins": 8.600196838378906, + "rewards/rejected": -9.019659042358398, + "step": 343 + }, + { + "epoch": 5.830508474576272, + "grad_norm": 1.2740745062843633, + "learning_rate": 1.0425513022138202e-07, + "logits/chosen": -0.44471290707588196, + "logits/rejected": -0.45575839281082153, + "logps/chosen": -30.049896240234375, + "logps/rejected": -62.054786682128906, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.082101583480835, + "rewards/margins": 8.661722183227539, + "rewards/rejected": -9.743824005126953, + "step": 344 + }, + { + "epoch": 5.847457627118644, + "grad_norm": 1.8801561548243628, + "learning_rate": 1.0275413118315798e-07, + "logits/chosen": -0.4198082387447357, + "logits/rejected": -0.4343384802341461, + "logps/chosen": -26.124162673950195, + "logps/rejected": -49.45093536376953, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23411789536476135, + "rewards/margins": 7.040526390075684, + "rewards/rejected": -7.27464485168457, + "step": 345 + }, + { + "epoch": 5.864406779661017, + "grad_norm": 2.125582956895436, + "learning_rate": 1.0126121582588315e-07, + "logits/chosen": -0.42699775099754333, + "logits/rejected": -0.332169771194458, + "logps/chosen": -42.01930618286133, + "logps/rejected": -51.770362854003906, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4565836191177368, + "rewards/margins": 6.331849098205566, + "rewards/rejected": -7.788432598114014, + "step": 346 + }, + { + "epoch": 5.88135593220339, + "grad_norm": 1.7338911737005034, + "learning_rate": 9.977646610942201e-08, + "logits/chosen": -0.46750593185424805, + "logits/rejected": -0.4310920536518097, + "logps/chosen": -34.33905792236328, + "logps/rejected": -55.792449951171875, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1425302028656006, + "rewards/margins": 6.989797592163086, + "rewards/rejected": -8.13232707977295, + "step": 347 + }, + { + "epoch": 5.898305084745763, + "grad_norm": 2.124936709443199, + "learning_rate": 9.829996354535172e-08, + "logits/chosen": -0.19952382147312164, + "logits/rejected": -0.21836933493614197, + "logps/chosen": -19.52752685546875, + "logps/rejected": -51.54100799560547, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42098936438560486, + "rewards/margins": 7.090313911437988, + "rewards/rejected": -7.511303424835205, + "step": 348 + }, + { + "epoch": 5.915254237288136, + "grad_norm": 1.936391711504055, + "learning_rate": 9.68317891924871e-08, + "logits/chosen": -0.37741342186927795, + "logits/rejected": -0.3150150179862976, + "logps/chosen": -33.96430587768555, + "logps/rejected": -58.169151306152344, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8618389368057251, + "rewards/margins": 7.058804988861084, + "rewards/rejected": -7.9206438064575195, + "step": 349 + }, + { + "epoch": 5.932203389830509, + "grad_norm": 1.8102577538281432, + "learning_rate": 9.53720236524313e-08, + "logits/chosen": -0.3949698805809021, + "logits/rejected": -0.29366767406463623, + "logps/chosen": -37.89696502685547, + "logps/rejected": -47.563255310058594, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.420377641916275, + "rewards/margins": 6.494236946105957, + "rewards/rejected": -6.914615631103516, + "step": 350 + }, + { + "epoch": 5.9491525423728815, + "grad_norm": 1.5979190927318805, + "learning_rate": 9.392074706515002e-08, + "logits/chosen": -0.2729552388191223, + "logits/rejected": -0.28463542461395264, + "logps/chosen": -28.566404342651367, + "logps/rejected": -56.13119125366211, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8524938821792603, + "rewards/margins": 7.293689727783203, + "rewards/rejected": -8.146183967590332, + "step": 351 + }, + { + "epoch": 5.966101694915254, + "grad_norm": 2.3839776379838384, + "learning_rate": 9.247803910457225e-08, + "logits/chosen": -0.3895640969276428, + "logits/rejected": -0.3975210189819336, + "logps/chosen": -26.325103759765625, + "logps/rejected": -53.08610534667969, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3783128261566162, + "rewards/margins": 7.456340789794922, + "rewards/rejected": -8.834653854370117, + "step": 352 + }, + { + "epoch": 5.983050847457627, + "grad_norm": 1.2920698746602828, + "learning_rate": 9.104397897421623e-08, + "logits/chosen": -0.32404041290283203, + "logits/rejected": -0.27217093110084534, + "logps/chosen": -25.754863739013672, + "logps/rejected": -63.25767135620117, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3308719396591187, + "rewards/margins": 8.711791038513184, + "rewards/rejected": -10.04266357421875, + "step": 353 + }, + { + "epoch": 6.0, + "grad_norm": 1.282532794116693, + "learning_rate": 8.961864540284119e-08, + "logits/chosen": -0.49952465295791626, + "logits/rejected": -0.5269231796264648, + "logps/chosen": -22.62492561340332, + "logps/rejected": -51.047019958496094, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5334063172340393, + "rewards/margins": 8.3474702835083, + "rewards/rejected": -8.880876541137695, + "step": 354 + }, + { + "epoch": 6.016949152542373, + "grad_norm": 1.7112403823625462, + "learning_rate": 8.82021166401253e-08, + "logits/chosen": -0.3233092129230499, + "logits/rejected": -0.2621540427207947, + "logps/chosen": -44.06984329223633, + "logps/rejected": -58.04002380371094, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.143134355545044, + "rewards/margins": 6.095809459686279, + "rewards/rejected": -8.238943099975586, + "step": 355 + }, + { + "epoch": 6.033898305084746, + "grad_norm": 2.3344092696583947, + "learning_rate": 8.679447045236962e-08, + "logits/chosen": -0.3654767572879791, + "logits/rejected": -0.3644530773162842, + "logps/chosen": -20.51791763305664, + "logps/rejected": -45.79965591430664, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5996273756027222, + "rewards/margins": 7.613353729248047, + "rewards/rejected": -8.212981224060059, + "step": 356 + }, + { + "epoch": 6.0508474576271185, + "grad_norm": 2.317485347300773, + "learning_rate": 8.539578411822901e-08, + "logits/chosen": -0.3773816227912903, + "logits/rejected": -0.3972689211368561, + "logps/chosen": -30.56630516052246, + "logps/rejected": -50.376220703125, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9383816719055176, + "rewards/margins": 6.174047946929932, + "rewards/rejected": -7.112429618835449, + "step": 357 + }, + { + "epoch": 6.067796610169491, + "grad_norm": 1.4474880701357473, + "learning_rate": 8.400613442446947e-08, + "logits/chosen": -0.5112478733062744, + "logits/rejected": -0.4722178876399994, + "logps/chosen": -27.13446807861328, + "logps/rejected": -52.48398208618164, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3206241130828857, + "rewards/margins": 7.3138251304626465, + "rewards/rejected": -8.63444995880127, + "step": 358 + }, + { + "epoch": 6.084745762711864, + "grad_norm": 1.2482543871299383, + "learning_rate": 8.262559766175253e-08, + "logits/chosen": -0.37037163972854614, + "logits/rejected": -0.40053224563598633, + "logps/chosen": -26.4130802154541, + "logps/rejected": -58.55597686767578, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.92750483751297, + "rewards/margins": 8.936185836791992, + "rewards/rejected": -9.863691329956055, + "step": 359 + }, + { + "epoch": 6.101694915254237, + "grad_norm": 1.3326694254649336, + "learning_rate": 8.125424962044741e-08, + "logits/chosen": -0.4082280993461609, + "logits/rejected": -0.3952917158603668, + "logps/chosen": -31.45101547241211, + "logps/rejected": -57.48428726196289, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.774019479751587, + "rewards/margins": 7.355801105499268, + "rewards/rejected": -9.129819869995117, + "step": 360 + }, + { + "epoch": 6.11864406779661, + "grad_norm": 1.7964966769625663, + "learning_rate": 7.989216558646941e-08, + "logits/chosen": -0.37784266471862793, + "logits/rejected": -0.3356171250343323, + "logps/chosen": -33.39372634887695, + "logps/rejected": -53.96965789794922, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1166173219680786, + "rewards/margins": 7.514166831970215, + "rewards/rejected": -8.630784034729004, + "step": 361 + }, + { + "epoch": 6.135593220338983, + "grad_norm": 1.4381980147201805, + "learning_rate": 7.853942033714736e-08, + "logits/chosen": -0.33557164669036865, + "logits/rejected": -0.3193064332008362, + "logps/chosen": -37.53783416748047, + "logps/rejected": -60.92087173461914, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.018031120300293, + "rewards/margins": 7.809800148010254, + "rewards/rejected": -8.827831268310547, + "step": 362 + }, + { + "epoch": 6.1525423728813555, + "grad_norm": 1.7983591719289653, + "learning_rate": 7.719608813711847e-08, + "logits/chosen": -0.39093196392059326, + "logits/rejected": -0.37135645747184753, + "logps/chosen": -25.3659725189209, + "logps/rejected": -43.70526123046875, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.914777398109436, + "rewards/margins": 6.569075584411621, + "rewards/rejected": -7.483852863311768, + "step": 363 + }, + { + "epoch": 6.169491525423728, + "grad_norm": 2.0553124333196475, + "learning_rate": 7.586224273425081e-08, + "logits/chosen": -0.43935853242874146, + "logits/rejected": -0.39239639043807983, + "logps/chosen": -31.48431396484375, + "logps/rejected": -54.33441162109375, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8172799944877625, + "rewards/margins": 7.351185321807861, + "rewards/rejected": -8.168466567993164, + "step": 364 + }, + { + "epoch": 6.186440677966102, + "grad_norm": 1.5277879793345497, + "learning_rate": 7.45379573555947e-08, + "logits/chosen": -0.35419967770576477, + "logits/rejected": -0.29741495847702026, + "logps/chosen": -32.288909912109375, + "logps/rejected": -48.68520736694336, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7244929671287537, + "rewards/margins": 6.385775089263916, + "rewards/rejected": -7.110268592834473, + "step": 365 + }, + { + "epoch": 6.203389830508475, + "grad_norm": 1.2587042351574373, + "learning_rate": 7.322330470336313e-08, + "logits/chosen": -0.3986334502696991, + "logits/rejected": -0.41473451256752014, + "logps/chosen": -28.10173225402832, + "logps/rejected": -60.30015182495117, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9395461082458496, + "rewards/margins": 8.40530014038086, + "rewards/rejected": -9.34484577178955, + "step": 366 + }, + { + "epoch": 6.220338983050848, + "grad_norm": 1.9104878326976753, + "learning_rate": 7.19183569509398e-08, + "logits/chosen": -0.42515650391578674, + "logits/rejected": -0.4083452820777893, + "logps/chosen": -25.77292251586914, + "logps/rejected": -43.442447662353516, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8836389183998108, + "rewards/margins": 6.8581342697143555, + "rewards/rejected": -7.74177360534668, + "step": 367 + }, + { + "epoch": 6.237288135593221, + "grad_norm": 1.6423168042541676, + "learning_rate": 7.062318573891715e-08, + "logits/chosen": -0.27111876010894775, + "logits/rejected": -0.22774375975131989, + "logps/chosen": -25.00893783569336, + "logps/rejected": -51.19509506225586, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6221886873245239, + "rewards/margins": 7.797216892242432, + "rewards/rejected": -8.419405937194824, + "step": 368 + }, + { + "epoch": 6.254237288135593, + "grad_norm": 1.9302693331546565, + "learning_rate": 6.933786217116364e-08, + "logits/chosen": -0.3160867691040039, + "logits/rejected": -0.2463129460811615, + "logps/chosen": -24.83222198486328, + "logps/rejected": -45.60934066772461, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16461661458015442, + "rewards/margins": 6.204172134399414, + "rewards/rejected": -6.368788242340088, + "step": 369 + }, + { + "epoch": 6.271186440677966, + "grad_norm": 1.5842681388350077, + "learning_rate": 6.806245681091944e-08, + "logits/chosen": -0.3545396029949188, + "logits/rejected": -0.25907883048057556, + "logps/chosen": -28.055213928222656, + "logps/rejected": -56.62708282470703, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.246912956237793, + "rewards/margins": 8.316703796386719, + "rewards/rejected": -9.563617706298828, + "step": 370 + }, + { + "epoch": 6.288135593220339, + "grad_norm": 1.8111378961230746, + "learning_rate": 6.679703967692321e-08, + "logits/chosen": -0.21147161722183228, + "logits/rejected": -0.20179268717765808, + "logps/chosen": -23.622386932373047, + "logps/rejected": -56.40178680419922, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9630917310714722, + "rewards/margins": 7.12011194229126, + "rewards/rejected": -8.08320426940918, + "step": 371 + }, + { + "epoch": 6.305084745762712, + "grad_norm": 1.1986230098938282, + "learning_rate": 6.554168023956816e-08, + "logits/chosen": -0.2591314911842346, + "logits/rejected": -0.27817869186401367, + "logps/chosen": -27.67983627319336, + "logps/rejected": -50.832889556884766, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1853872537612915, + "rewards/margins": 6.63606071472168, + "rewards/rejected": -7.82144832611084, + "step": 372 + }, + { + "epoch": 6.322033898305085, + "grad_norm": 1.7049379706268657, + "learning_rate": 6.429644741708779e-08, + "logits/chosen": -0.4500387907028198, + "logits/rejected": -0.36974358558654785, + "logps/chosen": -23.562002182006836, + "logps/rejected": -42.636146545410156, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6414427757263184, + "rewards/margins": 6.672116756439209, + "rewards/rejected": -7.313559532165527, + "step": 373 + }, + { + "epoch": 6.338983050847458, + "grad_norm": 1.359337092287595, + "learning_rate": 6.306140957177225e-08, + "logits/chosen": -0.3460231125354767, + "logits/rejected": -0.3752771317958832, + "logps/chosen": -25.647207260131836, + "logps/rejected": -51.0827522277832, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7650890350341797, + "rewards/margins": 6.918083667755127, + "rewards/rejected": -7.683172702789307, + "step": 374 + }, + { + "epoch": 6.3559322033898304, + "grad_norm": 1.534380633126308, + "learning_rate": 6.183663450621607e-08, + "logits/chosen": -0.34895992279052734, + "logits/rejected": -0.3292369842529297, + "logps/chosen": -34.13381576538086, + "logps/rejected": -55.714393615722656, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8866183757781982, + "rewards/margins": 7.842109203338623, + "rewards/rejected": -8.728727340698242, + "step": 375 + }, + { + "epoch": 6.372881355932203, + "grad_norm": 1.5779257981471628, + "learning_rate": 6.062218945959496e-08, + "logits/chosen": -0.4587939977645874, + "logits/rejected": -0.4462360143661499, + "logps/chosen": -33.18772888183594, + "logps/rejected": -50.75407028198242, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7835342288017273, + "rewards/margins": 7.535502910614014, + "rewards/rejected": -8.319037437438965, + "step": 376 + }, + { + "epoch": 6.389830508474576, + "grad_norm": 1.4524083267309678, + "learning_rate": 5.9418141103975026e-08, + "logits/chosen": -0.3016980290412903, + "logits/rejected": -0.3382137417793274, + "logps/chosen": -28.424884796142578, + "logps/rejected": -67.0705337524414, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.947837233543396, + "rewards/margins": 10.247238159179688, + "rewards/rejected": -11.195074081420898, + "step": 377 + }, + { + "epoch": 6.406779661016949, + "grad_norm": 1.5502969744773236, + "learning_rate": 5.822455554065217e-08, + "logits/chosen": -0.22019946575164795, + "logits/rejected": -0.18748457729816437, + "logps/chosen": -21.47047996520996, + "logps/rejected": -43.39370346069336, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4421558678150177, + "rewards/margins": 6.153988361358643, + "rewards/rejected": -6.59614372253418, + "step": 378 + }, + { + "epoch": 6.423728813559322, + "grad_norm": 1.4169584989322257, + "learning_rate": 5.704149829652341e-08, + "logits/chosen": -0.45599544048309326, + "logits/rejected": -0.38583889603614807, + "logps/chosen": -30.23516082763672, + "logps/rejected": -60.14368438720703, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9913487434387207, + "rewards/margins": 8.080925941467285, + "rewards/rejected": -9.072275161743164, + "step": 379 + }, + { + "epoch": 6.440677966101695, + "grad_norm": 1.9051661634723038, + "learning_rate": 5.586903432048942e-08, + "logits/chosen": -0.49622446298599243, + "logits/rejected": -0.4194895625114441, + "logps/chosen": -29.30303192138672, + "logps/rejected": -59.797874450683594, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.547768473625183, + "rewards/margins": 8.939881324768066, + "rewards/rejected": -10.487649917602539, + "step": 380 + }, + { + "epoch": 6.4576271186440675, + "grad_norm": 1.6431936228958974, + "learning_rate": 5.470722797988883e-08, + "logits/chosen": -0.2737140953540802, + "logits/rejected": -0.27121812105178833, + "logps/chosen": -25.59175682067871, + "logps/rejected": -46.32392120361328, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4059231281280518, + "rewards/margins": 6.439781188964844, + "rewards/rejected": -7.845704078674316, + "step": 381 + }, + { + "epoch": 6.47457627118644, + "grad_norm": 1.3079903093761553, + "learning_rate": 5.355614305696468e-08, + "logits/chosen": -0.3695864677429199, + "logits/rejected": -0.30790218710899353, + "logps/chosen": -27.365276336669922, + "logps/rejected": -51.38225555419922, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4411066770553589, + "rewards/margins": 7.909936904907227, + "rewards/rejected": -8.351043701171875, + "step": 382 + }, + { + "epoch": 6.491525423728813, + "grad_norm": 1.5133700851337937, + "learning_rate": 5.241584274536259e-08, + "logits/chosen": -0.298088937997818, + "logits/rejected": -0.267940878868103, + "logps/chosen": -28.913124084472656, + "logps/rejected": -59.5203971862793, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8007093667984009, + "rewards/margins": 8.938852310180664, + "rewards/rejected": -9.739561080932617, + "step": 383 + }, + { + "epoch": 6.508474576271187, + "grad_norm": 1.405914234766755, + "learning_rate": 5.1286389646661654e-08, + "logits/chosen": -0.2601643204689026, + "logits/rejected": -0.2083461433649063, + "logps/chosen": -28.819889068603516, + "logps/rejected": -52.013458251953125, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2809916734695435, + "rewards/margins": 7.361697196960449, + "rewards/rejected": -8.642688751220703, + "step": 384 + }, + { + "epoch": 6.52542372881356, + "grad_norm": 1.4580353927814265, + "learning_rate": 5.0167845766937806e-08, + "logits/chosen": -0.4725567400455475, + "logits/rejected": -0.4601272642612457, + "logps/chosen": -29.988752365112305, + "logps/rejected": -49.545658111572266, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3534915447235107, + "rewards/margins": 6.975593090057373, + "rewards/rejected": -8.329084396362305, + "step": 385 + }, + { + "epoch": 6.5423728813559325, + "grad_norm": 1.6115119401528328, + "learning_rate": 4.906027251335917e-08, + "logits/chosen": -0.3040216565132141, + "logits/rejected": -0.2666282653808594, + "logps/chosen": -23.24091148376465, + "logps/rejected": -58.34555435180664, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.226002812385559, + "rewards/margins": 8.7108154296875, + "rewards/rejected": -9.936819076538086, + "step": 386 + }, + { + "epoch": 6.559322033898305, + "grad_norm": 1.9607341782533316, + "learning_rate": 4.7963730690815467e-08, + "logits/chosen": -0.3676231801509857, + "logits/rejected": -0.3557916581630707, + "logps/chosen": -15.970260620117188, + "logps/rejected": -43.20943069458008, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12866909801959991, + "rewards/margins": 7.353845119476318, + "rewards/rejected": -7.2251763343811035, + "step": 387 + }, + { + "epoch": 6.576271186440678, + "grad_norm": 2.247467950803516, + "learning_rate": 4.687828049857967e-08, + "logits/chosen": -0.40337732434272766, + "logits/rejected": -0.3796375095844269, + "logps/chosen": -29.360713958740234, + "logps/rejected": -45.309486389160156, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8816546201705933, + "rewards/margins": 6.715970516204834, + "rewards/rejected": -7.597624778747559, + "step": 388 + }, + { + "epoch": 6.593220338983051, + "grad_norm": 1.5736622891761218, + "learning_rate": 4.580398152700304e-08, + "logits/chosen": -0.41009533405303955, + "logits/rejected": -0.445589154958725, + "logps/chosen": -24.89777374267578, + "logps/rejected": -54.37321853637695, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.717568576335907, + "rewards/margins": 8.025527000427246, + "rewards/rejected": -8.743096351623535, + "step": 389 + }, + { + "epoch": 6.610169491525424, + "grad_norm": 1.8210132983277654, + "learning_rate": 4.47408927542435e-08, + "logits/chosen": -0.23610210418701172, + "logits/rejected": -0.2229936569929123, + "logps/chosen": -23.190319061279297, + "logps/rejected": -47.796974182128906, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9015488028526306, + "rewards/margins": 6.892556667327881, + "rewards/rejected": -7.794105052947998, + "step": 390 + }, + { + "epoch": 6.627118644067797, + "grad_norm": 1.8113881906369103, + "learning_rate": 4.368907254302837e-08, + "logits/chosen": -0.4094342589378357, + "logits/rejected": -0.4127545654773712, + "logps/chosen": -18.310993194580078, + "logps/rejected": -50.38030242919922, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3419753909111023, + "rewards/margins": 8.148374557495117, + "rewards/rejected": -8.490348815917969, + "step": 391 + }, + { + "epoch": 6.6440677966101696, + "grad_norm": 1.2223010613658718, + "learning_rate": 4.264857863744956e-08, + "logits/chosen": -0.3197595477104187, + "logits/rejected": -0.2664377689361572, + "logps/chosen": -22.68889617919922, + "logps/rejected": -50.56337356567383, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.839199960231781, + "rewards/margins": 8.539407730102539, + "rewards/rejected": -9.378606796264648, + "step": 392 + }, + { + "epoch": 6.661016949152542, + "grad_norm": 1.712303748336679, + "learning_rate": 4.161946815979403e-08, + "logits/chosen": -0.34751880168914795, + "logits/rejected": -0.32943466305732727, + "logps/chosen": -31.06998062133789, + "logps/rejected": -54.03396987915039, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07644249498844147, + "rewards/margins": 7.472164154052734, + "rewards/rejected": -7.5486063957214355, + "step": 393 + }, + { + "epoch": 6.677966101694915, + "grad_norm": 1.5445735625557495, + "learning_rate": 4.0601797607407505e-08, + "logits/chosen": -0.43604975938796997, + "logits/rejected": -0.44707322120666504, + "logps/chosen": -24.34992218017578, + "logps/rejected": -48.38789749145508, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0394837856292725, + "rewards/margins": 6.550841331481934, + "rewards/rejected": -7.590324878692627, + "step": 394 + }, + { + "epoch": 6.694915254237288, + "grad_norm": 1.3515811447555408, + "learning_rate": 3.9595622849593e-08, + "logits/chosen": -0.48450133204460144, + "logits/rejected": -0.425273060798645, + "logps/chosen": -24.3756046295166, + "logps/rejected": -54.65086364746094, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8011985421180725, + "rewards/margins": 8.252460479736328, + "rewards/rejected": -9.053659439086914, + "step": 395 + }, + { + "epoch": 6.711864406779661, + "grad_norm": 1.82157364531828, + "learning_rate": 3.8600999124543455e-08, + "logits/chosen": -0.43978190422058105, + "logits/rejected": -0.3958742618560791, + "logps/chosen": -23.50148582458496, + "logps/rejected": -49.621158599853516, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3347681760787964, + "rewards/margins": 7.130214214324951, + "rewards/rejected": -7.464982032775879, + "step": 396 + }, + { + "epoch": 6.728813559322034, + "grad_norm": 1.376347630528171, + "learning_rate": 3.7617981036309533e-08, + "logits/chosen": -0.44994401931762695, + "logits/rejected": -0.4660834074020386, + "logps/chosen": -22.31543731689453, + "logps/rejected": -48.74754333496094, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6207740306854248, + "rewards/margins": 7.773540019989014, + "rewards/rejected": -8.39431381225586, + "step": 397 + }, + { + "epoch": 6.745762711864407, + "grad_norm": 1.6630761397695306, + "learning_rate": 3.664662255180134e-08, + "logits/chosen": -0.2478867769241333, + "logits/rejected": -0.2218003273010254, + "logps/chosen": -27.264450073242188, + "logps/rejected": -48.04404830932617, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0151184797286987, + "rewards/margins": 5.991296768188477, + "rewards/rejected": -7.006415367126465, + "step": 398 + }, + { + "epoch": 6.762711864406779, + "grad_norm": 1.7823746972379073, + "learning_rate": 3.5686976997826245e-08, + "logits/chosen": -0.4420131742954254, + "logits/rejected": -0.4338444769382477, + "logps/chosen": -37.46350860595703, + "logps/rejected": -59.96247100830078, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.341228723526001, + "rewards/margins": 7.825350761413574, + "rewards/rejected": -9.166579246520996, + "step": 399 + }, + { + "epoch": 6.779661016949152, + "grad_norm": 1.8399624391407163, + "learning_rate": 3.473909705816111e-08, + "logits/chosen": -0.31618526577949524, + "logits/rejected": -0.2617036700248718, + "logps/chosen": -35.39426803588867, + "logps/rejected": -58.3309326171875, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.725992202758789, + "rewards/margins": 8.16711711883545, + "rewards/rejected": -9.893108367919922, + "step": 400 + }, + { + "epoch": 6.796610169491525, + "grad_norm": 1.3821869838967202, + "learning_rate": 3.3803034770659824e-08, + "logits/chosen": -0.43311774730682373, + "logits/rejected": -0.400162935256958, + "logps/chosen": -36.61433029174805, + "logps/rejected": -77.34138488769531, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.166048526763916, + "rewards/margins": 9.511601448059082, + "rewards/rejected": -10.677648544311523, + "step": 401 + }, + { + "epoch": 6.813559322033898, + "grad_norm": 1.5978002425036417, + "learning_rate": 3.287884152439646e-08, + "logits/chosen": -0.2973329722881317, + "logits/rejected": -0.27377772331237793, + "logps/chosen": -30.09129524230957, + "logps/rejected": -53.16349792480469, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7582967281341553, + "rewards/margins": 7.514451503753662, + "rewards/rejected": -8.272747993469238, + "step": 402 + }, + { + "epoch": 6.830508474576272, + "grad_norm": 1.8281621888445494, + "learning_rate": 3.19665680568445e-08, + "logits/chosen": -0.4268870949745178, + "logits/rejected": -0.37249866127967834, + "logps/chosen": -32.59174346923828, + "logps/rejected": -46.462005615234375, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4710607528686523, + "rewards/margins": 5.770994186401367, + "rewards/rejected": -7.242054462432861, + "step": 403 + }, + { + "epoch": 6.847457627118644, + "grad_norm": 0.9001061616975613, + "learning_rate": 3.106626445109081e-08, + "logits/chosen": -0.37813207507133484, + "logits/rejected": -0.3872162401676178, + "logps/chosen": -30.775136947631836, + "logps/rejected": -61.652565002441406, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1428020000457764, + "rewards/margins": 7.899393081665039, + "rewards/rejected": -9.042195320129395, + "step": 404 + }, + { + "epoch": 6.864406779661017, + "grad_norm": 1.597346490336906, + "learning_rate": 3.017798013308645e-08, + "logits/chosen": -0.3538016080856323, + "logits/rejected": -0.341571182012558, + "logps/chosen": -31.822330474853516, + "logps/rejected": -50.422752380371094, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8719119429588318, + "rewards/margins": 7.065603256225586, + "rewards/rejected": -7.937515735626221, + "step": 405 + }, + { + "epoch": 6.88135593220339, + "grad_norm": 1.810856752512637, + "learning_rate": 2.9301763868933153e-08, + "logits/chosen": -0.4209059178829193, + "logits/rejected": -0.373024046421051, + "logps/chosen": -22.65794563293457, + "logps/rejected": -46.467655181884766, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.554482102394104, + "rewards/margins": 7.356679439544678, + "rewards/rejected": -7.911161422729492, + "step": 406 + }, + { + "epoch": 6.898305084745763, + "grad_norm": 1.573195940423645, + "learning_rate": 2.843766376220616e-08, + "logits/chosen": -0.48762577772140503, + "logits/rejected": -0.5053017735481262, + "logps/chosen": -26.841205596923828, + "logps/rejected": -52.88998794555664, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3870327472686768, + "rewards/margins": 7.6463212966918945, + "rewards/rejected": -9.033354759216309, + "step": 407 + }, + { + "epoch": 6.915254237288136, + "grad_norm": 2.0992362165416494, + "learning_rate": 2.7585727251313195e-08, + "logits/chosen": -0.39123690128326416, + "logits/rejected": -0.33466434478759766, + "logps/chosen": -41.52196502685547, + "logps/rejected": -67.35308837890625, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1684083938598633, + "rewards/margins": 7.4681878089904785, + "rewards/rejected": -10.636595726013184, + "step": 408 + }, + { + "epoch": 6.932203389830509, + "grad_norm": 1.4571552338914753, + "learning_rate": 2.6746001106890377e-08, + "logits/chosen": -0.4723522663116455, + "logits/rejected": -0.4505174458026886, + "logps/chosen": -29.244626998901367, + "logps/rejected": -52.072750091552734, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2372843027114868, + "rewards/margins": 6.951776027679443, + "rewards/rejected": -8.18906021118164, + "step": 409 + }, + { + "epoch": 6.9491525423728815, + "grad_norm": 1.4632431809951456, + "learning_rate": 2.5918531429234364e-08, + "logits/chosen": -0.34531697630882263, + "logits/rejected": -0.2954227924346924, + "logps/chosen": -26.551538467407227, + "logps/rejected": -64.3398666381836, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1470048427581787, + "rewards/margins": 9.478975296020508, + "rewards/rejected": -10.62597942352295, + "step": 410 + }, + { + "epoch": 6.966101694915254, + "grad_norm": 1.4110992084448712, + "learning_rate": 2.5103363645771536e-08, + "logits/chosen": -0.5194912552833557, + "logits/rejected": -0.48006966710090637, + "logps/chosen": -37.35237121582031, + "logps/rejected": -52.63975524902344, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2981250286102295, + "rewards/margins": 6.82828950881958, + "rewards/rejected": -8.126415252685547, + "step": 411 + }, + { + "epoch": 6.983050847457627, + "grad_norm": 1.4538836011603475, + "learning_rate": 2.4300542508564114e-08, + "logits/chosen": -0.3935295641422272, + "logits/rejected": -0.336182177066803, + "logps/chosen": -25.596094131469727, + "logps/rejected": -51.74674987792969, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5983898639678955, + "rewards/margins": 7.388174057006836, + "rewards/rejected": -7.986563682556152, + "step": 412 + }, + { + "epoch": 7.0, + "grad_norm": 1.5706199385371322, + "learning_rate": 2.3510112091853357e-08, + "logits/chosen": -0.2152971625328064, + "logits/rejected": -0.1929609775543213, + "logps/chosen": -20.163312911987305, + "logps/rejected": -52.776371002197266, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37090063095092773, + "rewards/margins": 7.480232238769531, + "rewards/rejected": -7.851133346557617, + "step": 413 + }, + { + "epoch": 7.016949152542373, + "grad_norm": 1.7457089160064294, + "learning_rate": 2.27321157896396e-08, + "logits/chosen": -0.305334210395813, + "logits/rejected": -0.29125475883483887, + "logps/chosen": -27.516210556030273, + "logps/rejected": -57.63336944580078, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9373904466629028, + "rewards/margins": 8.481929779052734, + "rewards/rejected": -9.419321060180664, + "step": 414 + }, + { + "epoch": 7.033898305084746, + "grad_norm": 1.5469113812840338, + "learning_rate": 2.1966596313300362e-08, + "logits/chosen": -0.5671955943107605, + "logits/rejected": -0.5605946779251099, + "logps/chosen": -27.200397491455078, + "logps/rejected": -47.673065185546875, + "loss": 0.0158, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.48190128803253174, + "rewards/margins": 6.851955413818359, + "rewards/rejected": -7.33385705947876, + "step": 415 + }, + { + "epoch": 7.0508474576271185, + "grad_norm": 1.2508305219444056, + "learning_rate": 2.1213595689245384e-08, + "logits/chosen": -0.3499354422092438, + "logits/rejected": -0.3194410800933838, + "logps/chosen": -24.055994033813477, + "logps/rejected": -44.12571716308594, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39127033948898315, + "rewards/margins": 6.136053085327148, + "rewards/rejected": -6.527322769165039, + "step": 416 + }, + { + "epoch": 7.067796610169491, + "grad_norm": 1.6178572230856318, + "learning_rate": 2.0473155256609363e-08, + "logits/chosen": -0.4242691993713379, + "logits/rejected": -0.4158502221107483, + "logps/chosen": -27.026355743408203, + "logps/rejected": -50.12807083129883, + "loss": 0.02, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9110782146453857, + "rewards/margins": 6.59192419052124, + "rewards/rejected": -7.503002643585205, + "step": 417 + }, + { + "epoch": 7.084745762711864, + "grad_norm": 1.4383810028427624, + "learning_rate": 1.9745315664982277e-08, + "logits/chosen": -0.452391117811203, + "logits/rejected": -0.42704349756240845, + "logps/chosen": -21.179094314575195, + "logps/rejected": -45.48261260986328, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6325095891952515, + "rewards/margins": 7.283842086791992, + "rewards/rejected": -7.916351318359375, + "step": 418 + }, + { + "epoch": 7.101694915254237, + "grad_norm": 1.4230254989220643, + "learning_rate": 1.9030116872178314e-08, + "logits/chosen": -0.4608815014362335, + "logits/rejected": -0.4329046905040741, + "logps/chosen": -25.20135498046875, + "logps/rejected": -48.06166458129883, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5603317022323608, + "rewards/margins": 6.86769962310791, + "rewards/rejected": -7.428031921386719, + "step": 419 + }, + { + "epoch": 7.11864406779661, + "grad_norm": 1.7148759769374629, + "learning_rate": 1.8327598142041656e-08, + "logits/chosen": -0.06360499560832977, + "logits/rejected": -0.02670701965689659, + "logps/chosen": -38.62311935424805, + "logps/rejected": -69.75874328613281, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8160368800163269, + "rewards/margins": 8.83346176147461, + "rewards/rejected": -9.649497032165527, + "step": 420 + }, + { + "epoch": 7.135593220338983, + "grad_norm": 1.6040424998385574, + "learning_rate": 1.7637798042291125e-08, + "logits/chosen": -0.40952420234680176, + "logits/rejected": -0.4033817648887634, + "logps/chosen": -31.083284378051758, + "logps/rejected": -49.10109329223633, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0806251764297485, + "rewards/margins": 6.7387261390686035, + "rewards/rejected": -7.819350719451904, + "step": 421 + }, + { + "epoch": 7.1525423728813555, + "grad_norm": 1.7913863319457528, + "learning_rate": 1.696075444240305e-08, + "logits/chosen": -0.3758937120437622, + "logits/rejected": -0.33229541778564453, + "logps/chosen": -22.26805877685547, + "logps/rejected": -49.145355224609375, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9506510496139526, + "rewards/margins": 6.685871601104736, + "rewards/rejected": -7.636523246765137, + "step": 422 + }, + { + "epoch": 7.169491525423728, + "grad_norm": 1.8102037954527834, + "learning_rate": 1.6296504511531834e-08, + "logits/chosen": -0.43989044427871704, + "logits/rejected": -0.44529837369918823, + "logps/chosen": -27.95105743408203, + "logps/rejected": -56.04852294921875, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9269706010818481, + "rewards/margins": 7.820449352264404, + "rewards/rejected": -8.747420310974121, + "step": 423 + }, + { + "epoch": 7.186440677966102, + "grad_norm": 1.5701815802054835, + "learning_rate": 1.5645084716469776e-08, + "logits/chosen": -0.4497320353984833, + "logits/rejected": -0.41386500000953674, + "logps/chosen": -33.39286804199219, + "logps/rejected": -56.49354934692383, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3042948246002197, + "rewards/margins": 8.047407150268555, + "rewards/rejected": -9.351702690124512, + "step": 424 + }, + { + "epoch": 7.203389830508475, + "grad_norm": 1.3727625339462444, + "learning_rate": 1.5006530819644923e-08, + "logits/chosen": -0.2934183180332184, + "logits/rejected": -0.3134685158729553, + "logps/chosen": -31.583393096923828, + "logps/rejected": -53.04698944091797, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4079774618148804, + "rewards/margins": 7.08270263671875, + "rewards/rejected": -8.490680694580078, + "step": 425 + }, + { + "epoch": 7.220338983050848, + "grad_norm": 1.2271197429060396, + "learning_rate": 1.4380877877157832e-08, + "logits/chosen": -0.3554607629776001, + "logits/rejected": -0.365239679813385, + "logps/chosen": -28.676355361938477, + "logps/rejected": -57.925479888916016, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0384249687194824, + "rewards/margins": 7.682095527648926, + "rewards/rejected": -8.720520973205566, + "step": 426 + }, + { + "epoch": 7.237288135593221, + "grad_norm": 0.9777815105599793, + "learning_rate": 1.3768160236856674e-08, + "logits/chosen": -0.3703988194465637, + "logits/rejected": -0.3820286691188812, + "logps/chosen": -29.876129150390625, + "logps/rejected": -59.59389114379883, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.152010202407837, + "rewards/margins": 7.627685070037842, + "rewards/rejected": -8.779695510864258, + "step": 427 + }, + { + "epoch": 7.254237288135593, + "grad_norm": 1.626489536598415, + "learning_rate": 1.316841153645215e-08, + "logits/chosen": -0.4109363853931427, + "logits/rejected": -0.34275108575820923, + "logps/chosen": -29.276466369628906, + "logps/rejected": -53.1593132019043, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.102203369140625, + "rewards/margins": 7.080381870269775, + "rewards/rejected": -8.182584762573242, + "step": 428 + }, + { + "epoch": 7.271186440677966, + "grad_norm": 1.6657284917403243, + "learning_rate": 1.2581664701670296e-08, + "logits/chosen": -0.44309279322624207, + "logits/rejected": -0.3362104892730713, + "logps/chosen": -29.43478012084961, + "logps/rejected": -52.261634826660156, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8790161609649658, + "rewards/margins": 7.74953031539917, + "rewards/rejected": -9.628546714782715, + "step": 429 + }, + { + "epoch": 7.288135593220339, + "grad_norm": 1.5026292891085353, + "learning_rate": 1.2007951944445121e-08, + "logits/chosen": -0.3713536262512207, + "logits/rejected": -0.3408533036708832, + "logps/chosen": -21.457298278808594, + "logps/rejected": -46.37548065185547, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2886759042739868, + "rewards/margins": 6.526200771331787, + "rewards/rejected": -6.814876556396484, + "step": 430 + }, + { + "epoch": 7.305084745762712, + "grad_norm": 1.639671277812395, + "learning_rate": 1.144730476115019e-08, + "logits/chosen": -0.4143469035625458, + "logits/rejected": -0.4372211694717407, + "logps/chosen": -27.68434715270996, + "logps/rejected": -61.32562255859375, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6124215126037598, + "rewards/margins": 7.509528636932373, + "rewards/rejected": -9.121950149536133, + "step": 431 + }, + { + "epoch": 7.322033898305085, + "grad_norm": 1.3049845757020513, + "learning_rate": 1.0899753930869394e-08, + "logits/chosen": -0.4528166949748993, + "logits/rejected": -0.4254574775695801, + "logps/chosen": -26.2335147857666, + "logps/rejected": -55.94972229003906, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2019011974334717, + "rewards/margins": 8.213329315185547, + "rewards/rejected": -9.415230751037598, + "step": 432 + }, + { + "epoch": 7.338983050847458, + "grad_norm": 1.6327474891460472, + "learning_rate": 1.036532951370736e-08, + "logits/chosen": -0.41717565059661865, + "logits/rejected": -0.3538900911808014, + "logps/chosen": -28.74737548828125, + "logps/rejected": -59.961647033691406, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015626758337020874, + "rewards/margins": 8.777100563049316, + "rewards/rejected": -8.792726516723633, + "step": 433 + }, + { + "epoch": 7.3559322033898304, + "grad_norm": 1.5187852224535574, + "learning_rate": 9.844060849138997e-09, + "logits/chosen": -0.4029984474182129, + "logits/rejected": -0.38465699553489685, + "logps/chosen": -21.391469955444336, + "logps/rejected": -47.680503845214844, + "loss": 0.0153, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.46827206015586853, + "rewards/margins": 7.35407829284668, + "rewards/rejected": -7.822350025177002, + "step": 434 + }, + { + "epoch": 7.372881355932203, + "grad_norm": 1.3171463040757392, + "learning_rate": 9.335976554398912e-09, + "logits/chosen": -0.5152924060821533, + "logits/rejected": -0.43552643060684204, + "logps/chosen": -28.43988800048828, + "logps/rejected": -45.27113342285156, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8594914674758911, + "rewards/margins": 6.36436128616333, + "rewards/rejected": -7.223852157592773, + "step": 435 + }, + { + "epoch": 7.389830508474576, + "grad_norm": 1.352843609066883, + "learning_rate": 8.841104522910342e-09, + "logits/chosen": -0.3669931888580322, + "logits/rejected": -0.34047171473503113, + "logps/chosen": -35.265140533447266, + "logps/rejected": -61.45825958251953, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.776789665222168, + "rewards/margins": 8.250631332397461, + "rewards/rejected": -10.027421951293945, + "step": 436 + }, + { + "epoch": 7.406779661016949, + "grad_norm": 1.8098418726298369, + "learning_rate": 8.359471922753714e-09, + "logits/chosen": -0.355437695980072, + "logits/rejected": -0.3270444869995117, + "logps/chosen": -29.349018096923828, + "logps/rejected": -60.3387336730957, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6025338768959045, + "rewards/margins": 8.68097972869873, + "rewards/rejected": -9.283513069152832, + "step": 437 + }, + { + "epoch": 7.423728813559322, + "grad_norm": 1.8786140797643052, + "learning_rate": 7.891105195175356e-09, + "logits/chosen": -0.41774412989616394, + "logits/rejected": -0.379474401473999, + "logps/chosen": -31.24578094482422, + "logps/rejected": -48.23630905151367, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4310747385025024, + "rewards/margins": 6.719178676605225, + "rewards/rejected": -8.150252342224121, + "step": 438 + }, + { + "epoch": 7.440677966101695, + "grad_norm": 1.607505847132714, + "learning_rate": 7.4360300531355894e-09, + "logits/chosen": -0.2568835914134979, + "logits/rejected": -0.22977690398693085, + "logps/chosen": -32.3515625, + "logps/rejected": -66.43345642089844, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2904284000396729, + "rewards/margins": 8.297982215881348, + "rewards/rejected": -9.588411331176758, + "step": 439 + }, + { + "epoch": 7.4576271186440675, + "grad_norm": 1.3035469316341721, + "learning_rate": 6.994271479897313e-09, + "logits/chosen": -0.43775883316993713, + "logits/rejected": -0.4212513267993927, + "logps/chosen": -22.23575210571289, + "logps/rejected": -42.02503967285156, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44508761167526245, + "rewards/margins": 6.237364768981934, + "rewards/rejected": -6.68245267868042, + "step": 440 + }, + { + "epoch": 7.47457627118644, + "grad_norm": 1.366151434834416, + "learning_rate": 6.565853727654502e-09, + "logits/chosen": -0.5031697154045105, + "logits/rejected": -0.5177669525146484, + "logps/chosen": -31.38254737854004, + "logps/rejected": -55.45163345336914, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3961818218231201, + "rewards/margins": 7.067687034606934, + "rewards/rejected": -8.463868141174316, + "step": 441 + }, + { + "epoch": 7.491525423728813, + "grad_norm": 1.4529924037385114, + "learning_rate": 6.150800316200605e-09, + "logits/chosen": -0.44623109698295593, + "logits/rejected": -0.42772334814071655, + "logps/chosen": -26.20204734802246, + "logps/rejected": -44.2577018737793, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4152941405773163, + "rewards/margins": 7.550034046173096, + "rewards/rejected": -7.965329170227051, + "step": 442 + }, + { + "epoch": 7.508474576271187, + "grad_norm": 1.7067297640882242, + "learning_rate": 5.7491340316373485e-09, + "logits/chosen": -0.291814386844635, + "logits/rejected": -0.2504284083843231, + "logps/chosen": -27.000459671020508, + "logps/rejected": -54.978851318359375, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1894947290420532, + "rewards/margins": 7.88712739944458, + "rewards/rejected": -9.076622009277344, + "step": 443 + }, + { + "epoch": 7.52542372881356, + "grad_norm": 1.8852004572462866, + "learning_rate": 5.360876925123992e-09, + "logits/chosen": -0.4699954390525818, + "logits/rejected": -0.4355739653110504, + "logps/chosen": -34.63423156738281, + "logps/rejected": -65.0200424194336, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3088154792785645, + "rewards/margins": 8.483866691589355, + "rewards/rejected": -9.792682647705078, + "step": 444 + }, + { + "epoch": 7.5423728813559325, + "grad_norm": 1.9525606885122415, + "learning_rate": 4.9860503116665176e-09, + "logits/chosen": -0.580295205116272, + "logits/rejected": -0.5321290493011475, + "logps/chosen": -26.729717254638672, + "logps/rejected": -52.8782958984375, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3733120858669281, + "rewards/margins": 6.56894588470459, + "rewards/rejected": -6.942258358001709, + "step": 445 + }, + { + "epoch": 7.559322033898305, + "grad_norm": 1.9545216316177383, + "learning_rate": 4.624674768947484e-09, + "logits/chosen": -0.47320348024368286, + "logits/rejected": -0.42538437247276306, + "logps/chosen": -27.289257049560547, + "logps/rejected": -52.25529479980469, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5522884130477905, + "rewards/margins": 6.996241092681885, + "rewards/rejected": -7.548530578613281, + "step": 446 + }, + { + "epoch": 7.576271186440678, + "grad_norm": 1.884083633370762, + "learning_rate": 4.2767701361964835e-09, + "logits/chosen": -0.31534552574157715, + "logits/rejected": -0.2909752428531647, + "logps/chosen": -37.41596221923828, + "logps/rejected": -55.43064880371094, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.208031415939331, + "rewards/margins": 6.201999664306641, + "rewards/rejected": -8.41003131866455, + "step": 447 + }, + { + "epoch": 7.593220338983051, + "grad_norm": 1.6257233201960972, + "learning_rate": 3.942355513100792e-09, + "logits/chosen": -0.40161648392677307, + "logits/rejected": -0.4106261134147644, + "logps/chosen": -26.018543243408203, + "logps/rejected": -62.679073333740234, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1849199533462524, + "rewards/margins": 8.389888763427734, + "rewards/rejected": -9.574809074401855, + "step": 448 + }, + { + "epoch": 7.610169491525424, + "grad_norm": 1.7831606635295467, + "learning_rate": 3.6214492587569313e-09, + "logits/chosen": -0.3574334383010864, + "logits/rejected": -0.350351482629776, + "logps/chosen": -33.58333206176758, + "logps/rejected": -48.65354537963867, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2194223403930664, + "rewards/margins": 7.209741592407227, + "rewards/rejected": -8.429162979125977, + "step": 449 + }, + { + "epoch": 7.627118644067797, + "grad_norm": 1.9827926138744145, + "learning_rate": 3.314068990662805e-09, + "logits/chosen": -0.5334146022796631, + "logits/rejected": -0.4546634256839752, + "logps/chosen": -25.30044174194336, + "logps/rejected": -49.781150817871094, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5673917531967163, + "rewards/margins": 7.860011577606201, + "rewards/rejected": -8.427403450012207, + "step": 450 + }, + { + "epoch": 7.6440677966101696, + "grad_norm": 1.5345319966235849, + "learning_rate": 3.0202315837502545e-09, + "logits/chosen": -0.41027843952178955, + "logits/rejected": -0.36624419689178467, + "logps/chosen": -29.778715133666992, + "logps/rejected": -46.88585662841797, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.713862657546997, + "rewards/margins": 5.918593406677246, + "rewards/rejected": -7.632455825805664, + "step": 451 + }, + { + "epoch": 7.661016949152542, + "grad_norm": 1.45498172133433, + "learning_rate": 2.7399531694589917e-09, + "logits/chosen": -0.49980151653289795, + "logits/rejected": -0.5059882998466492, + "logps/chosen": -27.507404327392578, + "logps/rejected": -53.86846923828125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6134958267211914, + "rewards/margins": 7.079102039337158, + "rewards/rejected": -8.692597389221191, + "step": 452 + }, + { + "epoch": 7.677966101694915, + "grad_norm": 1.3296441933194811, + "learning_rate": 2.473249134850808e-09, + "logits/chosen": -0.3527723550796509, + "logits/rejected": -0.31979426741600037, + "logps/chosen": -22.46451187133789, + "logps/rejected": -50.37282180786133, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8375161290168762, + "rewards/margins": 7.392008304595947, + "rewards/rejected": -8.229524612426758, + "step": 453 + }, + { + "epoch": 7.694915254237288, + "grad_norm": 1.9704674503284925, + "learning_rate": 2.220134121764833e-09, + "logits/chosen": -0.43200796842575073, + "logits/rejected": -0.4080568850040436, + "logps/chosen": -15.70004940032959, + "logps/rejected": -44.24908447265625, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03188416361808777, + "rewards/margins": 7.455360412597656, + "rewards/rejected": -7.423476219177246, + "step": 454 + }, + { + "epoch": 7.711864406779661, + "grad_norm": 1.58493742628634, + "learning_rate": 1.9806220260137065e-09, + "logits/chosen": -0.4422493577003479, + "logits/rejected": -0.37290158867836, + "logps/chosen": -30.651966094970703, + "logps/rejected": -55.6935920715332, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4701915681362152, + "rewards/margins": 8.02414321899414, + "rewards/rejected": -8.494333267211914, + "step": 455 + }, + { + "epoch": 7.728813559322034, + "grad_norm": 1.309516904226872, + "learning_rate": 1.7547259966207705e-09, + "logits/chosen": -0.5261704325675964, + "logits/rejected": -0.49233362078666687, + "logps/chosen": -27.28386116027832, + "logps/rejected": -53.776641845703125, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0615800619125366, + "rewards/margins": 8.335307121276855, + "rewards/rejected": -9.396886825561523, + "step": 456 + }, + { + "epoch": 7.745762711864407, + "grad_norm": 1.7021291625550554, + "learning_rate": 1.5424584350981485e-09, + "logits/chosen": -0.3087800443172455, + "logits/rejected": -0.3105306923389435, + "logps/chosen": -24.47256851196289, + "logps/rejected": -50.43601989746094, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5138031244277954, + "rewards/margins": 7.677865505218506, + "rewards/rejected": -8.191668510437012, + "step": 457 + }, + { + "epoch": 7.762711864406779, + "grad_norm": 1.476829222148452, + "learning_rate": 1.343830994765982e-09, + "logits/chosen": -0.44474345445632935, + "logits/rejected": -0.42049241065979004, + "logps/chosen": -23.719074249267578, + "logps/rejected": -62.17032241821289, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6281836032867432, + "rewards/margins": 9.182320594787598, + "rewards/rejected": -9.810504913330078, + "step": 458 + }, + { + "epoch": 7.779661016949152, + "grad_norm": 1.350716594904905, + "learning_rate": 1.1588545801125837e-09, + "logits/chosen": -0.5191625356674194, + "logits/rejected": -0.4718668460845947, + "logps/chosen": -35.341068267822266, + "logps/rejected": -59.45354461669922, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2722684144973755, + "rewards/margins": 7.540390968322754, + "rewards/rejected": -8.81265926361084, + "step": 459 + }, + { + "epoch": 7.796610169491525, + "grad_norm": 1.4635314314598586, + "learning_rate": 9.87539346195776e-10, + "logits/chosen": -0.3168594241142273, + "logits/rejected": -0.2879508435726166, + "logps/chosen": -28.040536880493164, + "logps/rejected": -43.86100387573242, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7127920389175415, + "rewards/margins": 6.108259201049805, + "rewards/rejected": -6.821051120758057, + "step": 460 + }, + { + "epoch": 7.813559322033898, + "grad_norm": 1.5494249427881754, + "learning_rate": 8.298946980855315e-10, + "logits/chosen": -0.4457828998565674, + "logits/rejected": -0.3980650007724762, + "logps/chosen": -25.963443756103516, + "logps/rejected": -45.423763275146484, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5499848127365112, + "rewards/margins": 6.5887556076049805, + "rewards/rejected": -7.138739585876465, + "step": 461 + }, + { + "epoch": 7.830508474576272, + "grad_norm": 1.4534702698382904, + "learning_rate": 6.8592929034747e-10, + "logits/chosen": -0.35777002573013306, + "logits/rejected": -0.3949616849422455, + "logps/chosen": -28.67134666442871, + "logps/rejected": -57.564937591552734, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1654642820358276, + "rewards/margins": 6.651158332824707, + "rewards/rejected": -7.816622257232666, + "step": 462 + }, + { + "epoch": 7.847457627118644, + "grad_norm": 1.032270598397053, + "learning_rate": 5.556510265678771e-10, + "logits/chosen": -0.4886370003223419, + "logits/rejected": -0.5037115216255188, + "logps/chosen": -20.78964614868164, + "logps/rejected": -48.41303253173828, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6196805834770203, + "rewards/margins": 7.310200214385986, + "rewards/rejected": -7.9298810958862305, + "step": 463 + }, + { + "epoch": 7.864406779661017, + "grad_norm": 1.2578929925717066, + "learning_rate": 4.390670589196621e-10, + "logits/chosen": -0.2916780114173889, + "logits/rejected": -0.2758171856403351, + "logps/chosen": -25.246580123901367, + "logps/rejected": -56.34712219238281, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3181313276290894, + "rewards/margins": 8.5591402053833, + "rewards/rejected": -9.877272605895996, + "step": 464 + }, + { + "epoch": 7.88135593220339, + "grad_norm": 1.718937820229263, + "learning_rate": 3.3618378776981147e-10, + "logits/chosen": -0.2728620767593384, + "logits/rejected": -0.25526100397109985, + "logps/chosen": -27.420053482055664, + "logps/rejected": -48.728145599365234, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45923471450805664, + "rewards/margins": 6.748472213745117, + "rewards/rejected": -7.207706928253174, + "step": 465 + }, + { + "epoch": 7.898305084745763, + "grad_norm": 1.3557652621850438, + "learning_rate": 2.4700686132803075e-10, + "logits/chosen": -0.3592544496059418, + "logits/rejected": -0.37164703011512756, + "logps/chosen": -28.122146606445312, + "logps/rejected": -53.134910583496094, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5242568254470825, + "rewards/margins": 7.785172939300537, + "rewards/rejected": -8.309430122375488, + "step": 466 + }, + { + "epoch": 7.915254237288136, + "grad_norm": 1.2354179862035723, + "learning_rate": 1.715411753365481e-10, + "logits/chosen": -0.5242431163787842, + "logits/rejected": -0.4909352958202362, + "logps/chosen": -26.54534912109375, + "logps/rejected": -55.37403106689453, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6659201383590698, + "rewards/margins": 7.7830328941345215, + "rewards/rejected": -9.448952674865723, + "step": 467 + }, + { + "epoch": 7.932203389830509, + "grad_norm": 1.6240885957181501, + "learning_rate": 1.0979087280141297e-10, + "logits/chosen": -0.32274141907691956, + "logits/rejected": -0.32932335138320923, + "logps/chosen": -20.59052085876465, + "logps/rejected": -46.62628173828125, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.706177294254303, + "rewards/margins": 7.163034439086914, + "rewards/rejected": -7.8692121505737305, + "step": 468 + }, + { + "epoch": 7.9491525423728815, + "grad_norm": 1.495293278605491, + "learning_rate": 6.175934376509429e-11, + "logits/chosen": -0.272166907787323, + "logits/rejected": -0.29551127552986145, + "logps/chosen": -27.317262649536133, + "logps/rejected": -73.80632781982422, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1428358554840088, + "rewards/margins": 10.341331481933594, + "rewards/rejected": -11.484167098999023, + "step": 469 + }, + { + "epoch": 7.966101694915254, + "grad_norm": 1.4312305917111094, + "learning_rate": 2.7449225120268482e-11, + "logits/chosen": -0.2674176096916199, + "logits/rejected": -0.25498396158218384, + "logps/chosen": -26.586524963378906, + "logps/rejected": -52.65561294555664, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.174375295639038, + "rewards/margins": 6.976294040679932, + "rewards/rejected": -8.150670051574707, + "step": 470 + }, + { + "epoch": 7.983050847457627, + "grad_norm": 1.5506645838575677, + "learning_rate": 6.862400465157403e-12, + "logits/chosen": -0.22756405174732208, + "logits/rejected": -0.21930274367332458, + "logps/chosen": -35.59461212158203, + "logps/rejected": -42.98273849487305, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.217814564704895, + "rewards/margins": 5.8322625160217285, + "rewards/rejected": -7.050076484680176, + "step": 471 + }, + { + "epoch": 8.0, + "grad_norm": 1.608716199113347, + "learning_rate": 0.0, + "logits/chosen": -0.49069491028785706, + "logits/rejected": -0.4363957941532135, + "logps/chosen": -31.856151580810547, + "logps/rejected": -46.5308837890625, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7556713223457336, + "rewards/margins": 6.963629245758057, + "rewards/rejected": -7.719299793243408, + "step": 472 + }, + { + "epoch": 8.0, + "step": 472, + "total_flos": 0.0, + "train_loss": 0.11897581996064696, + "train_runtime": 99870.155, + "train_samples_per_second": 0.605, + "train_steps_per_second": 0.005 + } + ], + "logging_steps": 1, + "max_steps": 472, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 400, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}