{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02092050209205021, "grad_norm": 5.176846146347138, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.630842924118042, "logits/rejected": -2.5769855976104736, "logps/chosen": -288.64373779296875, "logps/rejected": -275.88287353515625, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.00047967396676540375, "rewards/margins": 0.0007994862971827388, "rewards/rejected": -0.0003198123595211655, "step": 10 }, { "epoch": 0.04184100418410042, "grad_norm": 4.778746903378828, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.6447551250457764, "logits/rejected": -2.6132736206054688, "logps/chosen": -293.56829833984375, "logps/rejected": -259.22283935546875, "loss": 0.6927, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.002531626494601369, "rewards/margins": 0.0012841664720326662, "rewards/rejected": 0.0012474602553993464, "step": 20 }, { "epoch": 0.06276150627615062, "grad_norm": 4.484860702954145, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.6660215854644775, "logits/rejected": -2.589404821395874, "logps/chosen": -294.7344970703125, "logps/rejected": -287.273193359375, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": 0.014913314953446388, "rewards/margins": 0.008078203536570072, "rewards/rejected": 0.006835112813860178, "step": 30 }, { "epoch": 0.08368200836820083, "grad_norm": 4.460846213560835, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.6350624561309814, "logits/rejected": -2.5527660846710205, "logps/chosen": -270.5862731933594, "logps/rejected": -240.20895385742188, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": 0.033098004758358, "rewards/margins": 0.02473345957696438, "rewards/rejected": 0.008364550769329071, "step": 40 }, { "epoch": 0.10460251046025104, "grad_norm": 5.258394523703879, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.5787136554718018, "logits/rejected": -2.5705668926239014, "logps/chosen": -264.2970886230469, "logps/rejected": -246.346435546875, "loss": 0.6746, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01130986213684082, "rewards/margins": 0.07325177639722824, "rewards/rejected": -0.06194191053509712, "step": 50 }, { "epoch": 0.10460251046025104, "eval_logits/chosen": -2.6120545864105225, "eval_logits/rejected": -2.5751192569732666, "eval_logps/chosen": -260.48486328125, "eval_logps/rejected": -271.1068115234375, "eval_loss": 0.6513926982879639, "eval_rewards/accuracies": 0.6953125, "eval_rewards/chosen": 0.021450327709317207, "eval_rewards/margins": 0.10589740425348282, "eval_rewards/rejected": -0.08444707095623016, "eval_runtime": 104.2349, "eval_samples_per_second": 19.187, "eval_steps_per_second": 0.307, "step": 50 }, { "epoch": 0.12552301255230125, "grad_norm": 6.387285250582947, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.5387370586395264, "logits/rejected": -2.4989449977874756, "logps/chosen": -267.24676513671875, "logps/rejected": -259.34765625, "loss": 0.6603, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0057476600632071495, "rewards/margins": 0.13733819127082825, "rewards/rejected": -0.13159053027629852, "step": 60 }, { "epoch": 0.14644351464435146, "grad_norm": 10.713757122261297, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.6051666736602783, "logits/rejected": -2.5439701080322266, "logps/chosen": -302.8467712402344, "logps/rejected": -307.2948303222656, "loss": 0.6438, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17766565084457397, "rewards/margins": 0.19495461881160736, "rewards/rejected": -0.37262025475502014, "step": 70 }, { "epoch": 0.16736401673640167, "grad_norm": 13.46181824781145, "learning_rate": 4.931986719649298e-07, "logits/chosen": -1.8741085529327393, "logits/rejected": -1.8762668371200562, "logps/chosen": -299.6753845214844, "logps/rejected": -331.6051330566406, "loss": 0.6154, "rewards/accuracies": 0.59375, "rewards/chosen": -0.40526852011680603, "rewards/margins": 0.28667524456977844, "rewards/rejected": -0.6919438242912292, "step": 80 }, { "epoch": 0.18828451882845187, "grad_norm": 11.81447104504551, "learning_rate": 4.883222001996351e-07, "logits/chosen": -1.0113297700881958, "logits/rejected": -0.8674372434616089, "logps/chosen": -323.6530456542969, "logps/rejected": -368.2422790527344, "loss": 0.6023, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5255545377731323, "rewards/margins": 0.5487722158432007, "rewards/rejected": -1.074326753616333, "step": 90 }, { "epoch": 0.20920502092050208, "grad_norm": 20.253194674698854, "learning_rate": 4.821741763807186e-07, "logits/chosen": -0.3053513169288635, "logits/rejected": 0.3383195400238037, "logps/chosen": -372.4469299316406, "logps/rejected": -375.91619873046875, "loss": 0.5801, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.784656286239624, "rewards/margins": 0.7117811441421509, "rewards/rejected": -1.4964375495910645, "step": 100 }, { "epoch": 0.20920502092050208, "eval_logits/chosen": 0.5614331960678101, "eval_logits/rejected": 0.8477897644042969, "eval_logps/chosen": -386.7606506347656, "eval_logps/rejected": -462.9020690917969, "eval_loss": 0.5962740182876587, "eval_rewards/accuracies": 0.69140625, "eval_rewards/chosen": -1.241307258605957, "eval_rewards/margins": 0.7610923647880554, "eval_rewards/rejected": -2.0023996829986572, "eval_runtime": 102.9081, "eval_samples_per_second": 19.435, "eval_steps_per_second": 0.311, "step": 100 }, { "epoch": 0.2301255230125523, "grad_norm": 14.600851352900357, "learning_rate": 4.747874028753375e-07, "logits/chosen": 0.512942910194397, "logits/rejected": 0.9520395398139954, "logps/chosen": -399.2005920410156, "logps/rejected": -453.3448791503906, "loss": 0.5789, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0446428060531616, "rewards/margins": 0.7325394749641418, "rewards/rejected": -1.7771823406219482, "step": 110 }, { "epoch": 0.2510460251046025, "grad_norm": 18.76879266561917, "learning_rate": 4.662012913161997e-07, "logits/chosen": 0.7682480812072754, "logits/rejected": 1.3250311613082886, "logps/chosen": -378.8160095214844, "logps/rejected": -432.3802795410156, "loss": 0.5736, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0223931074142456, "rewards/margins": 0.9169514775276184, "rewards/rejected": -1.9393446445465088, "step": 120 }, { "epoch": 0.2719665271966527, "grad_norm": 15.069786999974797, "learning_rate": 4.5646165232345103e-07, "logits/chosen": 0.34132882952690125, "logits/rejected": 0.8815134167671204, "logps/chosen": -395.4140930175781, "logps/rejected": -450.15667724609375, "loss": 0.5599, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9794706106185913, "rewards/margins": 0.8771657943725586, "rewards/rejected": -1.856636643409729, "step": 130 }, { "epoch": 0.2928870292887029, "grad_norm": 19.253522278201135, "learning_rate": 4.456204510851956e-07, "logits/chosen": 0.5900996327400208, "logits/rejected": 1.196023941040039, "logps/chosen": -394.09033203125, "logps/rejected": -462.4150390625, "loss": 0.5556, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9378790855407715, "rewards/margins": 0.9935140609741211, "rewards/rejected": -1.931393027305603, "step": 140 }, { "epoch": 0.3138075313807531, "grad_norm": 17.705226251621124, "learning_rate": 4.337355301007335e-07, "logits/chosen": 1.6807399988174438, "logits/rejected": 2.5443928241729736, "logps/chosen": -434.1851501464844, "logps/rejected": -498.25689697265625, "loss": 0.561, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4313938617706299, "rewards/margins": 0.8576656579971313, "rewards/rejected": -2.289059638977051, "step": 150 }, { "epoch": 0.3138075313807531, "eval_logits/chosen": 1.6750099658966064, "eval_logits/rejected": 2.122723340988159, "eval_logps/chosen": -397.78521728515625, "eval_logps/rejected": -493.1910095214844, "eval_loss": 0.5612272620201111, "eval_rewards/accuracies": 0.7421875, "eval_rewards/chosen": -1.351552963256836, "eval_rewards/margins": 0.9537361860275269, "eval_rewards/rejected": -2.3052892684936523, "eval_runtime": 102.8554, "eval_samples_per_second": 19.445, "eval_steps_per_second": 0.311, "step": 150 }, { "epoch": 0.33472803347280333, "grad_norm": 17.923487490122945, "learning_rate": 4.2087030056579986e-07, "logits/chosen": 1.7796026468276978, "logits/rejected": 2.404552936553955, "logps/chosen": -391.76666259765625, "logps/rejected": -457.2339782714844, "loss": 0.5577, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2124366760253906, "rewards/margins": 0.963293731212616, "rewards/rejected": -2.1757304668426514, "step": 160 }, { "epoch": 0.35564853556485354, "grad_norm": 22.93702368800413, "learning_rate": 4.070934040463998e-07, "logits/chosen": 2.2401363849639893, "logits/rejected": 3.037454128265381, "logps/chosen": -417.4873962402344, "logps/rejected": -499.13262939453125, "loss": 0.5345, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4775826930999756, "rewards/margins": 0.9278723001480103, "rewards/rejected": -2.4054548740386963, "step": 170 }, { "epoch": 0.37656903765690375, "grad_norm": 22.059850949763494, "learning_rate": 3.9247834624635404e-07, "logits/chosen": 2.308513879776001, "logits/rejected": 3.129546642303467, "logps/chosen": -479.5143127441406, "logps/rejected": -558.447509765625, "loss": 0.5486, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7520792484283447, "rewards/margins": 1.1397490501403809, "rewards/rejected": -2.8918280601501465, "step": 180 }, { "epoch": 0.39748953974895396, "grad_norm": 21.54003656099098, "learning_rate": 3.7710310482256523e-07, "logits/chosen": 1.515995979309082, "logits/rejected": 2.3186123371124268, "logps/chosen": -405.1570129394531, "logps/rejected": -478.882080078125, "loss": 0.531, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5137742757797241, "rewards/margins": 0.8394115567207336, "rewards/rejected": -2.3531858921051025, "step": 190 }, { "epoch": 0.41841004184100417, "grad_norm": 21.26002965858666, "learning_rate": 3.610497133404795e-07, "logits/chosen": 2.5295510292053223, "logits/rejected": 3.3073439598083496, "logps/chosen": -462.2782287597656, "logps/rejected": -518.1025390625, "loss": 0.552, "rewards/accuracies": 0.6875, "rewards/chosen": -2.019491672515869, "rewards/margins": 0.858087420463562, "rewards/rejected": -2.8775787353515625, "step": 200 }, { "epoch": 0.41841004184100417, "eval_logits/chosen": 2.018301010131836, "eval_logits/rejected": 2.6770527362823486, "eval_logps/chosen": -441.72589111328125, "eval_logps/rejected": -564.1273803710938, "eval_loss": 0.563401997089386, "eval_rewards/accuracies": 0.75390625, "eval_rewards/chosen": -1.7909597158432007, "eval_rewards/margins": 1.2236928939819336, "eval_rewards/rejected": -3.0146522521972656, "eval_runtime": 104.172, "eval_samples_per_second": 19.199, "eval_steps_per_second": 0.307, "step": 200 }, { "epoch": 0.4393305439330544, "grad_norm": 21.451368111326985, "learning_rate": 3.4440382358952115e-07, "logits/chosen": 1.9598217010498047, "logits/rejected": 2.9120290279388428, "logps/chosen": -447.69476318359375, "logps/rejected": -529.9307861328125, "loss": 0.5413, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7580368518829346, "rewards/margins": 0.9606950879096985, "rewards/rejected": -2.7187318801879883, "step": 210 }, { "epoch": 0.4602510460251046, "grad_norm": 20.160436671538378, "learning_rate": 3.272542485937368e-07, "logits/chosen": 1.119225025177002, "logits/rejected": 1.9501537084579468, "logps/chosen": -417.14422607421875, "logps/rejected": -513.2694091796875, "loss": 0.555, "rewards/accuracies": 0.75, "rewards/chosen": -1.3289070129394531, "rewards/margins": 1.0918917655944824, "rewards/rejected": -2.4207987785339355, "step": 220 }, { "epoch": 0.4811715481171548, "grad_norm": 20.190647127947944, "learning_rate": 3.096924887558854e-07, "logits/chosen": 2.0996615886688232, "logits/rejected": 3.000412940979004, "logps/chosen": -453.54473876953125, "logps/rejected": -568.1104125976562, "loss": 0.5188, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7147760391235352, "rewards/margins": 1.3763706684112549, "rewards/rejected": -3.09114670753479, "step": 230 }, { "epoch": 0.502092050209205, "grad_norm": 20.668039994089078, "learning_rate": 2.9181224366319943e-07, "logits/chosen": 3.0839335918426514, "logits/rejected": 4.029969215393066, "logps/chosen": -446.9358825683594, "logps/rejected": -518.8782958984375, "loss": 0.5411, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.922851800918579, "rewards/margins": 1.0858973264694214, "rewards/rejected": -3.00874924659729, "step": 240 }, { "epoch": 0.5230125523012552, "grad_norm": 21.91013779471945, "learning_rate": 2.7370891215954565e-07, "logits/chosen": 2.500542163848877, "logits/rejected": 3.4184327125549316, "logps/chosen": -480.619873046875, "logps/rejected": -543.4830322265625, "loss": 0.5367, "rewards/accuracies": 0.75, "rewards/chosen": -1.8631795644760132, "rewards/margins": 1.1071960926055908, "rewards/rejected": -2.9703755378723145, "step": 250 }, { "epoch": 0.5230125523012552, "eval_logits/chosen": 2.173550844192505, "eval_logits/rejected": 2.8097872734069824, "eval_logps/chosen": -423.32470703125, "eval_logps/rejected": -549.812744140625, "eval_loss": 0.5404338836669922, "eval_rewards/accuracies": 0.765625, "eval_rewards/chosen": -1.6069477796554565, "eval_rewards/margins": 1.2645587921142578, "eval_rewards/rejected": -2.871506690979004, "eval_runtime": 104.7771, "eval_samples_per_second": 19.088, "eval_steps_per_second": 0.305, "step": 250 }, { "epoch": 0.5439330543933054, "grad_norm": 22.08171649508119, "learning_rate": 2.55479083351317e-07, "logits/chosen": 2.1905927658081055, "logits/rejected": 3.057525873184204, "logps/chosen": -468.181884765625, "logps/rejected": -552.5390014648438, "loss": 0.5252, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5366401672363281, "rewards/margins": 1.3390202522277832, "rewards/rejected": -2.8756604194641113, "step": 260 }, { "epoch": 0.5648535564853556, "grad_norm": 18.888110325544, "learning_rate": 2.3722002126275822e-07, "logits/chosen": 2.343186140060425, "logits/rejected": 3.283693790435791, "logps/chosen": -480.1078186035156, "logps/rejected": -560.9808349609375, "loss": 0.5243, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8669660091400146, "rewards/margins": 1.2250339984893799, "rewards/rejected": -3.0920000076293945, "step": 270 }, { "epoch": 0.5857740585774058, "grad_norm": 27.244776233826, "learning_rate": 2.19029145890313e-07, "logits/chosen": 2.048271656036377, "logits/rejected": 2.768385887145996, "logps/chosen": -433.47479248046875, "logps/rejected": -546.029296875, "loss": 0.5272, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7612041234970093, "rewards/margins": 1.1764047145843506, "rewards/rejected": -2.937608242034912, "step": 280 }, { "epoch": 0.606694560669456, "grad_norm": 21.547725491962094, "learning_rate": 2.0100351342479216e-07, "logits/chosen": 2.24129056930542, "logits/rejected": 2.8873372077941895, "logps/chosen": -460.38690185546875, "logps/rejected": -570.9521484375, "loss": 0.523, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6904579401016235, "rewards/margins": 1.2296130657196045, "rewards/rejected": -2.9200711250305176, "step": 290 }, { "epoch": 0.6276150627615062, "grad_norm": 21.34579167577593, "learning_rate": 1.8323929841460178e-07, "logits/chosen": 2.917755603790283, "logits/rejected": 3.5216128826141357, "logps/chosen": -471.07965087890625, "logps/rejected": -600.7293701171875, "loss": 0.5231, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0175368785858154, "rewards/margins": 1.2063273191452026, "rewards/rejected": -3.2238643169403076, "step": 300 }, { "epoch": 0.6276150627615062, "eval_logits/chosen": 2.2075393199920654, "eval_logits/rejected": 2.986379384994507, "eval_logps/chosen": -445.05584716796875, "eval_logps/rejected": -587.8876953125, "eval_loss": 0.5511458516120911, "eval_rewards/accuracies": 0.765625, "eval_rewards/chosen": -1.8242592811584473, "eval_rewards/margins": 1.42799711227417, "eval_rewards/rejected": -3.252256393432617, "eval_runtime": 103.1559, "eval_samples_per_second": 19.388, "eval_steps_per_second": 0.31, "step": 300 }, { "epoch": 0.6485355648535565, "grad_norm": 18.70711648745981, "learning_rate": 1.6583128063291573e-07, "logits/chosen": 1.9323720932006836, "logits/rejected": 2.7933290004730225, "logps/chosen": -479.4503479003906, "logps/rejected": -588.09228515625, "loss": 0.5303, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8548892736434937, "rewards/margins": 1.2552098035812378, "rewards/rejected": -3.1100995540618896, "step": 310 }, { "epoch": 0.6694560669456067, "grad_norm": 24.633583083411633, "learning_rate": 1.488723393865766e-07, "logits/chosen": 1.4087620973587036, "logits/rejected": 2.465496778488159, "logps/chosen": -487.93646240234375, "logps/rejected": -566.53466796875, "loss": 0.5063, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9570667743682861, "rewards/margins": 1.188623070716858, "rewards/rejected": -3.1456902027130127, "step": 320 }, { "epoch": 0.6903765690376569, "grad_norm": 20.927701473521733, "learning_rate": 1.3245295796480788e-07, "logits/chosen": 1.6705989837646484, "logits/rejected": 2.787672758102417, "logps/chosen": -478.430908203125, "logps/rejected": -544.5813598632812, "loss": 0.5178, "rewards/accuracies": 0.6875, "rewards/chosen": -1.948035478591919, "rewards/margins": 1.0792946815490723, "rewards/rejected": -3.027329921722412, "step": 330 }, { "epoch": 0.7112970711297071, "grad_norm": 21.58725862881672, "learning_rate": 1.1666074087171627e-07, "logits/chosen": 2.2645626068115234, "logits/rejected": 3.0073537826538086, "logps/chosen": -433.1025390625, "logps/rejected": -569.9879150390625, "loss": 0.5331, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8666340112686157, "rewards/margins": 1.2376598119735718, "rewards/rejected": -3.1042943000793457, "step": 340 }, { "epoch": 0.7322175732217573, "grad_norm": 22.053584645202807, "learning_rate": 1.0157994641835734e-07, "logits/chosen": 2.3491084575653076, "logits/rejected": 3.309586763381958, "logps/chosen": -490.469970703125, "logps/rejected": -593.7388916015625, "loss": 0.5092, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0824320316314697, "rewards/margins": 1.1687943935394287, "rewards/rejected": -3.2512269020080566, "step": 350 }, { "epoch": 0.7322175732217573, "eval_logits/chosen": 2.094608783721924, "eval_logits/rejected": 2.88336443901062, "eval_logps/chosen": -461.0306701660156, "eval_logps/rejected": -602.9060668945312, "eval_loss": 0.5401915311813354, "eval_rewards/accuracies": 0.7734375, "eval_rewards/chosen": -1.9840072393417358, "eval_rewards/margins": 1.4184322357177734, "eval_rewards/rejected": -3.402439594268799, "eval_runtime": 104.739, "eval_samples_per_second": 19.095, "eval_steps_per_second": 0.306, "step": 350 }, { "epoch": 0.7531380753138075, "grad_norm": 18.40926814309719, "learning_rate": 8.729103716819111e-08, "logits/chosen": 2.376570701599121, "logits/rejected": 3.0961055755615234, "logps/chosen": -463.7430725097656, "logps/rejected": -592.4307861328125, "loss": 0.5261, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.168846368789673, "rewards/margins": 1.2989672422409058, "rewards/rejected": -3.467813491821289, "step": 360 }, { "epoch": 0.7740585774058577, "grad_norm": 20.93465319965544, "learning_rate": 7.387025063449081e-08, "logits/chosen": 1.8760063648223877, "logits/rejected": 3.0332190990448, "logps/chosen": -485.4361267089844, "logps/rejected": -611.39599609375, "loss": 0.512, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.975916862487793, "rewards/margins": 1.4137961864471436, "rewards/rejected": -3.3897128105163574, "step": 370 }, { "epoch": 0.7949790794979079, "grad_norm": 21.53735300502108, "learning_rate": 6.138919252022435e-08, "logits/chosen": 2.2272870540618896, "logits/rejected": 2.9403111934661865, "logps/chosen": -529.391845703125, "logps/rejected": -598.6041870117188, "loss": 0.5243, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.129279613494873, "rewards/margins": 1.0328638553619385, "rewards/rejected": -3.1621437072753906, "step": 380 }, { "epoch": 0.8158995815899581, "grad_norm": 22.856314943850858, "learning_rate": 4.991445467064689e-08, "logits/chosen": 2.5150341987609863, "logits/rejected": 3.433137893676758, "logps/chosen": -523.1361694335938, "logps/rejected": -647.6205444335938, "loss": 0.5212, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1657605171203613, "rewards/margins": 1.4086949825286865, "rewards/rejected": -3.5744547843933105, "step": 390 }, { "epoch": 0.8368200836820083, "grad_norm": 20.459584660793766, "learning_rate": 3.9507259776993954e-08, "logits/chosen": 2.373227596282959, "logits/rejected": 3.0179686546325684, "logps/chosen": -520.0594482421875, "logps/rejected": -614.9906005859375, "loss": 0.5231, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2612462043762207, "rewards/margins": 1.1028645038604736, "rewards/rejected": -3.3641109466552734, "step": 400 }, { "epoch": 0.8368200836820083, "eval_logits/chosen": 2.2364866733551025, "eval_logits/rejected": 3.0541534423828125, "eval_logps/chosen": -472.12713623046875, "eval_logps/rejected": -619.1116333007812, "eval_loss": 0.5417460799217224, "eval_rewards/accuracies": 0.78125, "eval_rewards/chosen": -2.0949723720550537, "eval_rewards/margins": 1.4695227146148682, "eval_rewards/rejected": -3.564495086669922, "eval_runtime": 103.2544, "eval_samples_per_second": 19.37, "eval_steps_per_second": 0.31, "step": 400 }, { "epoch": 0.8577405857740585, "grad_norm": 20.75509691260348, "learning_rate": 3.022313472693447e-08, "logits/chosen": 1.8815155029296875, "logits/rejected": 2.752415657043457, "logps/chosen": -530.4354248046875, "logps/rejected": -645.202392578125, "loss": 0.514, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.134329319000244, "rewards/margins": 1.3350379467010498, "rewards/rejected": -3.469367504119873, "step": 410 }, { "epoch": 0.8786610878661087, "grad_norm": 21.72917985290927, "learning_rate": 2.2111614344599684e-08, "logits/chosen": 2.2683472633361816, "logits/rejected": 3.545788526535034, "logps/chosen": -509.83563232421875, "logps/rejected": -617.6029052734375, "loss": 0.4959, "rewards/accuracies": 0.75, "rewards/chosen": -2.1364758014678955, "rewards/margins": 1.329615831375122, "rewards/rejected": -3.4660911560058594, "step": 420 }, { "epoch": 0.899581589958159, "grad_norm": 20.573623570397274, "learning_rate": 1.521597710086439e-08, "logits/chosen": 2.243511438369751, "logits/rejected": 3.2753005027770996, "logps/chosen": -517.85498046875, "logps/rejected": -601.6973876953125, "loss": 0.5284, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.218738317489624, "rewards/margins": 1.2048834562301636, "rewards/rejected": -3.4236221313476562, "step": 430 }, { "epoch": 0.9205020920502092, "grad_norm": 17.236420967349105, "learning_rate": 9.57301420397924e-09, "logits/chosen": 2.407790422439575, "logits/rejected": 3.659886598587036, "logps/chosen": -511.99688720703125, "logps/rejected": -602.4998168945312, "loss": 0.5239, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2972302436828613, "rewards/margins": 1.267395257949829, "rewards/rejected": -3.5646255016326904, "step": 440 }, { "epoch": 0.9414225941422594, "grad_norm": 18.669490237353997, "learning_rate": 5.212833302556258e-09, "logits/chosen": 2.2820138931274414, "logits/rejected": 3.1481757164001465, "logps/chosen": -492.22052001953125, "logps/rejected": -632.2754516601562, "loss": 0.5232, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.074744462966919, "rewards/margins": 1.4727141857147217, "rewards/rejected": -3.5474586486816406, "step": 450 }, { "epoch": 0.9414225941422594, "eval_logits/chosen": 2.1926114559173584, "eval_logits/rejected": 3.032156467437744, "eval_logps/chosen": -469.2007751464844, "eval_logps/rejected": -617.9429931640625, "eval_loss": 0.5418744087219238, "eval_rewards/accuracies": 0.78125, "eval_rewards/chosen": -2.0657081604003906, "eval_rewards/margins": 1.487100601196289, "eval_rewards/rejected": -3.552809000015259, "eval_runtime": 103.486, "eval_samples_per_second": 19.326, "eval_steps_per_second": 0.309, "step": 450 }, { "epoch": 0.9623430962343096, "grad_norm": 21.564710597734962, "learning_rate": 2.158697848236607e-09, "logits/chosen": 2.6190943717956543, "logits/rejected": 3.548638105392456, "logps/chosen": -499.64208984375, "logps/rejected": -623.9893188476562, "loss": 0.5095, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.363645076751709, "rewards/margins": 1.3193647861480713, "rewards/rejected": -3.683009624481201, "step": 460 }, { "epoch": 0.9832635983263598, "grad_norm": 17.345861357972396, "learning_rate": 4.269029751107489e-10, "logits/chosen": 2.4917562007904053, "logits/rejected": 2.865358591079712, "logps/chosen": -476.9666442871094, "logps/rejected": -609.1448974609375, "loss": 0.5372, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.138411045074463, "rewards/margins": 1.2895748615264893, "rewards/rejected": -3.427985668182373, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.5571172007955767, "train_runtime": 12724.9946, "train_samples_per_second": 4.804, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }