{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988571428571429, "eval_steps": 100, "global_step": 437, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022857142857142857, "grad_norm": 9.858836405653433, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.7008285522460938, "logits/rejected": -2.6250243186950684, "logps/chosen": -301.27081298828125, "logps/rejected": -281.75146484375, "loss": 0.693, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 0.00015341350808739662, "rewards/margins": 0.0001716136175673455, "rewards/rejected": -1.8200071281171404e-05, "step": 10 }, { "epoch": 0.045714285714285714, "grad_norm": 7.79798162706573, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.6407124996185303, "logits/rejected": -2.6055800914764404, "logps/chosen": -278.97711181640625, "logps/rejected": -254.7215576171875, "loss": 0.6923, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0019410619279369712, "rewards/margins": 0.0014655741397291422, "rewards/rejected": 0.0004754880501423031, "step": 20 }, { "epoch": 0.06857142857142857, "grad_norm": 8.391842432337471, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.63759183883667, "logits/rejected": -2.6166491508483887, "logps/chosen": -263.44866943359375, "logps/rejected": -263.5602111816406, "loss": 0.6871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.012344349175691605, "rewards/margins": 0.008015613071620464, "rewards/rejected": 0.004328734241425991, "step": 30 }, { "epoch": 0.09142857142857143, "grad_norm": 9.39930524490582, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.6476945877075195, "logits/rejected": -2.5853049755096436, "logps/chosen": -290.5145568847656, "logps/rejected": -268.3503723144531, "loss": 0.6755, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.034826718270778656, "rewards/margins": 0.042017363011837006, "rewards/rejected": -0.007190642412751913, "step": 40 }, { "epoch": 0.11428571428571428, "grad_norm": 13.99279327188838, "learning_rate": 4.997124959943201e-07, "logits/chosen": -2.6765246391296387, "logits/rejected": -2.5974183082580566, "logps/chosen": -294.23516845703125, "logps/rejected": -254.03042602539062, "loss": 0.6629, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.02075173892080784, "rewards/margins": 0.0924127846956253, "rewards/rejected": -0.07166104018688202, "step": 50 }, { "epoch": 0.13714285714285715, "grad_norm": 10.917441498337828, "learning_rate": 4.979579212164186e-07, "logits/chosen": -2.5758731365203857, "logits/rejected": -2.472479820251465, "logps/chosen": -290.3058166503906, "logps/rejected": -270.32891845703125, "loss": 0.6375, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.09799840301275253, "rewards/margins": 0.11976947635412216, "rewards/rejected": -0.21776790916919708, "step": 60 }, { "epoch": 0.16, "grad_norm": 14.438870715842985, "learning_rate": 4.946196886175515e-07, "logits/chosen": -2.568722724914551, "logits/rejected": -2.5102906227111816, "logps/chosen": -284.9170227050781, "logps/rejected": -291.45648193359375, "loss": 0.6099, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09138783067464828, "rewards/margins": 0.22690913081169128, "rewards/rejected": -0.31829696893692017, "step": 70 }, { "epoch": 0.18285714285714286, "grad_norm": 12.571362458092375, "learning_rate": 4.897191188239667e-07, "logits/chosen": -2.5521557331085205, "logits/rejected": -2.4944987297058105, "logps/chosen": -291.53851318359375, "logps/rejected": -307.15631103515625, "loss": 0.6075, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.24509508907794952, "rewards/margins": 0.27337878942489624, "rewards/rejected": -0.5184738636016846, "step": 80 }, { "epoch": 0.2057142857142857, "grad_norm": 13.355734952743322, "learning_rate": 4.832875107981763e-07, "logits/chosen": -2.6172804832458496, "logits/rejected": -2.551274538040161, "logps/chosen": -292.2721252441406, "logps/rejected": -307.9034423828125, "loss": 0.6079, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.15986846387386322, "rewards/margins": 0.3636865019798279, "rewards/rejected": -0.5235549807548523, "step": 90 }, { "epoch": 0.22857142857142856, "grad_norm": 14.60913947420442, "learning_rate": 4.753659419387223e-07, "logits/chosen": -2.629087448120117, "logits/rejected": -2.5364227294921875, "logps/chosen": -312.4242858886719, "logps/rejected": -296.9600524902344, "loss": 0.5995, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2603791356086731, "rewards/margins": 0.3747071921825409, "rewards/rejected": -0.6350862979888916, "step": 100 }, { "epoch": 0.22857142857142856, "eval_logits/chosen": -2.465451717376709, "eval_logits/rejected": -2.34794545173645, "eval_logps/chosen": -306.55853271484375, "eval_logps/rejected": -294.0323486328125, "eval_loss": 0.5960295796394348, "eval_rewards/accuracies": 0.7155172228813171, "eval_rewards/chosen": -0.30954551696777344, "eval_rewards/margins": 0.4400167167186737, "eval_rewards/rejected": -0.7495622038841248, "eval_runtime": 90.9114, "eval_samples_per_second": 20.14, "eval_steps_per_second": 0.319, "step": 100 }, { "epoch": 0.25142857142857145, "grad_norm": 16.802955839707842, "learning_rate": 4.660050057270191e-07, "logits/chosen": -2.4196040630340576, "logits/rejected": -2.3442416191101074, "logps/chosen": -358.9793395996094, "logps/rejected": -372.57965087890625, "loss": 0.5847, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.42675742506980896, "rewards/margins": 0.3199427127838135, "rewards/rejected": -0.7467001676559448, "step": 110 }, { "epoch": 0.2742857142857143, "grad_norm": 18.751366323915487, "learning_rate": 4.5526448859687144e-07, "logits/chosen": -2.141848087310791, "logits/rejected": -1.955249547958374, "logps/chosen": -351.53961181640625, "logps/rejected": -315.23406982421875, "loss": 0.571, "rewards/accuracies": 0.6875, "rewards/chosen": -0.40635138750076294, "rewards/margins": 0.42884722352027893, "rewards/rejected": -0.8351985812187195, "step": 120 }, { "epoch": 0.29714285714285715, "grad_norm": 24.80835479779821, "learning_rate": 4.432129880904388e-07, "logits/chosen": -1.6259305477142334, "logits/rejected": -1.344472885131836, "logps/chosen": -370.61798095703125, "logps/rejected": -360.02374267578125, "loss": 0.5487, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6546204686164856, "rewards/margins": 0.4409145414829254, "rewards/rejected": -1.0955349206924438, "step": 130 }, { "epoch": 0.32, "grad_norm": 22.937175363847114, "learning_rate": 4.299274747394055e-07, "logits/chosen": -1.5916811227798462, "logits/rejected": -1.4237914085388184, "logps/chosen": -361.98211669921875, "logps/rejected": -371.1536560058594, "loss": 0.5577, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4653448462486267, "rewards/margins": 0.5270770788192749, "rewards/rejected": -0.9924219250679016, "step": 140 }, { "epoch": 0.34285714285714286, "grad_norm": 22.019311232158735, "learning_rate": 4.1549280046953653e-07, "logits/chosen": -0.7861512303352356, "logits/rejected": -0.3463224768638611, "logps/chosen": -362.31756591796875, "logps/rejected": -414.545166015625, "loss": 0.5335, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7533167004585266, "rewards/margins": 0.6291457414627075, "rewards/rejected": -1.3824622631072998, "step": 150 }, { "epoch": 0.3657142857142857, "grad_norm": 28.0394510543571, "learning_rate": 4.000011566683401e-07, "logits/chosen": -0.27471694350242615, "logits/rejected": 0.3287709653377533, "logps/chosen": -392.02667236328125, "logps/rejected": -424.70556640625, "loss": 0.5445, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9268640279769897, "rewards/margins": 0.7637636065483093, "rewards/rejected": -1.6906276941299438, "step": 160 }, { "epoch": 0.38857142857142857, "grad_norm": 26.83796617150145, "learning_rate": 3.8355148537705047e-07, "logits/chosen": -0.9808514714241028, "logits/rejected": -0.5751891732215881, "logps/chosen": -374.8761291503906, "logps/rejected": -386.53851318359375, "loss": 0.5452, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7077123522758484, "rewards/margins": 0.5003622770309448, "rewards/rejected": -1.2080745697021484, "step": 170 }, { "epoch": 0.4114285714285714, "grad_norm": 24.04944603295859, "learning_rate": 3.662488473675315e-07, "logits/chosen": -1.283348798751831, "logits/rejected": -0.556102991104126, "logps/chosen": -405.58306884765625, "logps/rejected": -420.27606201171875, "loss": 0.5456, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7540328502655029, "rewards/margins": 0.8492224812507629, "rewards/rejected": -1.603255271911621, "step": 180 }, { "epoch": 0.4342857142857143, "grad_norm": 24.200865372247023, "learning_rate": 3.48203751140067e-07, "logits/chosen": -0.9612107276916504, "logits/rejected": -0.4138285517692566, "logps/chosen": -374.7398376464844, "logps/rejected": -386.4855041503906, "loss": 0.5501, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9848228693008423, "rewards/margins": 0.5350502133369446, "rewards/rejected": -1.5198729038238525, "step": 190 }, { "epoch": 0.45714285714285713, "grad_norm": 21.201969213616838, "learning_rate": 3.2953144712759537e-07, "logits/chosen": -1.3318579196929932, "logits/rejected": -0.688397228717804, "logps/chosen": -338.3636169433594, "logps/rejected": -369.132080078125, "loss": 0.5431, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7002092599868774, "rewards/margins": 0.7023059725761414, "rewards/rejected": -1.402515172958374, "step": 200 }, { "epoch": 0.45714285714285713, "eval_logits/chosen": -1.0623676776885986, "eval_logits/rejected": -0.09592445194721222, "eval_logps/chosen": -338.33111572265625, "eval_logps/rejected": -371.40704345703125, "eval_loss": 0.5325908660888672, "eval_rewards/accuracies": 0.7629310488700867, "eval_rewards/chosen": -0.627271831035614, "eval_rewards/margins": 0.8960375785827637, "eval_rewards/rejected": -1.523309350013733, "eval_runtime": 90.9273, "eval_samples_per_second": 20.137, "eval_steps_per_second": 0.319, "step": 200 }, { "epoch": 0.48, "grad_norm": 26.3766221546479, "learning_rate": 3.103511916141658e-07, "logits/chosen": -0.7819164991378784, "logits/rejected": -0.08627365529537201, "logps/chosen": -337.33160400390625, "logps/rejected": -386.0670471191406, "loss": 0.5355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7387806177139282, "rewards/margins": 0.6690900921821594, "rewards/rejected": -1.4078707695007324, "step": 210 }, { "epoch": 0.5028571428571429, "grad_norm": 25.508902398231484, "learning_rate": 2.9078548506882117e-07, "logits/chosen": -0.4892755448818207, "logits/rejected": 0.36961695551872253, "logps/chosen": -375.30145263671875, "logps/rejected": -400.32037353515625, "loss": 0.5589, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.90235435962677, "rewards/margins": 0.6390146613121033, "rewards/rejected": -1.5413691997528076, "step": 220 }, { "epoch": 0.5257142857142857, "grad_norm": 20.815219094385377, "learning_rate": 2.709592897595191e-07, "logits/chosen": -0.5455812215805054, "logits/rejected": 0.40110301971435547, "logps/chosen": -356.85198974609375, "logps/rejected": -377.463623046875, "loss": 0.5282, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7579285502433777, "rewards/margins": 0.6238406896591187, "rewards/rejected": -1.3817692995071411, "step": 230 }, { "epoch": 0.5485714285714286, "grad_norm": 29.165147607391557, "learning_rate": 2.509992316440332e-07, "logits/chosen": -0.44089436531066895, "logits/rejected": 0.5840796828269958, "logps/chosen": -384.84747314453125, "logps/rejected": -446.173828125, "loss": 0.5281, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8613764047622681, "rewards/margins": 0.8861438632011414, "rewards/rejected": -1.7475202083587646, "step": 240 }, { "epoch": 0.5714285714285714, "grad_norm": 21.600370107234998, "learning_rate": 2.3103279163519918e-07, "logits/chosen": -0.7112084031105042, "logits/rejected": -0.16417662799358368, "logps/chosen": -350.6177978515625, "logps/rejected": -410.4039001464844, "loss": 0.5421, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.69991534948349, "rewards/margins": 0.7021188735961914, "rewards/rejected": -1.4020342826843262, "step": 250 }, { "epoch": 0.5942857142857143, "grad_norm": 25.092625042887974, "learning_rate": 2.1118749140573358e-07, "logits/chosen": -0.9435871243476868, "logits/rejected": -0.3378845751285553, "logps/chosen": -350.99188232421875, "logps/rejected": -403.04901123046875, "loss": 0.5464, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7120517492294312, "rewards/margins": 0.5850083231925964, "rewards/rejected": -1.297060251235962, "step": 260 }, { "epoch": 0.6171428571428571, "grad_norm": 27.282048688910123, "learning_rate": 1.9159007893272703e-07, "logits/chosen": -0.04894972965121269, "logits/rejected": 1.124455213546753, "logps/chosen": -359.29815673828125, "logps/rejected": -393.60260009765625, "loss": 0.5192, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8677228689193726, "rewards/margins": 0.7632287740707397, "rewards/rejected": -1.6309516429901123, "step": 270 }, { "epoch": 0.64, "grad_norm": 28.952729734552342, "learning_rate": 1.7236571898357766e-07, "logits/chosen": 0.5159433484077454, "logits/rejected": 1.3371174335479736, "logps/chosen": -371.30694580078125, "logps/rejected": -441.8828125, "loss": 0.5321, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9826878309249878, "rewards/margins": 0.8050721287727356, "rewards/rejected": -1.7877601385116577, "step": 280 }, { "epoch": 0.6628571428571428, "grad_norm": 26.553591107110048, "learning_rate": 1.5363719371356882e-07, "logits/chosen": 0.31891578435897827, "logits/rejected": 1.1744709014892578, "logps/chosen": -396.26849365234375, "logps/rejected": -436.003173828125, "loss": 0.527, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9536263346672058, "rewards/margins": 0.7509206533432007, "rewards/rejected": -1.7045469284057617, "step": 290 }, { "epoch": 0.6857142857142857, "grad_norm": 21.68374961944068, "learning_rate": 1.3552411848071565e-07, "logits/chosen": -0.4674099385738373, "logits/rejected": 0.9225466847419739, "logps/chosen": -378.4803771972656, "logps/rejected": -411.07000732421875, "loss": 0.5138, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7657118439674377, "rewards/margins": 0.8185675740242004, "rewards/rejected": -1.5842792987823486, "step": 300 }, { "epoch": 0.6857142857142857, "eval_logits/chosen": -0.7676966190338135, "eval_logits/rejected": 0.6739733219146729, "eval_logps/chosen": -332.8910827636719, "eval_logps/rejected": -368.96173095703125, "eval_loss": 0.524158775806427, "eval_rewards/accuracies": 0.7629310488700867, "eval_rewards/chosen": -0.5728713274002075, "eval_rewards/margins": 0.9259848594665527, "eval_rewards/rejected": -1.4988560676574707, "eval_runtime": 90.1222, "eval_samples_per_second": 20.317, "eval_steps_per_second": 0.322, "step": 300 }, { "epoch": 0.7085714285714285, "grad_norm": 21.32729150729434, "learning_rate": 1.1814217788631473e-07, "logits/chosen": -0.6603255271911621, "logits/rejected": 0.27117711305618286, "logps/chosen": -326.17193603515625, "logps/rejected": -374.136474609375, "loss": 0.5335, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6986777782440186, "rewards/margins": 0.6707334518432617, "rewards/rejected": -1.3694112300872803, "step": 310 }, { "epoch": 0.7314285714285714, "grad_norm": 26.8635604286969, "learning_rate": 1.0160238692045331e-07, "logits/chosen": -0.23095369338989258, "logits/rejected": 0.5602467656135559, "logps/chosen": -329.86492919921875, "logps/rejected": -387.0187683105469, "loss": 0.5276, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8558257222175598, "rewards/margins": 0.6044800281524658, "rewards/rejected": -1.4603056907653809, "step": 320 }, { "epoch": 0.7542857142857143, "grad_norm": 24.235988068898312, "learning_rate": 8.601038193139438e-08, "logits/chosen": -0.3879459798336029, "logits/rejected": 0.737913966178894, "logps/chosen": -381.5906677246094, "logps/rejected": -407.6598815917969, "loss": 0.5205, "rewards/accuracies": 0.78125, "rewards/chosen": -0.823724627494812, "rewards/margins": 0.7749950289726257, "rewards/rejected": -1.5987197160720825, "step": 330 }, { "epoch": 0.7771428571428571, "grad_norm": 19.828650597594493, "learning_rate": 7.146574594727572e-08, "logits/chosen": 0.20474159717559814, "logits/rejected": 1.0162971019744873, "logps/chosen": -362.6044921875, "logps/rejected": -421.4503479003906, "loss": 0.521, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9665349721908569, "rewards/margins": 0.8277307748794556, "rewards/rejected": -1.7942657470703125, "step": 340 }, { "epoch": 0.8, "grad_norm": 22.77041280299393, "learning_rate": 5.8061372659157306e-08, "logits/chosen": -0.23095539212226868, "logits/rejected": 0.8589683771133423, "logps/chosen": -380.7283630371094, "logps/rejected": -412.0213317871094, "loss": 0.5294, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9102112054824829, "rewards/margins": 0.667026162147522, "rewards/rejected": -1.5772373676300049, "step": 350 }, { "epoch": 0.8228571428571428, "grad_norm": 24.963575760417292, "learning_rate": 4.5882873127531614e-08, "logits/chosen": -0.4319024682044983, "logits/rejected": 0.861344039440155, "logps/chosen": -366.2247314453125, "logps/rejected": -415.48028564453125, "loss": 0.5177, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8081969022750854, "rewards/margins": 0.8357402086257935, "rewards/rejected": -1.643937110900879, "step": 360 }, { "epoch": 0.8457142857142858, "grad_norm": 22.865584640185784, "learning_rate": 3.500802900154412e-08, "logits/chosen": -0.290349543094635, "logits/rejected": 1.0458359718322754, "logps/chosen": -349.37005615234375, "logps/rejected": -402.3358459472656, "loss": 0.5229, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7830866575241089, "rewards/margins": 0.856115996837616, "rewards/rejected": -1.6392027139663696, "step": 370 }, { "epoch": 0.8685714285714285, "grad_norm": 25.875193424699745, "learning_rate": 2.550629574310309e-08, "logits/chosen": -0.4227335453033447, "logits/rejected": 0.8518049120903015, "logps/chosen": -414.580322265625, "logps/rejected": -417.60418701171875, "loss": 0.5205, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9652398824691772, "rewards/margins": 0.6368075609207153, "rewards/rejected": -1.602047324180603, "step": 380 }, { "epoch": 0.8914285714285715, "grad_norm": 23.202834331091832, "learning_rate": 1.7438359028687983e-08, "logits/chosen": -0.143943652510643, "logits/rejected": 0.5985423922538757, "logps/chosen": -392.14068603515625, "logps/rejected": -445.93560791015625, "loss": 0.526, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8076774477958679, "rewards/margins": 0.7039517760276794, "rewards/rejected": -1.5116291046142578, "step": 390 }, { "epoch": 0.9142857142857143, "grad_norm": 31.25928182468869, "learning_rate": 1.0855747162029361e-08, "logits/chosen": 0.005290505476295948, "logits/rejected": 0.5700523257255554, "logps/chosen": -371.91815185546875, "logps/rejected": -422.913818359375, "loss": 0.5493, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9318493604660034, "rewards/margins": 0.6315523982048035, "rewards/rejected": -1.563401699066162, "step": 400 }, { "epoch": 0.9142857142857143, "eval_logits/chosen": -0.012980658560991287, "eval_logits/rejected": 1.572424292564392, "eval_logps/chosen": -357.8257751464844, "eval_logps/rejected": -398.6412353515625, "eval_loss": 0.5201366543769836, "eval_rewards/accuracies": 0.7758620977401733, "eval_rewards/chosen": -0.8222182989120483, "eval_rewards/margins": 0.9734326004981995, "eval_rewards/rejected": -1.795650839805603, "eval_runtime": 90.9384, "eval_samples_per_second": 20.135, "eval_steps_per_second": 0.319, "step": 400 }, { "epoch": 0.9371428571428572, "grad_norm": 24.160283510679385, "learning_rate": 5.8005019731033615e-09, "logits/chosen": -0.10076072067022324, "logits/rejected": 0.8628407716751099, "logps/chosen": -385.8863525390625, "logps/rejected": -425.5079040527344, "loss": 0.5157, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.977064311504364, "rewards/margins": 0.6807142496109009, "rewards/rejected": -1.6577785015106201, "step": 410 }, { "epoch": 0.96, "grad_norm": 21.949347011437847, "learning_rate": 2.3049103053431886e-09, "logits/chosen": -0.21599116921424866, "logits/rejected": 1.2091736793518066, "logps/chosen": -353.96514892578125, "logps/rejected": -403.6047058105469, "loss": 0.5167, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7121706604957581, "rewards/margins": 0.9840081930160522, "rewards/rejected": -1.696178674697876, "step": 420 }, { "epoch": 0.9828571428571429, "grad_norm": 23.473520497590073, "learning_rate": 3.9129780600541397e-10, "logits/chosen": 0.03587682545185089, "logits/rejected": 1.0841922760009766, "logps/chosen": -372.0699768066406, "logps/rejected": -431.086181640625, "loss": 0.5158, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8983810544013977, "rewards/margins": 0.775784432888031, "rewards/rejected": -1.6741654872894287, "step": 430 }, { "epoch": 0.9988571428571429, "step": 437, "total_flos": 0.0, "train_loss": 0.5608050315822016, "train_runtime": 10950.5403, "train_samples_per_second": 5.114, "train_steps_per_second": 0.04 } ], "logging_steps": 10, "max_steps": 437, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }