{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988571428571429, "eval_steps": 50, "global_step": 437, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022857142857142857, "grad_norm": 5.131386476065513, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.7012476921081543, "logits/rejected": -2.6254587173461914, "logps/chosen": -301.2864685058594, "logps/rejected": -281.73876953125, "loss": 0.6931, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -3.3517978863528697e-06, "rewards/margins": -0.00011216916755074635, "rewards/rejected": 0.0001088173157768324, "step": 10 }, { "epoch": 0.045714285714285714, "grad_norm": 4.052727080189218, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.640902042388916, "logits/rejected": -2.6057076454162598, "logps/chosen": -278.9025573730469, "logps/rejected": -254.7306671142578, "loss": 0.6927, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0026862886734306812, "rewards/margins": 0.0023018636275082827, "rewards/rejected": 0.00038442533696070313, "step": 20 }, { "epoch": 0.06857142857142857, "grad_norm": 4.434624613429657, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.6378769874572754, "logits/rejected": -2.6170475482940674, "logps/chosen": -263.394287109375, "logps/rejected": -263.47857666015625, "loss": 0.6898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.012888330034911633, "rewards/margins": 0.007743468042463064, "rewards/rejected": 0.005144862923771143, "step": 30 }, { "epoch": 0.09142857142857143, "grad_norm": 5.043723997595032, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.648149013519287, "logits/rejected": -2.5857646465301514, "logps/chosen": -290.42108154296875, "logps/rejected": -268.315185546875, "loss": 0.6832, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.03576214984059334, "rewards/margins": 0.042601048946380615, "rewards/rejected": -0.006838902831077576, "step": 40 }, { "epoch": 0.11428571428571428, "grad_norm": 7.751230449827184, "learning_rate": 4.997124959943201e-07, "logits/chosen": -2.675534725189209, "logits/rejected": -2.5954620838165283, "logps/chosen": -294.5313720703125, "logps/rejected": -254.864013671875, "loss": 0.6745, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.017789244651794434, "rewards/margins": 0.09778620302677155, "rewards/rejected": -0.07999695837497711, "step": 50 }, { "epoch": 0.11428571428571428, "eval_logits/chosen": -2.537318468093872, "eval_logits/rejected": -2.435882806777954, "eval_logps/chosen": -278.02972412109375, "eval_logps/rejected": -236.3332977294922, "eval_loss": 0.6651818156242371, "eval_rewards/accuracies": 0.6853448152542114, "eval_rewards/chosen": -0.024257637560367584, "eval_rewards/margins": 0.14831387996673584, "eval_rewards/rejected": -0.17257152497768402, "eval_runtime": 90.3676, "eval_samples_per_second": 20.262, "eval_steps_per_second": 0.321, "step": 50 }, { "epoch": 0.13714285714285715, "grad_norm": 6.305194820143998, "learning_rate": 4.979579212164186e-07, "logits/chosen": -2.5774006843566895, "logits/rejected": -2.4738996028900146, "logps/chosen": -293.5943908691406, "logps/rejected": -274.21575927734375, "loss": 0.6582, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13088412582874298, "rewards/margins": 0.12575200200080872, "rewards/rejected": -0.2566361129283905, "step": 60 }, { "epoch": 0.16, "grad_norm": 7.496714062889331, "learning_rate": 4.946196886175515e-07, "logits/chosen": -2.6025655269622803, "logits/rejected": -2.5487170219421387, "logps/chosen": -289.283203125, "logps/rejected": -295.7733459472656, "loss": 0.6391, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.13504940271377563, "rewards/margins": 0.22641614079475403, "rewards/rejected": -0.36146557331085205, "step": 70 }, { "epoch": 0.18285714285714286, "grad_norm": 9.182158909337614, "learning_rate": 4.897191188239667e-07, "logits/chosen": -2.635709047317505, "logits/rejected": -2.6006340980529785, "logps/chosen": -301.3983459472656, "logps/rejected": -321.3275451660156, "loss": 0.6321, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3436935245990753, "rewards/margins": 0.3164929151535034, "rewards/rejected": -0.6601864099502563, "step": 80 }, { "epoch": 0.2057142857142857, "grad_norm": 12.866627758013413, "learning_rate": 4.832875107981763e-07, "logits/chosen": -2.6577916145324707, "logits/rejected": -2.6026382446289062, "logps/chosen": -306.7483215332031, "logps/rejected": -331.4389343261719, "loss": 0.6272, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.30463024973869324, "rewards/margins": 0.45427924394607544, "rewards/rejected": -0.7589095830917358, "step": 90 }, { "epoch": 0.22857142857142856, "grad_norm": 10.120425097316348, "learning_rate": 4.753659419387223e-07, "logits/chosen": -2.6471400260925293, "logits/rejected": -2.5577075481414795, "logps/chosen": -336.1173095703125, "logps/rejected": -324.4599609375, "loss": 0.6243, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.49730944633483887, "rewards/margins": 0.41277560591697693, "rewards/rejected": -0.9100850820541382, "step": 100 }, { "epoch": 0.22857142857142856, "eval_logits/chosen": -2.496720790863037, "eval_logits/rejected": -2.3884377479553223, "eval_logps/chosen": -324.6850280761719, "eval_logps/rejected": -321.733154296875, "eval_loss": 0.6148081421852112, "eval_rewards/accuracies": 0.7025862336158752, "eval_rewards/chosen": -0.49081093072891235, "eval_rewards/margins": 0.5357595682144165, "eval_rewards/rejected": -1.0265703201293945, "eval_runtime": 90.5627, "eval_samples_per_second": 20.218, "eval_steps_per_second": 0.32, "step": 100 }, { "epoch": 0.25142857142857145, "grad_norm": 13.934592452644212, "learning_rate": 4.660050057270191e-07, "logits/chosen": -2.161785364151001, "logits/rejected": -2.060573101043701, "logps/chosen": -390.38104248046875, "logps/rejected": -416.17913818359375, "loss": 0.6043, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7407739758491516, "rewards/margins": 0.4419216215610504, "rewards/rejected": -1.1826956272125244, "step": 110 }, { "epoch": 0.2742857142857143, "grad_norm": 18.04191794745382, "learning_rate": 4.5526448859687144e-07, "logits/chosen": -0.9994190335273743, "logits/rejected": -0.5663596391677856, "logps/chosen": -400.4964904785156, "logps/rejected": -379.3451232910156, "loss": 0.5895, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8959201574325562, "rewards/margins": 0.580389142036438, "rewards/rejected": -1.4763094186782837, "step": 120 }, { "epoch": 0.29714285714285715, "grad_norm": 23.569070805460544, "learning_rate": 4.432129880904388e-07, "logits/chosen": 0.06917886435985565, "logits/rejected": 0.525992751121521, "logps/chosen": -440.16839599609375, "logps/rejected": -436.06341552734375, "loss": 0.5759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.350124716758728, "rewards/margins": 0.505806565284729, "rewards/rejected": -1.8559315204620361, "step": 130 }, { "epoch": 0.32, "grad_norm": 17.135407563954015, "learning_rate": 4.299274747394055e-07, "logits/chosen": -0.5247443914413452, "logits/rejected": -0.054386675357818604, "logps/chosen": -396.3623352050781, "logps/rejected": -422.71234130859375, "loss": 0.5878, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.809146523475647, "rewards/margins": 0.6988624930381775, "rewards/rejected": -1.5080091953277588, "step": 140 }, { "epoch": 0.34285714285714286, "grad_norm": 21.61040590906815, "learning_rate": 4.1549280046953653e-07, "logits/chosen": -0.23892001807689667, "logits/rejected": 0.5097410082817078, "logps/chosen": -382.4234924316406, "logps/rejected": -453.66033935546875, "loss": 0.5613, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9543758630752563, "rewards/margins": 0.8192380666732788, "rewards/rejected": -1.7736135721206665, "step": 150 }, { "epoch": 0.34285714285714286, "eval_logits/chosen": 0.7831615805625916, "eval_logits/rejected": 1.843852162361145, "eval_logps/chosen": -431.0166015625, "eval_logps/rejected": -483.92266845703125, "eval_loss": 0.5827990770339966, "eval_rewards/accuracies": 0.7241379022598267, "eval_rewards/chosen": -1.5541267395019531, "eval_rewards/margins": 1.0943388938903809, "eval_rewards/rejected": -2.648465394973755, "eval_runtime": 90.809, "eval_samples_per_second": 20.163, "eval_steps_per_second": 0.319, "step": 150 }, { "epoch": 0.3657142857142857, "grad_norm": 21.503001868471035, "learning_rate": 4.000011566683401e-07, "logits/chosen": 0.6514826416969299, "logits/rejected": 1.4523377418518066, "logps/chosen": -458.98016357421875, "logps/rejected": -519.3821411132812, "loss": 0.5709, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5963990688323975, "rewards/margins": 1.04099440574646, "rewards/rejected": -2.6373934745788574, "step": 160 }, { "epoch": 0.38857142857142857, "grad_norm": 22.124021346867316, "learning_rate": 3.8355148537705047e-07, "logits/chosen": 0.3275122046470642, "logits/rejected": 1.0541610717773438, "logps/chosen": -431.19708251953125, "logps/rejected": -465.676025390625, "loss": 0.5719, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2709215879440308, "rewards/margins": 0.7285287976264954, "rewards/rejected": -1.999450445175171, "step": 170 }, { "epoch": 0.4114285714285714, "grad_norm": 27.446979249488912, "learning_rate": 3.662488473675315e-07, "logits/chosen": -0.1673279106616974, "logits/rejected": 0.9503982663154602, "logps/chosen": -435.6683654785156, "logps/rejected": -467.34991455078125, "loss": 0.5746, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.054885745048523, "rewards/margins": 1.0191079378128052, "rewards/rejected": -2.073993682861328, "step": 180 }, { "epoch": 0.4342857142857143, "grad_norm": 21.261318648058893, "learning_rate": 3.48203751140067e-07, "logits/chosen": 0.8843255043029785, "logits/rejected": 1.7267229557037354, "logps/chosen": -421.56427001953125, "logps/rejected": -454.6175842285156, "loss": 0.5709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4530667066574097, "rewards/margins": 0.7481273412704468, "rewards/rejected": -2.2011942863464355, "step": 190 }, { "epoch": 0.45714285714285713, "grad_norm": 18.016130637147523, "learning_rate": 3.2953144712759537e-07, "logits/chosen": 0.8405307531356812, "logits/rejected": 1.9338849782943726, "logps/chosen": -396.523193359375, "logps/rejected": -462.48724365234375, "loss": 0.5618, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2818048000335693, "rewards/margins": 1.0542625188827515, "rewards/rejected": -2.336066961288452, "step": 200 }, { "epoch": 0.45714285714285713, "eval_logits/chosen": 1.353513240814209, "eval_logits/rejected": 2.633384943008423, "eval_logps/chosen": -407.1748962402344, "eval_logps/rejected": -473.4366149902344, "eval_loss": 0.5593964457511902, "eval_rewards/accuracies": 0.7456896305084229, "eval_rewards/chosen": -1.3157094717025757, "eval_rewards/margins": 1.2278952598571777, "eval_rewards/rejected": -2.543604850769043, "eval_runtime": 90.6049, "eval_samples_per_second": 20.209, "eval_steps_per_second": 0.32, "step": 200 }, { "epoch": 0.48, "grad_norm": 13.884449533300678, "learning_rate": 3.103511916141658e-07, "logits/chosen": 1.2088024616241455, "logits/rejected": 2.1989169120788574, "logps/chosen": -401.5423278808594, "logps/rejected": -482.7976989746094, "loss": 0.5543, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.380887746810913, "rewards/margins": 0.9942899942398071, "rewards/rejected": -2.3751778602600098, "step": 210 }, { "epoch": 0.5028571428571429, "grad_norm": 22.105068043395047, "learning_rate": 2.9078548506882117e-07, "logits/chosen": 1.1457306146621704, "logits/rejected": 2.113302707672119, "logps/chosen": -436.2627868652344, "logps/rejected": -482.525146484375, "loss": 0.573, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5119678974151611, "rewards/margins": 0.8514491319656372, "rewards/rejected": -2.363417387008667, "step": 220 }, { "epoch": 0.5257142857142857, "grad_norm": 22.834936526556735, "learning_rate": 2.709592897595191e-07, "logits/chosen": 1.5853362083435059, "logits/rejected": 2.6565065383911133, "logps/chosen": -436.3871154785156, "logps/rejected": -490.07977294921875, "loss": 0.5539, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.553279995918274, "rewards/margins": 0.9546509981155396, "rewards/rejected": -2.5079312324523926, "step": 230 }, { "epoch": 0.5485714285714286, "grad_norm": 24.160129408331898, "learning_rate": 2.509992316440332e-07, "logits/chosen": 1.073169469833374, "logits/rejected": 2.0948684215545654, "logps/chosen": -435.13104248046875, "logps/rejected": -528.5726318359375, "loss": 0.5519, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3642125129699707, "rewards/margins": 1.2072954177856445, "rewards/rejected": -2.5715081691741943, "step": 240 }, { "epoch": 0.5714285714285714, "grad_norm": 20.623735608734524, "learning_rate": 2.3103279163519918e-07, "logits/chosen": 1.123517394065857, "logits/rejected": 1.916009545326233, "logps/chosen": -412.7406311035156, "logps/rejected": -501.85394287109375, "loss": 0.558, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3211438655853271, "rewards/margins": 0.9953910112380981, "rewards/rejected": -2.316534996032715, "step": 250 }, { "epoch": 0.5714285714285714, "eval_logits/chosen": 1.5315722227096558, "eval_logits/rejected": 3.1078054904937744, "eval_logps/chosen": -408.7300720214844, "eval_logps/rejected": -478.3948059082031, "eval_loss": 0.5526223182678223, "eval_rewards/accuracies": 0.7629310488700867, "eval_rewards/chosen": -1.3312608003616333, "eval_rewards/margins": 1.2619256973266602, "eval_rewards/rejected": -2.593186378479004, "eval_runtime": 90.8269, "eval_samples_per_second": 20.159, "eval_steps_per_second": 0.319, "step": 250 }, { "epoch": 0.5942857142857143, "grad_norm": 18.937383972416836, "learning_rate": 2.1118749140573358e-07, "logits/chosen": 2.365237236022949, "logits/rejected": 2.8070249557495117, "logps/chosen": -467.107666015625, "logps/rejected": -548.4678344726562, "loss": 0.5559, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8732095956802368, "rewards/margins": 0.8780391812324524, "rewards/rejected": -2.751248598098755, "step": 260 }, { "epoch": 0.6171428571428571, "grad_norm": 21.474579443049855, "learning_rate": 1.9159007893272703e-07, "logits/chosen": 2.5897929668426514, "logits/rejected": 3.8603546619415283, "logps/chosen": -472.37420654296875, "logps/rejected": -535.6754150390625, "loss": 0.5468, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9984840154647827, "rewards/margins": 1.0531952381134033, "rewards/rejected": -3.0516793727874756, "step": 270 }, { "epoch": 0.64, "grad_norm": 26.380689829844055, "learning_rate": 1.7236571898357766e-07, "logits/chosen": 2.368163824081421, "logits/rejected": 3.2206153869628906, "logps/chosen": -457.65478515625, "logps/rejected": -559.1119384765625, "loss": 0.5472, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8461668491363525, "rewards/margins": 1.113884687423706, "rewards/rejected": -2.9600515365600586, "step": 280 }, { "epoch": 0.6628571428571428, "grad_norm": 21.888196354392417, "learning_rate": 1.5363719371356882e-07, "logits/chosen": 2.094879627227783, "logits/rejected": 2.8882927894592285, "logps/chosen": -477.3121643066406, "logps/rejected": -541.1532592773438, "loss": 0.5476, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.764062523841858, "rewards/margins": 0.991985023021698, "rewards/rejected": -2.7560477256774902, "step": 290 }, { "epoch": 0.6857142857142857, "grad_norm": 16.876283045841564, "learning_rate": 1.3552411848071565e-07, "logits/chosen": 2.240018844604492, "logits/rejected": 3.6286048889160156, "logps/chosen": -489.7718200683594, "logps/rejected": -560.1759033203125, "loss": 0.5399, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8786265850067139, "rewards/margins": 1.196711778640747, "rewards/rejected": -3.075338363647461, "step": 300 }, { "epoch": 0.6857142857142857, "eval_logits/chosen": 2.3192012310028076, "eval_logits/rejected": 3.8099868297576904, "eval_logps/chosen": -449.146484375, "eval_logps/rejected": -527.6614990234375, "eval_loss": 0.5464984178543091, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -1.7354251146316528, "eval_rewards/margins": 1.3504279851913452, "eval_rewards/rejected": -3.0858535766601562, "eval_runtime": 89.7327, "eval_samples_per_second": 20.405, "eval_steps_per_second": 0.323, "step": 300 }, { "epoch": 0.7085714285714285, "grad_norm": 17.86356014365164, "learning_rate": 1.1814217788631473e-07, "logits/chosen": 2.34942626953125, "logits/rejected": 3.1517879962921143, "logps/chosen": -443.734375, "logps/rejected": -516.211669921875, "loss": 0.5491, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8743022680282593, "rewards/margins": 0.9158605337142944, "rewards/rejected": -2.790163040161133, "step": 310 }, { "epoch": 0.7314285714285714, "grad_norm": 18.11439951327569, "learning_rate": 1.0160238692045331e-07, "logits/chosen": 2.115510940551758, "logits/rejected": 2.9685635566711426, "logps/chosen": -417.16412353515625, "logps/rejected": -496.8562927246094, "loss": 0.5474, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.728817343711853, "rewards/margins": 0.8298628926277161, "rewards/rejected": -2.558680295944214, "step": 320 }, { "epoch": 0.7542857142857143, "grad_norm": 23.20251472261412, "learning_rate": 8.601038193139438e-08, "logits/chosen": 1.6642992496490479, "logits/rejected": 2.858081579208374, "logps/chosen": -455.21453857421875, "logps/rejected": -516.3800048828125, "loss": 0.5498, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5599634647369385, "rewards/margins": 1.1259580850601196, "rewards/rejected": -2.6859214305877686, "step": 330 }, { "epoch": 0.7771428571428571, "grad_norm": 17.97386136540539, "learning_rate": 7.146574594727572e-08, "logits/chosen": 2.353365659713745, "logits/rejected": 3.1233067512512207, "logps/chosen": -446.12493896484375, "logps/rejected": -536.3157958984375, "loss": 0.5373, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.801740050315857, "rewards/margins": 1.141181230545044, "rewards/rejected": -2.9429211616516113, "step": 340 }, { "epoch": 0.8, "grad_norm": 21.871959889194457, "learning_rate": 5.8061372659157306e-08, "logits/chosen": 2.052710771560669, "logits/rejected": 3.2841498851776123, "logps/chosen": -481.0086975097656, "logps/rejected": -538.4657592773438, "loss": 0.5536, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9130144119262695, "rewards/margins": 0.9286670684814453, "rewards/rejected": -2.8416812419891357, "step": 350 }, { "epoch": 0.8, "eval_logits/chosen": 2.5496232509613037, "eval_logits/rejected": 4.100791931152344, "eval_logps/chosen": -459.9457702636719, "eval_logps/rejected": -537.3768310546875, "eval_loss": 0.5457741618156433, "eval_rewards/accuracies": 0.767241358757019, "eval_rewards/chosen": -1.8434182405471802, "eval_rewards/margins": 1.3395885229110718, "eval_rewards/rejected": -3.183006525039673, "eval_runtime": 90.4314, "eval_samples_per_second": 20.247, "eval_steps_per_second": 0.321, "step": 350 }, { "epoch": 0.8228571428571428, "grad_norm": 18.303747174951123, "learning_rate": 4.5882873127531614e-08, "logits/chosen": 2.120243549346924, "logits/rejected": 3.347181797027588, "logps/chosen": -475.18572998046875, "logps/rejected": -556.6296997070312, "loss": 0.5378, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8978074789047241, "rewards/margins": 1.1576240062713623, "rewards/rejected": -3.055431604385376, "step": 360 }, { "epoch": 0.8457142857142858, "grad_norm": 18.37433316843813, "learning_rate": 3.500802900154412e-08, "logits/chosen": 2.2264809608459473, "logits/rejected": 3.606698513031006, "logps/chosen": -450.2330627441406, "logps/rejected": -542.2530517578125, "loss": 0.5435, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7917168140411377, "rewards/margins": 1.2466588020324707, "rewards/rejected": -3.0383753776550293, "step": 370 }, { "epoch": 0.8685714285714285, "grad_norm": 21.62972776738107, "learning_rate": 2.550629574310309e-08, "logits/chosen": 1.8507616519927979, "logits/rejected": 3.276202440261841, "logps/chosen": -521.0817260742188, "logps/rejected": -551.79736328125, "loss": 0.543, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0302536487579346, "rewards/margins": 0.913725733757019, "rewards/rejected": -2.943979501724243, "step": 380 }, { "epoch": 0.8914285714285715, "grad_norm": 21.37326667405599, "learning_rate": 1.7438359028687983e-08, "logits/chosen": 2.1451709270477295, "logits/rejected": 2.937721014022827, "logps/chosen": -487.7579040527344, "logps/rejected": -573.7957763671875, "loss": 0.5485, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7638496160507202, "rewards/margins": 1.0263822078704834, "rewards/rejected": -2.7902321815490723, "step": 390 }, { "epoch": 0.9142857142857143, "grad_norm": 36.14381504781887, "learning_rate": 1.0855747162029361e-08, "logits/chosen": 2.2515556812286377, "logits/rejected": 2.859483480453491, "logps/chosen": -467.8270568847656, "logps/rejected": -549.77392578125, "loss": 0.5612, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.89093816280365, "rewards/margins": 0.9410650134086609, "rewards/rejected": -2.832003116607666, "step": 400 }, { "epoch": 0.9142857142857143, "eval_logits/chosen": 2.319566249847412, "eval_logits/rejected": 3.925558090209961, "eval_logps/chosen": -443.83612060546875, "eval_logps/rejected": -527.0015258789062, "eval_loss": 0.5425659418106079, "eval_rewards/accuracies": 0.7543103694915771, "eval_rewards/chosen": -1.6823216676712036, "eval_rewards/margins": 1.3969323635101318, "eval_rewards/rejected": -3.079254150390625, "eval_runtime": 90.5136, "eval_samples_per_second": 20.229, "eval_steps_per_second": 0.32, "step": 400 }, { "epoch": 0.9371428571428572, "grad_norm": 20.661658798051754, "learning_rate": 5.8005019731033615e-09, "logits/chosen": 2.158614158630371, "logits/rejected": 3.273141384124756, "logps/chosen": -486.8451232910156, "logps/rejected": -554.2020263671875, "loss": 0.5334, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9866514205932617, "rewards/margins": 0.9580680727958679, "rewards/rejected": -2.9447195529937744, "step": 410 }, { "epoch": 0.96, "grad_norm": 20.631088560363143, "learning_rate": 2.3049103053431886e-09, "logits/chosen": 2.0465073585510254, "logits/rejected": 3.5103249549865723, "logps/chosen": -440.74224853515625, "logps/rejected": -527.11376953125, "loss": 0.5445, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5799415111541748, "rewards/margins": 1.351327657699585, "rewards/rejected": -2.9312691688537598, "step": 420 }, { "epoch": 0.9828571428571429, "grad_norm": 21.077395280178123, "learning_rate": 3.9129780600541397e-10, "logits/chosen": 2.380171298980713, "logits/rejected": 3.3738162517547607, "logps/chosen": -453.9419860839844, "logps/rejected": -550.107421875, "loss": 0.5377, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7171010971069336, "rewards/margins": 1.1472762823104858, "rewards/rejected": -2.864377498626709, "step": 430 }, { "epoch": 0.9988571428571429, "step": 437, "total_flos": 0.0, "train_loss": 0.5805802214336614, "train_runtime": 11351.5205, "train_samples_per_second": 4.933, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 437, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }