sfulay's picture
Model save
ceeb09a verified
raw
history blame
28.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9988571428571429,
"eval_steps": 50,
"global_step": 437,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022857142857142857,
"grad_norm": 5.131386476065513,
"learning_rate": 1.1363636363636363e-07,
"logits/chosen": -2.7012476921081543,
"logits/rejected": -2.6254587173461914,
"logps/chosen": -301.2864685058594,
"logps/rejected": -281.73876953125,
"loss": 0.6931,
"rewards/accuracies": 0.36250001192092896,
"rewards/chosen": -3.3517978863528697e-06,
"rewards/margins": -0.00011216916755074635,
"rewards/rejected": 0.0001088173157768324,
"step": 10
},
{
"epoch": 0.045714285714285714,
"grad_norm": 4.052727080189218,
"learning_rate": 2.2727272727272726e-07,
"logits/chosen": -2.640902042388916,
"logits/rejected": -2.6057076454162598,
"logps/chosen": -278.9025573730469,
"logps/rejected": -254.7306671142578,
"loss": 0.6927,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.0026862886734306812,
"rewards/margins": 0.0023018636275082827,
"rewards/rejected": 0.00038442533696070313,
"step": 20
},
{
"epoch": 0.06857142857142857,
"grad_norm": 4.434624613429657,
"learning_rate": 3.4090909090909085e-07,
"logits/chosen": -2.6378769874572754,
"logits/rejected": -2.6170475482940674,
"logps/chosen": -263.394287109375,
"logps/rejected": -263.47857666015625,
"loss": 0.6898,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.012888330034911633,
"rewards/margins": 0.007743468042463064,
"rewards/rejected": 0.005144862923771143,
"step": 30
},
{
"epoch": 0.09142857142857143,
"grad_norm": 5.043723997595032,
"learning_rate": 4.545454545454545e-07,
"logits/chosen": -2.648149013519287,
"logits/rejected": -2.5857646465301514,
"logps/chosen": -290.42108154296875,
"logps/rejected": -268.315185546875,
"loss": 0.6832,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.03576214984059334,
"rewards/margins": 0.042601048946380615,
"rewards/rejected": -0.006838902831077576,
"step": 40
},
{
"epoch": 0.11428571428571428,
"grad_norm": 7.751230449827184,
"learning_rate": 4.997124959943201e-07,
"logits/chosen": -2.675534725189209,
"logits/rejected": -2.5954620838165283,
"logps/chosen": -294.5313720703125,
"logps/rejected": -254.864013671875,
"loss": 0.6745,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.017789244651794434,
"rewards/margins": 0.09778620302677155,
"rewards/rejected": -0.07999695837497711,
"step": 50
},
{
"epoch": 0.11428571428571428,
"eval_logits/chosen": -2.537318468093872,
"eval_logits/rejected": -2.435882806777954,
"eval_logps/chosen": -278.02972412109375,
"eval_logps/rejected": -236.3332977294922,
"eval_loss": 0.6651818156242371,
"eval_rewards/accuracies": 0.6853448152542114,
"eval_rewards/chosen": -0.024257637560367584,
"eval_rewards/margins": 0.14831387996673584,
"eval_rewards/rejected": -0.17257152497768402,
"eval_runtime": 90.3676,
"eval_samples_per_second": 20.262,
"eval_steps_per_second": 0.321,
"step": 50
},
{
"epoch": 0.13714285714285715,
"grad_norm": 6.305194820143998,
"learning_rate": 4.979579212164186e-07,
"logits/chosen": -2.5774006843566895,
"logits/rejected": -2.4738996028900146,
"logps/chosen": -293.5943908691406,
"logps/rejected": -274.21575927734375,
"loss": 0.6582,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.13088412582874298,
"rewards/margins": 0.12575200200080872,
"rewards/rejected": -0.2566361129283905,
"step": 60
},
{
"epoch": 0.16,
"grad_norm": 7.496714062889331,
"learning_rate": 4.946196886175515e-07,
"logits/chosen": -2.6025655269622803,
"logits/rejected": -2.5487170219421387,
"logps/chosen": -289.283203125,
"logps/rejected": -295.7733459472656,
"loss": 0.6391,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.13504940271377563,
"rewards/margins": 0.22641614079475403,
"rewards/rejected": -0.36146557331085205,
"step": 70
},
{
"epoch": 0.18285714285714286,
"grad_norm": 9.182158909337614,
"learning_rate": 4.897191188239667e-07,
"logits/chosen": -2.635709047317505,
"logits/rejected": -2.6006340980529785,
"logps/chosen": -301.3983459472656,
"logps/rejected": -321.3275451660156,
"loss": 0.6321,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.3436935245990753,
"rewards/margins": 0.3164929151535034,
"rewards/rejected": -0.6601864099502563,
"step": 80
},
{
"epoch": 0.2057142857142857,
"grad_norm": 12.866627758013413,
"learning_rate": 4.832875107981763e-07,
"logits/chosen": -2.6577916145324707,
"logits/rejected": -2.6026382446289062,
"logps/chosen": -306.7483215332031,
"logps/rejected": -331.4389343261719,
"loss": 0.6272,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.30463024973869324,
"rewards/margins": 0.45427924394607544,
"rewards/rejected": -0.7589095830917358,
"step": 90
},
{
"epoch": 0.22857142857142856,
"grad_norm": 10.120425097316348,
"learning_rate": 4.753659419387223e-07,
"logits/chosen": -2.6471400260925293,
"logits/rejected": -2.5577075481414795,
"logps/chosen": -336.1173095703125,
"logps/rejected": -324.4599609375,
"loss": 0.6243,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.49730944633483887,
"rewards/margins": 0.41277560591697693,
"rewards/rejected": -0.9100850820541382,
"step": 100
},
{
"epoch": 0.22857142857142856,
"eval_logits/chosen": -2.496720790863037,
"eval_logits/rejected": -2.3884377479553223,
"eval_logps/chosen": -324.6850280761719,
"eval_logps/rejected": -321.733154296875,
"eval_loss": 0.6148081421852112,
"eval_rewards/accuracies": 0.7025862336158752,
"eval_rewards/chosen": -0.49081093072891235,
"eval_rewards/margins": 0.5357595682144165,
"eval_rewards/rejected": -1.0265703201293945,
"eval_runtime": 90.5627,
"eval_samples_per_second": 20.218,
"eval_steps_per_second": 0.32,
"step": 100
},
{
"epoch": 0.25142857142857145,
"grad_norm": 13.934592452644212,
"learning_rate": 4.660050057270191e-07,
"logits/chosen": -2.161785364151001,
"logits/rejected": -2.060573101043701,
"logps/chosen": -390.38104248046875,
"logps/rejected": -416.17913818359375,
"loss": 0.6043,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.7407739758491516,
"rewards/margins": 0.4419216215610504,
"rewards/rejected": -1.1826956272125244,
"step": 110
},
{
"epoch": 0.2742857142857143,
"grad_norm": 18.04191794745382,
"learning_rate": 4.5526448859687144e-07,
"logits/chosen": -0.9994190335273743,
"logits/rejected": -0.5663596391677856,
"logps/chosen": -400.4964904785156,
"logps/rejected": -379.3451232910156,
"loss": 0.5895,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8959201574325562,
"rewards/margins": 0.580389142036438,
"rewards/rejected": -1.4763094186782837,
"step": 120
},
{
"epoch": 0.29714285714285715,
"grad_norm": 23.569070805460544,
"learning_rate": 4.432129880904388e-07,
"logits/chosen": 0.06917886435985565,
"logits/rejected": 0.525992751121521,
"logps/chosen": -440.16839599609375,
"logps/rejected": -436.06341552734375,
"loss": 0.5759,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.350124716758728,
"rewards/margins": 0.505806565284729,
"rewards/rejected": -1.8559315204620361,
"step": 130
},
{
"epoch": 0.32,
"grad_norm": 17.135407563954015,
"learning_rate": 4.299274747394055e-07,
"logits/chosen": -0.5247443914413452,
"logits/rejected": -0.054386675357818604,
"logps/chosen": -396.3623352050781,
"logps/rejected": -422.71234130859375,
"loss": 0.5878,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.809146523475647,
"rewards/margins": 0.6988624930381775,
"rewards/rejected": -1.5080091953277588,
"step": 140
},
{
"epoch": 0.34285714285714286,
"grad_norm": 21.61040590906815,
"learning_rate": 4.1549280046953653e-07,
"logits/chosen": -0.23892001807689667,
"logits/rejected": 0.5097410082817078,
"logps/chosen": -382.4234924316406,
"logps/rejected": -453.66033935546875,
"loss": 0.5613,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9543758630752563,
"rewards/margins": 0.8192380666732788,
"rewards/rejected": -1.7736135721206665,
"step": 150
},
{
"epoch": 0.34285714285714286,
"eval_logits/chosen": 0.7831615805625916,
"eval_logits/rejected": 1.843852162361145,
"eval_logps/chosen": -431.0166015625,
"eval_logps/rejected": -483.92266845703125,
"eval_loss": 0.5827990770339966,
"eval_rewards/accuracies": 0.7241379022598267,
"eval_rewards/chosen": -1.5541267395019531,
"eval_rewards/margins": 1.0943388938903809,
"eval_rewards/rejected": -2.648465394973755,
"eval_runtime": 90.809,
"eval_samples_per_second": 20.163,
"eval_steps_per_second": 0.319,
"step": 150
},
{
"epoch": 0.3657142857142857,
"grad_norm": 21.503001868471035,
"learning_rate": 4.000011566683401e-07,
"logits/chosen": 0.6514826416969299,
"logits/rejected": 1.4523377418518066,
"logps/chosen": -458.98016357421875,
"logps/rejected": -519.3821411132812,
"loss": 0.5709,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5963990688323975,
"rewards/margins": 1.04099440574646,
"rewards/rejected": -2.6373934745788574,
"step": 160
},
{
"epoch": 0.38857142857142857,
"grad_norm": 22.124021346867316,
"learning_rate": 3.8355148537705047e-07,
"logits/chosen": 0.3275122046470642,
"logits/rejected": 1.0541610717773438,
"logps/chosen": -431.19708251953125,
"logps/rejected": -465.676025390625,
"loss": 0.5719,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2709215879440308,
"rewards/margins": 0.7285287976264954,
"rewards/rejected": -1.999450445175171,
"step": 170
},
{
"epoch": 0.4114285714285714,
"grad_norm": 27.446979249488912,
"learning_rate": 3.662488473675315e-07,
"logits/chosen": -0.1673279106616974,
"logits/rejected": 0.9503982663154602,
"logps/chosen": -435.6683654785156,
"logps/rejected": -467.34991455078125,
"loss": 0.5746,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.054885745048523,
"rewards/margins": 1.0191079378128052,
"rewards/rejected": -2.073993682861328,
"step": 180
},
{
"epoch": 0.4342857142857143,
"grad_norm": 21.261318648058893,
"learning_rate": 3.48203751140067e-07,
"logits/chosen": 0.8843255043029785,
"logits/rejected": 1.7267229557037354,
"logps/chosen": -421.56427001953125,
"logps/rejected": -454.6175842285156,
"loss": 0.5709,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4530667066574097,
"rewards/margins": 0.7481273412704468,
"rewards/rejected": -2.2011942863464355,
"step": 190
},
{
"epoch": 0.45714285714285713,
"grad_norm": 18.016130637147523,
"learning_rate": 3.2953144712759537e-07,
"logits/chosen": 0.8405307531356812,
"logits/rejected": 1.9338849782943726,
"logps/chosen": -396.523193359375,
"logps/rejected": -462.48724365234375,
"loss": 0.5618,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.2818048000335693,
"rewards/margins": 1.0542625188827515,
"rewards/rejected": -2.336066961288452,
"step": 200
},
{
"epoch": 0.45714285714285713,
"eval_logits/chosen": 1.353513240814209,
"eval_logits/rejected": 2.633384943008423,
"eval_logps/chosen": -407.1748962402344,
"eval_logps/rejected": -473.4366149902344,
"eval_loss": 0.5593964457511902,
"eval_rewards/accuracies": 0.7456896305084229,
"eval_rewards/chosen": -1.3157094717025757,
"eval_rewards/margins": 1.2278952598571777,
"eval_rewards/rejected": -2.543604850769043,
"eval_runtime": 90.6049,
"eval_samples_per_second": 20.209,
"eval_steps_per_second": 0.32,
"step": 200
},
{
"epoch": 0.48,
"grad_norm": 13.884449533300678,
"learning_rate": 3.103511916141658e-07,
"logits/chosen": 1.2088024616241455,
"logits/rejected": 2.1989169120788574,
"logps/chosen": -401.5423278808594,
"logps/rejected": -482.7976989746094,
"loss": 0.5543,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.380887746810913,
"rewards/margins": 0.9942899942398071,
"rewards/rejected": -2.3751778602600098,
"step": 210
},
{
"epoch": 0.5028571428571429,
"grad_norm": 22.105068043395047,
"learning_rate": 2.9078548506882117e-07,
"logits/chosen": 1.1457306146621704,
"logits/rejected": 2.113302707672119,
"logps/chosen": -436.2627868652344,
"logps/rejected": -482.525146484375,
"loss": 0.573,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.5119678974151611,
"rewards/margins": 0.8514491319656372,
"rewards/rejected": -2.363417387008667,
"step": 220
},
{
"epoch": 0.5257142857142857,
"grad_norm": 22.834936526556735,
"learning_rate": 2.709592897595191e-07,
"logits/chosen": 1.5853362083435059,
"logits/rejected": 2.6565065383911133,
"logps/chosen": -436.3871154785156,
"logps/rejected": -490.07977294921875,
"loss": 0.5539,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.553279995918274,
"rewards/margins": 0.9546509981155396,
"rewards/rejected": -2.5079312324523926,
"step": 230
},
{
"epoch": 0.5485714285714286,
"grad_norm": 24.160129408331898,
"learning_rate": 2.509992316440332e-07,
"logits/chosen": 1.073169469833374,
"logits/rejected": 2.0948684215545654,
"logps/chosen": -435.13104248046875,
"logps/rejected": -528.5726318359375,
"loss": 0.5519,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.3642125129699707,
"rewards/margins": 1.2072954177856445,
"rewards/rejected": -2.5715081691741943,
"step": 240
},
{
"epoch": 0.5714285714285714,
"grad_norm": 20.623735608734524,
"learning_rate": 2.3103279163519918e-07,
"logits/chosen": 1.123517394065857,
"logits/rejected": 1.916009545326233,
"logps/chosen": -412.7406311035156,
"logps/rejected": -501.85394287109375,
"loss": 0.558,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.3211438655853271,
"rewards/margins": 0.9953910112380981,
"rewards/rejected": -2.316534996032715,
"step": 250
},
{
"epoch": 0.5714285714285714,
"eval_logits/chosen": 1.5315722227096558,
"eval_logits/rejected": 3.1078054904937744,
"eval_logps/chosen": -408.7300720214844,
"eval_logps/rejected": -478.3948059082031,
"eval_loss": 0.5526223182678223,
"eval_rewards/accuracies": 0.7629310488700867,
"eval_rewards/chosen": -1.3312608003616333,
"eval_rewards/margins": 1.2619256973266602,
"eval_rewards/rejected": -2.593186378479004,
"eval_runtime": 90.8269,
"eval_samples_per_second": 20.159,
"eval_steps_per_second": 0.319,
"step": 250
},
{
"epoch": 0.5942857142857143,
"grad_norm": 18.937383972416836,
"learning_rate": 2.1118749140573358e-07,
"logits/chosen": 2.365237236022949,
"logits/rejected": 2.8070249557495117,
"logps/chosen": -467.107666015625,
"logps/rejected": -548.4678344726562,
"loss": 0.5559,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.8732095956802368,
"rewards/margins": 0.8780391812324524,
"rewards/rejected": -2.751248598098755,
"step": 260
},
{
"epoch": 0.6171428571428571,
"grad_norm": 21.474579443049855,
"learning_rate": 1.9159007893272703e-07,
"logits/chosen": 2.5897929668426514,
"logits/rejected": 3.8603546619415283,
"logps/chosen": -472.37420654296875,
"logps/rejected": -535.6754150390625,
"loss": 0.5468,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.9984840154647827,
"rewards/margins": 1.0531952381134033,
"rewards/rejected": -3.0516793727874756,
"step": 270
},
{
"epoch": 0.64,
"grad_norm": 26.380689829844055,
"learning_rate": 1.7236571898357766e-07,
"logits/chosen": 2.368163824081421,
"logits/rejected": 3.2206153869628906,
"logps/chosen": -457.65478515625,
"logps/rejected": -559.1119384765625,
"loss": 0.5472,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.8461668491363525,
"rewards/margins": 1.113884687423706,
"rewards/rejected": -2.9600515365600586,
"step": 280
},
{
"epoch": 0.6628571428571428,
"grad_norm": 21.888196354392417,
"learning_rate": 1.5363719371356882e-07,
"logits/chosen": 2.094879627227783,
"logits/rejected": 2.8882927894592285,
"logps/chosen": -477.3121643066406,
"logps/rejected": -541.1532592773438,
"loss": 0.5476,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.764062523841858,
"rewards/margins": 0.991985023021698,
"rewards/rejected": -2.7560477256774902,
"step": 290
},
{
"epoch": 0.6857142857142857,
"grad_norm": 16.876283045841564,
"learning_rate": 1.3552411848071565e-07,
"logits/chosen": 2.240018844604492,
"logits/rejected": 3.6286048889160156,
"logps/chosen": -489.7718200683594,
"logps/rejected": -560.1759033203125,
"loss": 0.5399,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8786265850067139,
"rewards/margins": 1.196711778640747,
"rewards/rejected": -3.075338363647461,
"step": 300
},
{
"epoch": 0.6857142857142857,
"eval_logits/chosen": 2.3192012310028076,
"eval_logits/rejected": 3.8099868297576904,
"eval_logps/chosen": -449.146484375,
"eval_logps/rejected": -527.6614990234375,
"eval_loss": 0.5464984178543091,
"eval_rewards/accuracies": 0.75,
"eval_rewards/chosen": -1.7354251146316528,
"eval_rewards/margins": 1.3504279851913452,
"eval_rewards/rejected": -3.0858535766601562,
"eval_runtime": 89.7327,
"eval_samples_per_second": 20.405,
"eval_steps_per_second": 0.323,
"step": 300
},
{
"epoch": 0.7085714285714285,
"grad_norm": 17.86356014365164,
"learning_rate": 1.1814217788631473e-07,
"logits/chosen": 2.34942626953125,
"logits/rejected": 3.1517879962921143,
"logps/chosen": -443.734375,
"logps/rejected": -516.211669921875,
"loss": 0.5491,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8743022680282593,
"rewards/margins": 0.9158605337142944,
"rewards/rejected": -2.790163040161133,
"step": 310
},
{
"epoch": 0.7314285714285714,
"grad_norm": 18.11439951327569,
"learning_rate": 1.0160238692045331e-07,
"logits/chosen": 2.115510940551758,
"logits/rejected": 2.9685635566711426,
"logps/chosen": -417.16412353515625,
"logps/rejected": -496.8562927246094,
"loss": 0.5474,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.728817343711853,
"rewards/margins": 0.8298628926277161,
"rewards/rejected": -2.558680295944214,
"step": 320
},
{
"epoch": 0.7542857142857143,
"grad_norm": 23.20251472261412,
"learning_rate": 8.601038193139438e-08,
"logits/chosen": 1.6642992496490479,
"logits/rejected": 2.858081579208374,
"logps/chosen": -455.21453857421875,
"logps/rejected": -516.3800048828125,
"loss": 0.5498,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.5599634647369385,
"rewards/margins": 1.1259580850601196,
"rewards/rejected": -2.6859214305877686,
"step": 330
},
{
"epoch": 0.7771428571428571,
"grad_norm": 17.97386136540539,
"learning_rate": 7.146574594727572e-08,
"logits/chosen": 2.353365659713745,
"logits/rejected": 3.1233067512512207,
"logps/chosen": -446.12493896484375,
"logps/rejected": -536.3157958984375,
"loss": 0.5373,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.801740050315857,
"rewards/margins": 1.141181230545044,
"rewards/rejected": -2.9429211616516113,
"step": 340
},
{
"epoch": 0.8,
"grad_norm": 21.871959889194457,
"learning_rate": 5.8061372659157306e-08,
"logits/chosen": 2.052710771560669,
"logits/rejected": 3.2841498851776123,
"logps/chosen": -481.0086975097656,
"logps/rejected": -538.4657592773438,
"loss": 0.5536,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.9130144119262695,
"rewards/margins": 0.9286670684814453,
"rewards/rejected": -2.8416812419891357,
"step": 350
},
{
"epoch": 0.8,
"eval_logits/chosen": 2.5496232509613037,
"eval_logits/rejected": 4.100791931152344,
"eval_logps/chosen": -459.9457702636719,
"eval_logps/rejected": -537.3768310546875,
"eval_loss": 0.5457741618156433,
"eval_rewards/accuracies": 0.767241358757019,
"eval_rewards/chosen": -1.8434182405471802,
"eval_rewards/margins": 1.3395885229110718,
"eval_rewards/rejected": -3.183006525039673,
"eval_runtime": 90.4314,
"eval_samples_per_second": 20.247,
"eval_steps_per_second": 0.321,
"step": 350
},
{
"epoch": 0.8228571428571428,
"grad_norm": 18.303747174951123,
"learning_rate": 4.5882873127531614e-08,
"logits/chosen": 2.120243549346924,
"logits/rejected": 3.347181797027588,
"logps/chosen": -475.18572998046875,
"logps/rejected": -556.6296997070312,
"loss": 0.5378,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.8978074789047241,
"rewards/margins": 1.1576240062713623,
"rewards/rejected": -3.055431604385376,
"step": 360
},
{
"epoch": 0.8457142857142858,
"grad_norm": 18.37433316843813,
"learning_rate": 3.500802900154412e-08,
"logits/chosen": 2.2264809608459473,
"logits/rejected": 3.606698513031006,
"logps/chosen": -450.2330627441406,
"logps/rejected": -542.2530517578125,
"loss": 0.5435,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.7917168140411377,
"rewards/margins": 1.2466588020324707,
"rewards/rejected": -3.0383753776550293,
"step": 370
},
{
"epoch": 0.8685714285714285,
"grad_norm": 21.62972776738107,
"learning_rate": 2.550629574310309e-08,
"logits/chosen": 1.8507616519927979,
"logits/rejected": 3.276202440261841,
"logps/chosen": -521.0817260742188,
"logps/rejected": -551.79736328125,
"loss": 0.543,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0302536487579346,
"rewards/margins": 0.913725733757019,
"rewards/rejected": -2.943979501724243,
"step": 380
},
{
"epoch": 0.8914285714285715,
"grad_norm": 21.37326667405599,
"learning_rate": 1.7438359028687983e-08,
"logits/chosen": 2.1451709270477295,
"logits/rejected": 2.937721014022827,
"logps/chosen": -487.7579040527344,
"logps/rejected": -573.7957763671875,
"loss": 0.5485,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7638496160507202,
"rewards/margins": 1.0263822078704834,
"rewards/rejected": -2.7902321815490723,
"step": 390
},
{
"epoch": 0.9142857142857143,
"grad_norm": 36.14381504781887,
"learning_rate": 1.0855747162029361e-08,
"logits/chosen": 2.2515556812286377,
"logits/rejected": 2.859483480453491,
"logps/chosen": -467.8270568847656,
"logps/rejected": -549.77392578125,
"loss": 0.5612,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.89093816280365,
"rewards/margins": 0.9410650134086609,
"rewards/rejected": -2.832003116607666,
"step": 400
},
{
"epoch": 0.9142857142857143,
"eval_logits/chosen": 2.319566249847412,
"eval_logits/rejected": 3.925558090209961,
"eval_logps/chosen": -443.83612060546875,
"eval_logps/rejected": -527.0015258789062,
"eval_loss": 0.5425659418106079,
"eval_rewards/accuracies": 0.7543103694915771,
"eval_rewards/chosen": -1.6823216676712036,
"eval_rewards/margins": 1.3969323635101318,
"eval_rewards/rejected": -3.079254150390625,
"eval_runtime": 90.5136,
"eval_samples_per_second": 20.229,
"eval_steps_per_second": 0.32,
"step": 400
},
{
"epoch": 0.9371428571428572,
"grad_norm": 20.661658798051754,
"learning_rate": 5.8005019731033615e-09,
"logits/chosen": 2.158614158630371,
"logits/rejected": 3.273141384124756,
"logps/chosen": -486.8451232910156,
"logps/rejected": -554.2020263671875,
"loss": 0.5334,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.9866514205932617,
"rewards/margins": 0.9580680727958679,
"rewards/rejected": -2.9447195529937744,
"step": 410
},
{
"epoch": 0.96,
"grad_norm": 20.631088560363143,
"learning_rate": 2.3049103053431886e-09,
"logits/chosen": 2.0465073585510254,
"logits/rejected": 3.5103249549865723,
"logps/chosen": -440.74224853515625,
"logps/rejected": -527.11376953125,
"loss": 0.5445,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5799415111541748,
"rewards/margins": 1.351327657699585,
"rewards/rejected": -2.9312691688537598,
"step": 420
},
{
"epoch": 0.9828571428571429,
"grad_norm": 21.077395280178123,
"learning_rate": 3.9129780600541397e-10,
"logits/chosen": 2.380171298980713,
"logits/rejected": 3.3738162517547607,
"logps/chosen": -453.9419860839844,
"logps/rejected": -550.107421875,
"loss": 0.5377,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.7171010971069336,
"rewards/margins": 1.1472762823104858,
"rewards/rejected": -2.864377498626709,
"step": 430
},
{
"epoch": 0.9988571428571429,
"step": 437,
"total_flos": 0.0,
"train_loss": 0.5805802214336614,
"train_runtime": 11351.5205,
"train_samples_per_second": 4.933,
"train_steps_per_second": 0.038
}
],
"logging_steps": 10,
"max_steps": 437,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}