sfulay's picture
Model save
98f260a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02092050209205021,
"grad_norm": 6.4084427221095295,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.630444049835205,
"logits/rejected": -2.576719045639038,
"logps/chosen": -288.65911865234375,
"logps/rejected": -275.90252685546875,
"loss": 0.6931,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": 0.00032657841802574694,
"rewards/margins": 0.0008425033884122968,
"rewards/rejected": -0.0005159247666597366,
"step": 10
},
{
"epoch": 0.04184100418410042,
"grad_norm": 5.7047484713043755,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.644824981689453,
"logits/rejected": -2.6137185096740723,
"logps/chosen": -293.5597839355469,
"logps/rejected": -259.2336730957031,
"loss": 0.6926,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.002616675803437829,
"rewards/margins": 0.001477306941524148,
"rewards/rejected": 0.0011393685126677155,
"step": 20
},
{
"epoch": 0.06276150627615062,
"grad_norm": 5.335002677652247,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.666045665740967,
"logits/rejected": -2.5890631675720215,
"logps/chosen": -294.64007568359375,
"logps/rejected": -287.18695068359375,
"loss": 0.6901,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.015856895595788956,
"rewards/margins": 0.008159220218658447,
"rewards/rejected": 0.0076976739801466465,
"step": 30
},
{
"epoch": 0.08368200836820083,
"grad_norm": 5.361991577232885,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.6365890502929688,
"logits/rejected": -2.5537407398223877,
"logps/chosen": -270.41375732421875,
"logps/rejected": -240.17959594726562,
"loss": 0.6825,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.03482341766357422,
"rewards/margins": 0.02616509422659874,
"rewards/rejected": 0.008658323436975479,
"step": 40
},
{
"epoch": 0.10460251046025104,
"grad_norm": 6.225003725839748,
"learning_rate": 4.999733114418725e-07,
"logits/chosen": -2.5800583362579346,
"logits/rejected": -2.571406364440918,
"logps/chosen": -264.10205078125,
"logps/rejected": -246.74868774414062,
"loss": 0.6687,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.013260206207633018,
"rewards/margins": 0.07922474294900894,
"rewards/rejected": -0.06596453487873077,
"step": 50
},
{
"epoch": 0.10460251046025104,
"eval_logits/chosen": -2.615793228149414,
"eval_logits/rejected": -2.5782690048217773,
"eval_logps/chosen": -260.0484313964844,
"eval_logps/rejected": -271.3634033203125,
"eval_loss": 0.649348258972168,
"eval_rewards/accuracies": 0.70703125,
"eval_rewards/chosen": 0.025814848020672798,
"eval_rewards/margins": 0.11282772570848465,
"eval_rewards/rejected": -0.0870128720998764,
"eval_runtime": 103.335,
"eval_samples_per_second": 19.355,
"eval_steps_per_second": 0.31,
"step": 50
},
{
"epoch": 0.12552301255230125,
"grad_norm": 9.898033994957384,
"learning_rate": 4.990398100856366e-07,
"logits/chosen": -2.543462038040161,
"logits/rejected": -2.50410795211792,
"logps/chosen": -268.82574462890625,
"logps/rejected": -262.0675964355469,
"loss": 0.6502,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.010042434558272362,
"rewards/margins": 0.14874781668186188,
"rewards/rejected": -0.15879027545452118,
"step": 60
},
{
"epoch": 0.14644351464435146,
"grad_norm": 9.743629526270734,
"learning_rate": 4.967775735898179e-07,
"logits/chosen": -2.6071839332580566,
"logits/rejected": -2.5465664863586426,
"logps/chosen": -304.88092041015625,
"logps/rejected": -310.2099609375,
"loss": 0.6315,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.198007270693779,
"rewards/margins": 0.2037646323442459,
"rewards/rejected": -0.4017719328403473,
"step": 70
},
{
"epoch": 0.16736401673640167,
"grad_norm": 15.248138814448389,
"learning_rate": 4.931986719649298e-07,
"logits/chosen": -1.8189195394515991,
"logits/rejected": -1.8181097507476807,
"logps/chosen": -303.1712951660156,
"logps/rejected": -337.1569519042969,
"loss": 0.6003,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.4402276873588562,
"rewards/margins": 0.30723443627357483,
"rewards/rejected": -0.7474621534347534,
"step": 80
},
{
"epoch": 0.18828451882845187,
"grad_norm": 14.823998474977419,
"learning_rate": 4.883222001996351e-07,
"logits/chosen": -0.9793803095817566,
"logits/rejected": -0.8350278735160828,
"logps/chosen": -324.31463623046875,
"logps/rejected": -369.8716735839844,
"loss": 0.5849,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5321702361106873,
"rewards/margins": 0.5584505796432495,
"rewards/rejected": -1.090620756149292,
"step": 90
},
{
"epoch": 0.20920502092050208,
"grad_norm": 22.816637306761073,
"learning_rate": 4.821741763807186e-07,
"logits/chosen": -0.4131926894187927,
"logits/rejected": 0.20026779174804688,
"logps/chosen": -381.84002685546875,
"logps/rejected": -387.60943603515625,
"loss": 0.5614,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8785870671272278,
"rewards/margins": 0.7347825765609741,
"rewards/rejected": -1.6133695840835571,
"step": 100
},
{
"epoch": 0.20920502092050208,
"eval_logits/chosen": -0.5290488004684448,
"eval_logits/rejected": -0.22265684604644775,
"eval_logps/chosen": -343.2120361328125,
"eval_logps/rejected": -421.86474609375,
"eval_loss": 0.5806925892829895,
"eval_rewards/accuracies": 0.7109375,
"eval_rewards/chosen": -0.8058211803436279,
"eval_rewards/margins": 0.7862052321434021,
"eval_rewards/rejected": -1.5920264720916748,
"eval_runtime": 104.2044,
"eval_samples_per_second": 19.193,
"eval_steps_per_second": 0.307,
"step": 100
},
{
"epoch": 0.2301255230125523,
"grad_norm": 19.02164952454875,
"learning_rate": 4.747874028753375e-07,
"logits/chosen": -0.43088096380233765,
"logits/rejected": -0.01379423774778843,
"logps/chosen": -391.91552734375,
"logps/rejected": -446.66961669921875,
"loss": 0.5589,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9717921018600464,
"rewards/margins": 0.7386372089385986,
"rewards/rejected": -1.7104294300079346,
"step": 110
},
{
"epoch": 0.2510460251046025,
"grad_norm": 23.262987016748436,
"learning_rate": 4.662012913161997e-07,
"logits/chosen": 0.030251333490014076,
"logits/rejected": 0.5855879187583923,
"logps/chosen": -378.17999267578125,
"logps/rejected": -441.85406494140625,
"loss": 0.5556,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0160324573516846,
"rewards/margins": 1.018049955368042,
"rewards/rejected": -2.0340826511383057,
"step": 120
},
{
"epoch": 0.2719665271966527,
"grad_norm": 21.032912122052533,
"learning_rate": 4.5646165232345103e-07,
"logits/chosen": -0.32387033104896545,
"logits/rejected": 0.246691033244133,
"logps/chosen": -401.41961669921875,
"logps/rejected": -456.65216064453125,
"loss": 0.542,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.039526343345642,
"rewards/margins": 0.8820648193359375,
"rewards/rejected": -1.9215911626815796,
"step": 130
},
{
"epoch": 0.2928870292887029,
"grad_norm": 18.49329811567727,
"learning_rate": 4.456204510851956e-07,
"logits/chosen": -0.7103713154792786,
"logits/rejected": -0.07352075725793839,
"logps/chosen": -398.73785400390625,
"logps/rejected": -456.5108337402344,
"loss": 0.5449,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.9843541979789734,
"rewards/margins": 0.8879961967468262,
"rewards/rejected": -1.8723503351211548,
"step": 140
},
{
"epoch": 0.3138075313807531,
"grad_norm": 22.991798597203633,
"learning_rate": 4.337355301007335e-07,
"logits/chosen": -0.8071734309196472,
"logits/rejected": 0.08783279359340668,
"logps/chosen": -380.7442321777344,
"logps/rejected": -434.64959716796875,
"loss": 0.5419,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.8969854116439819,
"rewards/margins": 0.7560001611709595,
"rewards/rejected": -1.6529855728149414,
"step": 150
},
{
"epoch": 0.3138075313807531,
"eval_logits/chosen": -0.0013768002390861511,
"eval_logits/rejected": 0.6415377259254456,
"eval_logps/chosen": -367.39569091796875,
"eval_logps/rejected": -469.2164611816406,
"eval_loss": 0.5584754347801208,
"eval_rewards/accuracies": 0.74609375,
"eval_rewards/chosen": -1.0476573705673218,
"eval_rewards/margins": 1.0178861618041992,
"eval_rewards/rejected": -2.0655436515808105,
"eval_runtime": 102.1901,
"eval_samples_per_second": 19.571,
"eval_steps_per_second": 0.313,
"step": 150
},
{
"epoch": 0.33472803347280333,
"grad_norm": 17.28842262569855,
"learning_rate": 4.2087030056579986e-07,
"logits/chosen": 0.42611104249954224,
"logits/rejected": 1.240443468093872,
"logps/chosen": -371.35699462890625,
"logps/rejected": -441.8954162597656,
"loss": 0.5371,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.0083401203155518,
"rewards/margins": 1.0140050649642944,
"rewards/rejected": -2.0223450660705566,
"step": 160
},
{
"epoch": 0.35564853556485354,
"grad_norm": 23.711859022455013,
"learning_rate": 4.070934040463998e-07,
"logits/chosen": 0.9712627530097961,
"logits/rejected": 1.8879001140594482,
"logps/chosen": -393.61029052734375,
"logps/rejected": -474.1710510253906,
"loss": 0.5201,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.2388116121292114,
"rewards/margins": 0.9170275926589966,
"rewards/rejected": -2.155838966369629,
"step": 170
},
{
"epoch": 0.37656903765690375,
"grad_norm": 20.4986556938231,
"learning_rate": 3.9247834624635404e-07,
"logits/chosen": 1.3909448385238647,
"logits/rejected": 2.293593406677246,
"logps/chosen": -461.04046630859375,
"logps/rejected": -531.3743286132812,
"loss": 0.534,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.5673408508300781,
"rewards/margins": 1.0537548065185547,
"rewards/rejected": -2.621096134185791,
"step": 180
},
{
"epoch": 0.39748953974895396,
"grad_norm": 29.28443173248426,
"learning_rate": 3.7710310482256523e-07,
"logits/chosen": 0.8631278276443481,
"logits/rejected": 1.6216942071914673,
"logps/chosen": -393.5219421386719,
"logps/rejected": -465.5166015625,
"loss": 0.5181,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.397423505783081,
"rewards/margins": 0.8221076726913452,
"rewards/rejected": -2.2195310592651367,
"step": 190
},
{
"epoch": 0.41841004184100417,
"grad_norm": 24.61839368474693,
"learning_rate": 3.610497133404795e-07,
"logits/chosen": 1.1675374507904053,
"logits/rejected": 2.012094020843506,
"logps/chosen": -408.1467590332031,
"logps/rejected": -462.46875,
"loss": 0.526,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.4781768321990967,
"rewards/margins": 0.8430646657943726,
"rewards/rejected": -2.3212413787841797,
"step": 200
},
{
"epoch": 0.41841004184100417,
"eval_logits/chosen": 0.9801958799362183,
"eval_logits/rejected": 1.7427237033843994,
"eval_logps/chosen": -402.51995849609375,
"eval_logps/rejected": -517.015625,
"eval_loss": 0.5562114715576172,
"eval_rewards/accuracies": 0.76171875,
"eval_rewards/chosen": -1.3989005088806152,
"eval_rewards/margins": 1.1446349620819092,
"eval_rewards/rejected": -2.5435354709625244,
"eval_runtime": 106.3821,
"eval_samples_per_second": 18.8,
"eval_steps_per_second": 0.301,
"step": 200
},
{
"epoch": 0.4393305439330544,
"grad_norm": 24.016781344152104,
"learning_rate": 3.4440382358952115e-07,
"logits/chosen": 1.11398446559906,
"logits/rejected": 2.043394088745117,
"logps/chosen": -427.09759521484375,
"logps/rejected": -499.5668029785156,
"loss": 0.5271,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.5520648956298828,
"rewards/margins": 0.8630266189575195,
"rewards/rejected": -2.4150915145874023,
"step": 210
},
{
"epoch": 0.4602510460251046,
"grad_norm": 19.623224068729442,
"learning_rate": 3.272542485937368e-07,
"logits/chosen": 0.09422092139720917,
"logits/rejected": 0.9946798086166382,
"logps/chosen": -387.312255859375,
"logps/rejected": -478.73699951171875,
"loss": 0.5347,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0305876731872559,
"rewards/margins": 1.044886827468872,
"rewards/rejected": -2.075474262237549,
"step": 220
},
{
"epoch": 0.4811715481171548,
"grad_norm": 20.279694061078374,
"learning_rate": 3.096924887558854e-07,
"logits/chosen": 0.5403207540512085,
"logits/rejected": 1.6011781692504883,
"logps/chosen": -414.5859375,
"logps/rejected": -522.0372314453125,
"loss": 0.4963,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.325188398361206,
"rewards/margins": 1.3052270412445068,
"rewards/rejected": -2.630415439605713,
"step": 230
},
{
"epoch": 0.502092050209205,
"grad_norm": 17.600900176176182,
"learning_rate": 2.9181224366319943e-07,
"logits/chosen": 1.2316529750823975,
"logits/rejected": 2.2336225509643555,
"logps/chosen": -405.16815185546875,
"logps/rejected": -461.0508728027344,
"loss": 0.5208,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.505174994468689,
"rewards/margins": 0.9252998232841492,
"rewards/rejected": -2.4304747581481934,
"step": 240
},
{
"epoch": 0.5230125523012552,
"grad_norm": 21.388657517716165,
"learning_rate": 2.7370891215954565e-07,
"logits/chosen": 0.33056747913360596,
"logits/rejected": 1.398674726486206,
"logps/chosen": -425.12738037109375,
"logps/rejected": -477.9986267089844,
"loss": 0.5202,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.3082549571990967,
"rewards/margins": 1.007277011871338,
"rewards/rejected": -2.3155319690704346,
"step": 250
},
{
"epoch": 0.5230125523012552,
"eval_logits/chosen": 0.6489181518554688,
"eval_logits/rejected": 1.4379549026489258,
"eval_logps/chosen": -376.8782653808594,
"eval_logps/rejected": -495.4537353515625,
"eval_loss": 0.5419028401374817,
"eval_rewards/accuracies": 0.7890625,
"eval_rewards/chosen": -1.142483115196228,
"eval_rewards/margins": 1.185433030128479,
"eval_rewards/rejected": -2.327916383743286,
"eval_runtime": 104.2974,
"eval_samples_per_second": 19.176,
"eval_steps_per_second": 0.307,
"step": 250
},
{
"epoch": 0.5439330543933054,
"grad_norm": 23.92722986686011,
"learning_rate": 2.55479083351317e-07,
"logits/chosen": 1.0843138694763184,
"logits/rejected": 2.0778467655181885,
"logps/chosen": -430.15911865234375,
"logps/rejected": -504.9864807128906,
"loss": 0.5125,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1564133167266846,
"rewards/margins": 1.2437217235565186,
"rewards/rejected": -2.4001352787017822,
"step": 260
},
{
"epoch": 0.5648535564853556,
"grad_norm": 21.06560694930431,
"learning_rate": 2.3722002126275822e-07,
"logits/chosen": 1.3407487869262695,
"logits/rejected": 2.4112818241119385,
"logps/chosen": -432.30023193359375,
"logps/rejected": -504.80670166015625,
"loss": 0.5096,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.3888903856277466,
"rewards/margins": 1.141367793083191,
"rewards/rejected": -2.5302579402923584,
"step": 270
},
{
"epoch": 0.5857740585774058,
"grad_norm": 21.8560737042057,
"learning_rate": 2.19029145890313e-07,
"logits/chosen": 1.2237210273742676,
"logits/rejected": 2.0341880321502686,
"logps/chosen": -389.9634704589844,
"logps/rejected": -492.298828125,
"loss": 0.5114,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.3260904550552368,
"rewards/margins": 1.0742130279541016,
"rewards/rejected": -2.400303363800049,
"step": 280
},
{
"epoch": 0.606694560669456,
"grad_norm": 23.993372955897243,
"learning_rate": 2.0100351342479216e-07,
"logits/chosen": 1.5786670446395874,
"logits/rejected": 2.3115456104278564,
"logps/chosen": -447.48382568359375,
"logps/rejected": -546.939697265625,
"loss": 0.513,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5614277124404907,
"rewards/margins": 1.1185188293457031,
"rewards/rejected": -2.6799466609954834,
"step": 290
},
{
"epoch": 0.6276150627615062,
"grad_norm": 22.907667503889392,
"learning_rate": 1.8323929841460178e-07,
"logits/chosen": 2.0128085613250732,
"logits/rejected": 2.6608686447143555,
"logps/chosen": -434.07830810546875,
"logps/rejected": -551.2879028320312,
"loss": 0.5054,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.6475236415863037,
"rewards/margins": 1.0819259881973267,
"rewards/rejected": -2.72944974899292,
"step": 300
},
{
"epoch": 0.6276150627615062,
"eval_logits/chosen": 1.477131962776184,
"eval_logits/rejected": 2.255997657775879,
"eval_logps/chosen": -402.4423522949219,
"eval_logps/rejected": -531.4893798828125,
"eval_loss": 0.5449927449226379,
"eval_rewards/accuracies": 0.77734375,
"eval_rewards/chosen": -1.3981244564056396,
"eval_rewards/margins": 1.290148377418518,
"eval_rewards/rejected": -2.688272714614868,
"eval_runtime": 103.6153,
"eval_samples_per_second": 19.302,
"eval_steps_per_second": 0.309,
"step": 300
},
{
"epoch": 0.6485355648535565,
"grad_norm": 20.97030138169365,
"learning_rate": 1.6583128063291573e-07,
"logits/chosen": 1.2844688892364502,
"logits/rejected": 2.06947660446167,
"logps/chosen": -437.86358642578125,
"logps/rejected": -538.4571533203125,
"loss": 0.5145,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4390218257904053,
"rewards/margins": 1.1747257709503174,
"rewards/rejected": -2.6137475967407227,
"step": 310
},
{
"epoch": 0.6694560669456067,
"grad_norm": 23.361486108617427,
"learning_rate": 1.488723393865766e-07,
"logits/chosen": 1.0614537000656128,
"logits/rejected": 2.0729637145996094,
"logps/chosen": -453.379150390625,
"logps/rejected": -521.8544311523438,
"loss": 0.4923,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.6114938259124756,
"rewards/margins": 1.0873934030532837,
"rewards/rejected": -2.6988871097564697,
"step": 320
},
{
"epoch": 0.6903765690376569,
"grad_norm": 24.156654265698716,
"learning_rate": 1.3245295796480788e-07,
"logits/chosen": 1.4917339086532593,
"logits/rejected": 2.5346505641937256,
"logps/chosen": -463.26202392578125,
"logps/rejected": -520.6095581054688,
"loss": 0.5059,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7963454723358154,
"rewards/margins": 0.9912670254707336,
"rewards/rejected": -2.7876124382019043,
"step": 330
},
{
"epoch": 0.7112970711297071,
"grad_norm": 19.422370738594758,
"learning_rate": 1.1666074087171627e-07,
"logits/chosen": 1.7316112518310547,
"logits/rejected": 2.561483144760132,
"logps/chosen": -413.0133361816406,
"logps/rejected": -536.4874877929688,
"loss": 0.5184,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.6657421588897705,
"rewards/margins": 1.103548288345337,
"rewards/rejected": -2.7692904472351074,
"step": 340
},
{
"epoch": 0.7322175732217573,
"grad_norm": 19.236548541895125,
"learning_rate": 1.0157994641835734e-07,
"logits/chosen": 1.52297842502594,
"logits/rejected": 2.436295986175537,
"logps/chosen": -450.27239990234375,
"logps/rejected": -541.5868530273438,
"loss": 0.497,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.680456519126892,
"rewards/margins": 1.0492502450942993,
"rewards/rejected": -2.7297065258026123,
"step": 350
},
{
"epoch": 0.7322175732217573,
"eval_logits/chosen": 1.3703731298446655,
"eval_logits/rejected": 2.225933074951172,
"eval_logps/chosen": -422.6754455566406,
"eval_logps/rejected": -549.4119873046875,
"eval_loss": 0.5301805138587952,
"eval_rewards/accuracies": 0.7734375,
"eval_rewards/chosen": -1.6004550457000732,
"eval_rewards/margins": 1.2670434713363647,
"eval_rewards/rejected": -2.8674986362457275,
"eval_runtime": 102.8183,
"eval_samples_per_second": 19.452,
"eval_steps_per_second": 0.311,
"step": 350
},
{
"epoch": 0.7531380753138075,
"grad_norm": 19.689504951232507,
"learning_rate": 8.729103716819111e-08,
"logits/chosen": 1.695031762123108,
"logits/rejected": 2.4951744079589844,
"logps/chosen": -429.60675048828125,
"logps/rejected": -540.1343383789062,
"loss": 0.5127,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8274829387664795,
"rewards/margins": 1.1173667907714844,
"rewards/rejected": -2.9448494911193848,
"step": 360
},
{
"epoch": 0.7740585774058577,
"grad_norm": 23.998324017815545,
"learning_rate": 7.387025063449081e-08,
"logits/chosen": 1.3235762119293213,
"logits/rejected": 2.6002328395843506,
"logps/chosen": -454.5791015625,
"logps/rejected": -572.55078125,
"loss": 0.495,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.667346715927124,
"rewards/margins": 1.3339135646820068,
"rewards/rejected": -3.001260280609131,
"step": 370
},
{
"epoch": 0.7949790794979079,
"grad_norm": 20.911379608221466,
"learning_rate": 6.138919252022435e-08,
"logits/chosen": 1.435396432876587,
"logits/rejected": 2.2644729614257812,
"logps/chosen": -481.06378173828125,
"logps/rejected": -542.4463500976562,
"loss": 0.5119,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6459989547729492,
"rewards/margins": 0.9545661211013794,
"rewards/rejected": -2.600564956665039,
"step": 380
},
{
"epoch": 0.8158995815899581,
"grad_norm": 33.66392923940902,
"learning_rate": 4.991445467064689e-08,
"logits/chosen": 1.5889087915420532,
"logits/rejected": 2.5967953205108643,
"logps/chosen": -469.99884033203125,
"logps/rejected": -584.1378784179688,
"loss": 0.5084,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.6343872547149658,
"rewards/margins": 1.305241346359253,
"rewards/rejected": -2.9396286010742188,
"step": 390
},
{
"epoch": 0.8368200836820083,
"grad_norm": 21.820601306046125,
"learning_rate": 3.9507259776993954e-08,
"logits/chosen": 1.4544366598129272,
"logits/rejected": 2.2073497772216797,
"logps/chosen": -470.62188720703125,
"logps/rejected": -558.2474365234375,
"loss": 0.5076,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.766871452331543,
"rewards/margins": 1.029807209968567,
"rewards/rejected": -2.796678304672241,
"step": 400
},
{
"epoch": 0.8368200836820083,
"eval_logits/chosen": 1.3332302570343018,
"eval_logits/rejected": 2.278512477874756,
"eval_logps/chosen": -423.95953369140625,
"eval_logps/rejected": -558.9130859375,
"eval_loss": 0.5348207950592041,
"eval_rewards/accuracies": 0.7890625,
"eval_rewards/chosen": -1.6132957935333252,
"eval_rewards/margins": 1.3492140769958496,
"eval_rewards/rejected": -2.962510108947754,
"eval_runtime": 105.1303,
"eval_samples_per_second": 19.024,
"eval_steps_per_second": 0.304,
"step": 400
},
{
"epoch": 0.8577405857740585,
"grad_norm": 23.0331791932855,
"learning_rate": 3.022313472693447e-08,
"logits/chosen": 1.0064998865127563,
"logits/rejected": 1.9991016387939453,
"logps/chosen": -479.11846923828125,
"logps/rejected": -583.687255859375,
"loss": 0.4977,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.6211599111557007,
"rewards/margins": 1.2330563068389893,
"rewards/rejected": -2.8542163372039795,
"step": 410
},
{
"epoch": 0.8786610878661087,
"grad_norm": 25.09829796115416,
"learning_rate": 2.2111614344599684e-08,
"logits/chosen": 1.3670974969863892,
"logits/rejected": 2.6799685955047607,
"logps/chosen": -465.97674560546875,
"logps/rejected": -562.8436279296875,
"loss": 0.4821,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.6978868246078491,
"rewards/margins": 1.2206120491027832,
"rewards/rejected": -2.9184985160827637,
"step": 420
},
{
"epoch": 0.899581589958159,
"grad_norm": 21.301017285621448,
"learning_rate": 1.521597710086439e-08,
"logits/chosen": 1.232881784439087,
"logits/rejected": 2.3901188373565674,
"logps/chosen": -466.0107421875,
"logps/rejected": -540.5730590820312,
"loss": 0.5114,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.70029616355896,
"rewards/margins": 1.1120824813842773,
"rewards/rejected": -2.8123791217803955,
"step": 430
},
{
"epoch": 0.9205020920502092,
"grad_norm": 17.114528155184686,
"learning_rate": 9.57301420397924e-09,
"logits/chosen": 1.5404767990112305,
"logits/rejected": 2.9349493980407715,
"logps/chosen": -467.8946228027344,
"logps/rejected": -551.1821899414062,
"loss": 0.5065,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.856208086013794,
"rewards/margins": 1.1952415704727173,
"rewards/rejected": -3.051449775695801,
"step": 440
},
{
"epoch": 0.9414225941422594,
"grad_norm": 19.50832169815014,
"learning_rate": 5.212833302556258e-09,
"logits/chosen": 1.3590748310089111,
"logits/rejected": 2.3424363136291504,
"logps/chosen": -451.4662170410156,
"logps/rejected": -575.4518432617188,
"loss": 0.5092,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.667201042175293,
"rewards/margins": 1.312021017074585,
"rewards/rejected": -2.979222059249878,
"step": 450
},
{
"epoch": 0.9414225941422594,
"eval_logits/chosen": 1.3857550621032715,
"eval_logits/rejected": 2.3444478511810303,
"eval_logps/chosen": -429.63800048828125,
"eval_logps/rejected": -565.6296997070312,
"eval_loss": 0.5340853333473206,
"eval_rewards/accuracies": 0.78515625,
"eval_rewards/chosen": -1.6700804233551025,
"eval_rewards/margins": 1.3595958948135376,
"eval_rewards/rejected": -3.0296761989593506,
"eval_runtime": 102.8547,
"eval_samples_per_second": 19.445,
"eval_steps_per_second": 0.311,
"step": 450
},
{
"epoch": 0.9623430962343096,
"grad_norm": 18.50816041515539,
"learning_rate": 2.158697848236607e-09,
"logits/chosen": 1.729901909828186,
"logits/rejected": 2.7589685916900635,
"logps/chosen": -457.56207275390625,
"logps/rejected": -565.1722412109375,
"loss": 0.4978,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.942845106124878,
"rewards/margins": 1.1519941091537476,
"rewards/rejected": -3.094839334487915,
"step": 460
},
{
"epoch": 0.9832635983263598,
"grad_norm": 18.70014449283038,
"learning_rate": 4.269029751107489e-10,
"logits/chosen": 1.6632955074310303,
"logits/rejected": 2.187119960784912,
"logps/chosen": -438.1412048339844,
"logps/rejected": -556.7476806640625,
"loss": 0.5241,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.7501564025878906,
"rewards/margins": 1.1538572311401367,
"rewards/rejected": -2.9040138721466064,
"step": 470
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 0.0,
"train_loss": 0.5431942655451627,
"train_runtime": 12655.0807,
"train_samples_per_second": 4.831,
"train_steps_per_second": 0.038
}
],
"logging_steps": 10,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}