diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3043 @@ +{ + "best_metric": 0.42871206998825073, + "best_model_checkpoint": "./results2/checkpoint-3500", + "epoch": 0.9731007159240982, + "eval_steps": 500, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005004517967609648, + "grad_norm": 71.42864990234375, + "learning_rate": 5.999926320167437e-05, + "logits/chosen": -2.8284192085266113, + "logits/rejected": -2.5609850883483887, + "logps/chosen": -327.4080505371094, + "logps/rejected": -261.88433837890625, + "loss": 0.6274, + "rewards/accuracies": 0.5416666865348816, + "rewards/chosen": 0.14137662947177887, + "rewards/margins": 0.22425617277622223, + "rewards/rejected": -0.08287954330444336, + "step": 18 + }, + { + "epoch": 0.010009035935219296, + "grad_norm": 65.42167663574219, + "learning_rate": 5.99922178723054e-05, + "logits/chosen": -2.915534019470215, + "logits/rejected": -2.618929862976074, + "logps/chosen": -271.7569580078125, + "logps/rejected": -217.84347534179688, + "loss": 0.5222, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": 0.280923992395401, + "rewards/margins": 1.0729061365127563, + "rewards/rejected": -0.791982114315033, + "step": 36 + }, + { + "epoch": 0.015013553902828942, + "grad_norm": 62.74685287475586, + "learning_rate": 5.9977714519088995e-05, + "logits/chosen": -2.7717418670654297, + "logits/rejected": -2.5094423294067383, + "logps/chosen": -247.0843963623047, + "logps/rejected": -200.96141052246094, + "loss": 0.5444, + "rewards/accuracies": 0.6111111044883728, + "rewards/chosen": 0.2706214189529419, + "rewards/margins": 1.396517038345337, + "rewards/rejected": -1.125895619392395, + "step": 54 + }, + { + "epoch": 0.02001807187043859, + "grad_norm": 46.2154426574707, + "learning_rate": 5.9955756748505816e-05, + "logits/chosen": -2.7943055629730225, + "logits/rejected": -2.5575947761535645, + "logps/chosen": -292.98492431640625, + "logps/rejected": -245.57606506347656, + "loss": 0.5736, + "rewards/accuracies": 0.5833333134651184, + "rewards/chosen": 0.37114933133125305, + "rewards/margins": 1.2226431369781494, + "rewards/rejected": -0.8514936566352844, + "step": 72 + }, + { + "epoch": 0.025022589838048236, + "grad_norm": 69.38668823242188, + "learning_rate": 5.9926350020691476e-05, + "logits/chosen": -2.795102119445801, + "logits/rejected": -2.536952018737793, + "logps/chosen": -299.3870849609375, + "logps/rejected": -233.71807861328125, + "loss": 0.5852, + "rewards/accuracies": 0.6458333134651184, + "rewards/chosen": -0.22023707628250122, + "rewards/margins": 0.8623918890953064, + "rewards/rejected": -1.0826289653778076, + "step": 90 + }, + { + "epoch": 0.030027107805657885, + "grad_norm": 43.98723220825195, + "learning_rate": 5.988950164807875e-05, + "logits/chosen": -2.8627054691314697, + "logits/rejected": -2.5194735527038574, + "logps/chosen": -306.3284606933594, + "logps/rejected": -225.8386688232422, + "loss": 0.4759, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": 0.004159212112426758, + "rewards/margins": 1.6864651441574097, + "rewards/rejected": -1.6823060512542725, + "step": 108 + }, + { + "epoch": 0.03503162577326753, + "grad_norm": 33.66733932495117, + "learning_rate": 5.984522079357927e-05, + "logits/chosen": -2.7456674575805664, + "logits/rejected": -2.436127185821533, + "logps/chosen": -269.4483642578125, + "logps/rejected": -215.54396057128906, + "loss": 0.4599, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": -0.1289471834897995, + "rewards/margins": 1.6866822242736816, + "rewards/rejected": -1.8156293630599976, + "step": 126 + }, + { + "epoch": 0.04003614374087718, + "grad_norm": 52.28266143798828, + "learning_rate": 5.9793518468305e-05, + "logits/chosen": -2.7589292526245117, + "logits/rejected": -2.484504461288452, + "logps/chosen": -308.44696044921875, + "logps/rejected": -247.86181640625, + "loss": 0.5167, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -0.3287951350212097, + "rewards/margins": 1.7950549125671387, + "rewards/rejected": -2.123850107192993, + "step": 144 + }, + { + "epoch": 0.04504066170848683, + "grad_norm": 58.46402359008789, + "learning_rate": 5.973440752883014e-05, + "logits/chosen": -2.7986905574798584, + "logits/rejected": -2.5123918056488037, + "logps/chosen": -313.9617614746094, + "logps/rejected": -262.1242980957031, + "loss": 0.4914, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.03435669466853142, + "rewards/margins": 1.6825064420700073, + "rewards/rejected": -1.7168630361557007, + "step": 162 + }, + { + "epoch": 0.05004517967609647, + "grad_norm": 59.80794143676758, + "learning_rate": 5.966790267399417e-05, + "logits/chosen": -2.776872158050537, + "logits/rejected": -2.4881160259246826, + "logps/chosen": -306.0843505859375, + "logps/rejected": -244.2305450439453, + "loss": 0.4843, + "rewards/accuracies": 0.6597222089767456, + "rewards/chosen": -0.3464333117008209, + "rewards/margins": 1.6409136056900024, + "rewards/rejected": -1.98734712600708, + "step": 180 + }, + { + "epoch": 0.055049697643706125, + "grad_norm": 46.00302505493164, + "learning_rate": 5.9594020441246764e-05, + "logits/chosen": -2.8051772117614746, + "logits/rejected": -2.482142925262451, + "logps/chosen": -302.67144775390625, + "logps/rejected": -245.79144287109375, + "loss": 0.4665, + "rewards/accuracies": 0.7083333134651184, + "rewards/chosen": -0.7705867290496826, + "rewards/margins": 1.598375678062439, + "rewards/rejected": -2.368962526321411, + "step": 198 + }, + { + "epoch": 0.06005421561131577, + "grad_norm": 35.59434509277344, + "learning_rate": 5.9512779202535464e-05, + "logits/chosen": -2.9093472957611084, + "logits/rejected": -2.6116788387298584, + "logps/chosen": -280.0508728027344, + "logps/rejected": -235.8525390625, + "loss": 0.4887, + "rewards/accuracies": 0.6736111044883728, + "rewards/chosen": -0.4653184711933136, + "rewards/margins": 2.118915557861328, + "rewards/rejected": -2.5842342376708984, + "step": 216 + }, + { + "epoch": 0.06505873357892542, + "grad_norm": 30.26572608947754, + "learning_rate": 5.942419915973725e-05, + "logits/chosen": -2.8391218185424805, + "logits/rejected": -2.546337604522705, + "logps/chosen": -306.25762939453125, + "logps/rejected": -254.07443237304688, + "loss": 0.5301, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": -0.609345555305481, + "rewards/margins": 1.578539252281189, + "rewards/rejected": -2.187884569168091, + "step": 234 + }, + { + "epoch": 0.07006325154653506, + "grad_norm": 47.70458984375, + "learning_rate": 5.932830233963503e-05, + "logits/chosen": -2.8508825302124023, + "logits/rejected": -2.500095844268799, + "logps/chosen": -322.3190612792969, + "logps/rejected": -253.78439331054688, + "loss": 0.472, + "rewards/accuracies": 0.7569444179534912, + "rewards/chosen": -0.6305109858512878, + "rewards/margins": 1.85237717628479, + "rewards/rejected": -2.4828882217407227, + "step": 252 + }, + { + "epoch": 0.07506776951414472, + "grad_norm": 25.21255111694336, + "learning_rate": 5.9225112588440296e-05, + "logits/chosen": -2.8125343322753906, + "logits/rejected": -2.464313268661499, + "logps/chosen": -297.6902770996094, + "logps/rejected": -236.00732421875, + "loss": 0.437, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -0.4617515802383423, + "rewards/margins": 2.1608004570007324, + "rewards/rejected": -2.622551918029785, + "step": 270 + }, + { + "epoch": 0.08007228748175436, + "grad_norm": 41.022037506103516, + "learning_rate": 5.911465556586348e-05, + "logits/chosen": -2.7316839694976807, + "logits/rejected": -2.440063714981079, + "logps/chosen": -300.703125, + "logps/rejected": -251.49449157714844, + "loss": 0.4565, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -0.3301756680011749, + "rewards/margins": 1.998940110206604, + "rewards/rejected": -2.329115867614746, + "step": 288 + }, + { + "epoch": 0.08507680544936401, + "grad_norm": 29.881244659423828, + "learning_rate": 5.8996958738733195e-05, + "logits/chosen": -2.763925790786743, + "logits/rejected": -2.4390058517456055, + "logps/chosen": -289.1147155761719, + "logps/rejected": -241.44349670410156, + "loss": 0.5313, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": 0.06746254861354828, + "rewards/margins": 1.7309118509292603, + "rewards/rejected": -1.6634494066238403, + "step": 306 + }, + { + "epoch": 0.09008132341697365, + "grad_norm": 51.288818359375, + "learning_rate": 5.8872051374166255e-05, + "logits/chosen": -2.866205930709839, + "logits/rejected": -2.507150888442993, + "logps/chosen": -312.8473205566406, + "logps/rejected": -257.2422180175781, + "loss": 0.4655, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -0.2781409025192261, + "rewards/margins": 1.8859078884124756, + "rewards/rejected": -2.164048433303833, + "step": 324 + }, + { + "epoch": 0.0950858413845833, + "grad_norm": 27.794857025146484, + "learning_rate": 5.87399645322899e-05, + "logits/chosen": -2.8275394439697266, + "logits/rejected": -2.5013062953948975, + "logps/chosen": -293.5814514160156, + "logps/rejected": -250.40774536132812, + "loss": 0.4429, + "rewards/accuracies": 0.7291666865348816, + "rewards/chosen": -0.6691242456436157, + "rewards/margins": 2.115230083465576, + "rewards/rejected": -2.7843542098999023, + "step": 342 + }, + { + "epoch": 0.10009035935219295, + "grad_norm": 66.16268920898438, + "learning_rate": 5.860073105851826e-05, + "logits/chosen": -2.7910256385803223, + "logits/rejected": -2.480933427810669, + "logps/chosen": -296.1370849609375, + "logps/rejected": -256.7040100097656, + "loss": 0.4922, + "rewards/accuracies": 0.6736111044883728, + "rewards/chosen": -0.6868242621421814, + "rewards/margins": 2.4337074756622314, + "rewards/rejected": -3.1205317974090576, + "step": 360 + }, + { + "epoch": 0.1050948773198026, + "grad_norm": 51.62457275390625, + "learning_rate": 5.845438557538483e-05, + "logits/chosen": -2.7872157096862793, + "logits/rejected": -2.4951987266540527, + "logps/chosen": -285.5191345214844, + "logps/rejected": -252.59222412109375, + "loss": 0.4102, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7822558879852295, + "rewards/margins": 2.5719451904296875, + "rewards/rejected": -3.354201316833496, + "step": 378 + }, + { + "epoch": 0.11009939528741225, + "grad_norm": 17.33948516845703, + "learning_rate": 5.830096447393304e-05, + "logits/chosen": -2.674931526184082, + "logits/rejected": -2.3955085277557373, + "logps/chosen": -288.59674072265625, + "logps/rejected": -254.3195037841797, + "loss": 0.5482, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": -0.34463223814964294, + "rewards/margins": 2.307426691055298, + "rewards/rejected": -2.6520586013793945, + "step": 396 + }, + { + "epoch": 0.1151039132550219, + "grad_norm": 20.4786319732666, + "learning_rate": 5.814050590466707e-05, + "logits/chosen": -2.7439794540405273, + "logits/rejected": -2.4365010261535645, + "logps/chosen": -290.3387451171875, + "logps/rejected": -244.3264923095703, + "loss": 0.4771, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -1.0573636293411255, + "rewards/margins": 2.1369895935058594, + "rewards/rejected": -3.1943538188934326, + "step": 414 + }, + { + "epoch": 0.12010843122263154, + "grad_norm": 46.82477951049805, + "learning_rate": 5.797304976806514e-05, + "logits/chosen": -2.7844152450561523, + "logits/rejected": -2.5251095294952393, + "logps/chosen": -297.169921875, + "logps/rejected": -275.9124450683594, + "loss": 0.4525, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.0830460786819458, + "rewards/margins": 2.440809488296509, + "rewards/rejected": -3.523855209350586, + "step": 432 + }, + { + "epoch": 0.12511294919024118, + "grad_norm": 47.5440673828125, + "learning_rate": 5.7798637704657666e-05, + "logits/chosen": -2.795816659927368, + "logits/rejected": -2.4617395401000977, + "logps/chosen": -305.271240234375, + "logps/rejected": -259.6396789550781, + "loss": 0.4929, + "rewards/accuracies": 0.6736111044883728, + "rewards/chosen": -1.3109158277511597, + "rewards/margins": 2.4858694076538086, + "rewards/rejected": -3.796785593032837, + "step": 450 + }, + { + "epoch": 0.13011746715785083, + "grad_norm": 29.152238845825195, + "learning_rate": 5.7617313084672656e-05, + "logits/chosen": -2.736605644226074, + "logits/rejected": -2.3923282623291016, + "logps/chosen": -284.2351379394531, + "logps/rejected": -232.88275146484375, + "loss": 0.4949, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -0.9013742804527283, + "rewards/margins": 2.1247429847717285, + "rewards/rejected": -3.0261170864105225, + "step": 468 + }, + { + "epoch": 0.13512198512546048, + "grad_norm": 48.9805908203125, + "learning_rate": 5.742912099725111e-05, + "logits/chosen": -2.742039680480957, + "logits/rejected": -2.3728723526000977, + "logps/chosen": -316.6728820800781, + "logps/rejected": -262.7744140625, + "loss": 0.4539, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -0.7503696084022522, + "rewards/margins": 2.6848859786987305, + "rewards/rejected": -3.435255289077759, + "step": 486 + }, + { + "epoch": 0.13901438798915688, + "eval_logits/chosen": -2.760572910308838, + "eval_logits/rejected": -2.425795078277588, + "eval_logps/chosen": -301.9501953125, + "eval_logps/rejected": -252.17218017578125, + "eval_loss": 0.4661794602870941, + "eval_rewards/accuracies": 0.7140287756919861, + "eval_rewards/chosen": -0.8885634541511536, + "eval_rewards/margins": 2.801868200302124, + "eval_rewards/rejected": -3.690431594848633, + "eval_runtime": 2889.0225, + "eval_samples_per_second": 1.347, + "eval_steps_per_second": 0.674, + "step": 500 + }, + { + "epoch": 0.14012650309307012, + "grad_norm": 46.688716888427734, + "learning_rate": 5.7234108239234844e-05, + "logits/chosen": -2.7770731449127197, + "logits/rejected": -2.4328150749206543, + "logps/chosen": -338.7681579589844, + "logps/rejected": -288.3874816894531, + "loss": 0.5026, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9047598242759705, + "rewards/margins": 2.591067314147949, + "rewards/rejected": -3.4958269596099854, + "step": 504 + }, + { + "epoch": 0.1451310210606798, + "grad_norm": 52.954917907714844, + "learning_rate": 5.7032323303529793e-05, + "logits/chosen": -2.8162384033203125, + "logits/rejected": -2.475491523742676, + "logps/chosen": -339.5991516113281, + "logps/rejected": -296.40069580078125, + "loss": 0.4951, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3801913261413574, + "rewards/margins": 2.788844108581543, + "rewards/rejected": -4.169035911560059, + "step": 522 + }, + { + "epoch": 0.15013553902828944, + "grad_norm": 33.80255126953125, + "learning_rate": 5.6823816367047484e-05, + "logits/chosen": -2.7102253437042236, + "logits/rejected": -2.4160075187683105, + "logps/chosen": -308.2735290527344, + "logps/rejected": -269.2956237792969, + "loss": 0.5361, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -1.6627583503723145, + "rewards/margins": 2.620126247406006, + "rewards/rejected": -4.28288459777832, + "step": 540 + }, + { + "epoch": 0.15514005699589908, + "grad_norm": 25.202301025390625, + "learning_rate": 5.660863927822776e-05, + "logits/chosen": -2.78639554977417, + "logits/rejected": -2.4165213108062744, + "logps/chosen": -304.6817932128906, + "logps/rejected": -246.72067260742188, + "loss": 0.5113, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -1.2862390279769897, + "rewards/margins": 2.729475975036621, + "rewards/rejected": -4.015714645385742, + "step": 558 + }, + { + "epoch": 0.16014457496350873, + "grad_norm": 32.291770935058594, + "learning_rate": 5.638684554414593e-05, + "logits/chosen": -2.8286290168762207, + "logits/rejected": -2.4456050395965576, + "logps/chosen": -297.7843322753906, + "logps/rejected": -251.6412811279297, + "loss": 0.4246, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -0.9567468166351318, + "rewards/margins": 2.9154627323150635, + "rewards/rejected": -3.872209310531616, + "step": 576 + }, + { + "epoch": 0.16514909293111837, + "grad_norm": 18.591249465942383, + "learning_rate": 5.615849031720735e-05, + "logits/chosen": -2.6336727142333984, + "logits/rejected": -2.380577564239502, + "logps/chosen": -307.2437438964844, + "logps/rejected": -261.6679992675781, + "loss": 0.476, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -1.205333948135376, + "rewards/margins": 2.628098487854004, + "rewards/rejected": -3.83343243598938, + "step": 594 + }, + { + "epoch": 0.17015361089872802, + "grad_norm": 17.27914047241211, + "learning_rate": 5.5923630381432986e-05, + "logits/chosen": -2.73476505279541, + "logits/rejected": -2.411391019821167, + "logps/chosen": -275.2574462890625, + "logps/rejected": -236.94528198242188, + "loss": 0.5157, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9271820783615112, + "rewards/margins": 2.7185263633728027, + "rewards/rejected": -4.645708084106445, + "step": 612 + }, + { + "epoch": 0.17515812886633766, + "grad_norm": 83.5201187133789, + "learning_rate": 5.568232413833916e-05, + "logits/chosen": -2.6982274055480957, + "logits/rejected": -2.4683725833892822, + "logps/chosen": -317.6478271484375, + "logps/rejected": -284.2525939941406, + "loss": 0.5373, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.087332248687744, + "rewards/margins": 2.7045679092407227, + "rewards/rejected": -5.791900634765625, + "step": 630 + }, + { + "epoch": 0.1801626468339473, + "grad_norm": 20.350605010986328, + "learning_rate": 5.543463159241515e-05, + "logits/chosen": -2.7724947929382324, + "logits/rejected": -2.483661651611328, + "logps/chosen": -337.3479309082031, + "logps/rejected": -293.8812561035156, + "loss": 0.5321, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -3.037468433380127, + "rewards/margins": 3.4854342937469482, + "rewards/rejected": -6.522902488708496, + "step": 648 + }, + { + "epoch": 0.18516716480155695, + "grad_norm": 42.29398727416992, + "learning_rate": 5.518061433620214e-05, + "logits/chosen": -2.7252228260040283, + "logits/rejected": -2.535196304321289, + "logps/chosen": -293.3070373535156, + "logps/rejected": -253.4528045654297, + "loss": 0.5152, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -1.1113444566726685, + "rewards/margins": 2.681729793548584, + "rewards/rejected": -3.793074369430542, + "step": 666 + }, + { + "epoch": 0.1901716827691666, + "grad_norm": 34.496341705322266, + "learning_rate": 5.492033553497726e-05, + "logits/chosen": -2.667125940322876, + "logits/rejected": -2.37849497795105, + "logps/chosen": -303.0618591308594, + "logps/rejected": -249.07188415527344, + "loss": 0.5542, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2092381715774536, + "rewards/margins": 2.3517844676971436, + "rewards/rejected": -3.5610220432281494, + "step": 684 + }, + { + "epoch": 0.19517620073677625, + "grad_norm": 35.061683654785156, + "learning_rate": 5.4653859911046666e-05, + "logits/chosen": -2.770714282989502, + "logits/rejected": -2.3620755672454834, + "logps/chosen": -303.9200744628906, + "logps/rejected": -240.67138671875, + "loss": 0.5658, + "rewards/accuracies": 0.6180555820465088, + "rewards/chosen": -1.6102126836776733, + "rewards/margins": 2.3924648761749268, + "rewards/rejected": -4.002677917480469, + "step": 702 + }, + { + "epoch": 0.2001807187043859, + "grad_norm": 46.6428337097168, + "learning_rate": 5.438125372765125e-05, + "logits/chosen": -2.8188021183013916, + "logits/rejected": -2.554450035095215, + "logps/chosen": -320.1256103515625, + "logps/rejected": -290.2034606933594, + "loss": 0.5168, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -1.0621830224990845, + "rewards/margins": 2.7468276023864746, + "rewards/rejected": -3.8090105056762695, + "step": 720 + }, + { + "epoch": 0.20518523667199556, + "grad_norm": 32.1719856262207, + "learning_rate": 5.410258477248931e-05, + "logits/chosen": -2.7152578830718994, + "logits/rejected": -2.3794937133789062, + "logps/chosen": -282.10394287109375, + "logps/rejected": -243.0301971435547, + "loss": 0.4414, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -0.9664502739906311, + "rewards/margins": 2.960956573486328, + "rewards/rejected": -3.9274072647094727, + "step": 738 + }, + { + "epoch": 0.2101897546396052, + "grad_norm": 37.132904052734375, + "learning_rate": 5.381792234086014e-05, + "logits/chosen": -2.7982935905456543, + "logits/rejected": -2.4567694664001465, + "logps/chosen": -319.1950988769531, + "logps/rejected": -262.9441223144531, + "loss": 0.4461, + "rewards/accuracies": 0.7291666865348816, + "rewards/chosen": -1.6922187805175781, + "rewards/margins": 2.850379467010498, + "rewards/rejected": -4.542598724365234, + "step": 756 + }, + { + "epoch": 0.21519427260721485, + "grad_norm": 30.27312660217285, + "learning_rate": 5.352733721843259e-05, + "logits/chosen": -2.7308244705200195, + "logits/rejected": -2.4927947521209717, + "logps/chosen": -312.2353515625, + "logps/rejected": -280.8824462890625, + "loss": 0.4246, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.527901291847229, + "rewards/margins": 3.861581563949585, + "rewards/rejected": -5.3894829750061035, + "step": 774 + }, + { + "epoch": 0.2201987905748245, + "grad_norm": 24.14598846435547, + "learning_rate": 5.323090166364326e-05, + "logits/chosen": -2.8204898834228516, + "logits/rejected": -2.4707374572753906, + "logps/chosen": -305.7160949707031, + "logps/rejected": -258.2662658691406, + "loss": 0.463, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -1.7590423822402954, + "rewards/margins": 2.972476005554199, + "rewards/rejected": -4.731518745422363, + "step": 792 + }, + { + "epoch": 0.22520330854243414, + "grad_norm": 64.99732971191406, + "learning_rate": 5.292868938972824e-05, + "logits/chosen": -2.7508084774017334, + "logits/rejected": -2.4071645736694336, + "logps/chosen": -319.9965515136719, + "logps/rejected": -263.8896484375, + "loss": 0.4901, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -2.045604705810547, + "rewards/margins": 3.3753106594085693, + "rewards/rejected": -5.420915603637695, + "step": 810 + }, + { + "epoch": 0.2302078265100438, + "grad_norm": 39.983245849609375, + "learning_rate": 5.2620775546393186e-05, + "logits/chosen": -2.7872860431671143, + "logits/rejected": -2.4374520778656006, + "logps/chosen": -282.57562255859375, + "logps/rejected": -247.69422912597656, + "loss": 0.388, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": -1.401705026626587, + "rewards/margins": 3.5087778568267822, + "rewards/rejected": -4.910482406616211, + "step": 828 + }, + { + "epoch": 0.23521234447765343, + "grad_norm": 23.027610778808594, + "learning_rate": 5.230723670112628e-05, + "logits/chosen": -2.762430429458618, + "logits/rejected": -2.371443748474121, + "logps/chosen": -341.0254211425781, + "logps/rejected": -295.3038330078125, + "loss": 0.4323, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -2.3675320148468018, + "rewards/margins": 4.257319450378418, + "rewards/rejected": -6.624851703643799, + "step": 846 + }, + { + "epoch": 0.24021686244526308, + "grad_norm": 17.208820343017578, + "learning_rate": 5.198815082015845e-05, + "logits/chosen": -2.7585699558258057, + "logits/rejected": -2.3666555881500244, + "logps/chosen": -294.0066223144531, + "logps/rejected": -256.5178527832031, + "loss": 0.4013, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -2.565335512161255, + "rewards/margins": 3.4496445655822754, + "rewards/rejected": -6.014980316162109, + "step": 864 + }, + { + "epoch": 0.24522138041287272, + "grad_norm": 36.96328353881836, + "learning_rate": 5.166359724907592e-05, + "logits/chosen": -2.852029323577881, + "logits/rejected": -2.520498514175415, + "logps/chosen": -326.54595947265625, + "logps/rejected": -276.0726318359375, + "loss": 0.4765, + "rewards/accuracies": 0.7291666865348816, + "rewards/chosen": -1.8649635314941406, + "rewards/margins": 2.598454713821411, + "rewards/rejected": -4.463418483734131, + "step": 882 + }, + { + "epoch": 0.25022589838048237, + "grad_norm": 48.351322174072266, + "learning_rate": 5.1333656693089757e-05, + "logits/chosen": -2.729527711868286, + "logits/rejected": -2.4688310623168945, + "logps/chosen": -328.0089111328125, + "logps/rejected": -288.0612487792969, + "loss": 0.4612, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.531855344772339, + "rewards/margins": 3.2491233348846436, + "rewards/rejected": -5.780978679656982, + "step": 900 + }, + { + "epoch": 0.255230416348092, + "grad_norm": 25.59536361694336, + "learning_rate": 5.0998411196967205e-05, + "logits/chosen": -2.8285937309265137, + "logits/rejected": -2.398197889328003, + "logps/chosen": -314.901123046875, + "logps/rejected": -277.6917724609375, + "loss": 0.5526, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -3.353430986404419, + "rewards/margins": 3.2675201892852783, + "rewards/rejected": -6.620951175689697, + "step": 918 + }, + { + "epoch": 0.26023493431570166, + "grad_norm": 35.946563720703125, + "learning_rate": 5.0657944124630024e-05, + "logits/chosen": -2.6806328296661377, + "logits/rejected": -2.3879404067993164, + "logps/chosen": -303.7491149902344, + "logps/rejected": -276.8700866699219, + "loss": 0.4603, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -2.5103557109832764, + "rewards/margins": 2.7431862354278564, + "rewards/rejected": -5.253542423248291, + "step": 936 + }, + { + "epoch": 0.2652394522833113, + "grad_norm": 36.0608024597168, + "learning_rate": 5.031234013842485e-05, + "logits/chosen": -2.819031238555908, + "logits/rejected": -2.405395269393921, + "logps/chosen": -324.1274108886719, + "logps/rejected": -263.674072265625, + "loss": 0.4374, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": -1.4296331405639648, + "rewards/margins": 2.8712069988250732, + "rewards/rejected": -4.300840377807617, + "step": 954 + }, + { + "epoch": 0.27024397025092095, + "grad_norm": 22.51164436340332, + "learning_rate": 4.996168517807048e-05, + "logits/chosen": -2.8697896003723145, + "logits/rejected": -2.5542986392974854, + "logps/chosen": -301.4720458984375, + "logps/rejected": -264.41552734375, + "loss": 0.5852, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -2.1879396438598633, + "rewards/margins": 2.2755274772644043, + "rewards/rejected": -4.463467121124268, + "step": 972 + }, + { + "epoch": 0.2752484882185306, + "grad_norm": 15.381638526916504, + "learning_rate": 4.960606643928776e-05, + "logits/chosen": -2.8073878288269043, + "logits/rejected": -2.4859182834625244, + "logps/chosen": -329.6947021484375, + "logps/rejected": -279.3440246582031, + "loss": 0.4534, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -1.6783061027526855, + "rewards/margins": 2.6856889724731445, + "rewards/rejected": -4.36399507522583, + "step": 990 + }, + { + "epoch": 0.27802877597831377, + "eval_logits/chosen": -2.7802391052246094, + "eval_logits/rejected": -2.429973602294922, + "eval_logps/chosen": -307.14599609375, + "eval_logps/rejected": -255.32652282714844, + "eval_loss": 0.48188233375549316, + "eval_rewards/accuracies": 0.7099177837371826, + "eval_rewards/chosen": -1.408141851425171, + "eval_rewards/margins": 2.5977249145507812, + "eval_rewards/rejected": -4.005866527557373, + "eval_runtime": 2887.9715, + "eval_samples_per_second": 1.347, + "eval_steps_per_second": 0.674, + "step": 1000 + }, + { + "epoch": 0.28025300618614024, + "grad_norm": 52.02113723754883, + "learning_rate": 4.924557235211695e-05, + "logits/chosen": -2.748286008834839, + "logits/rejected": -2.395451545715332, + "logps/chosen": -323.3247985839844, + "logps/rejected": -266.3089294433594, + "loss": 0.4726, + "rewards/accuracies": 0.7291666865348816, + "rewards/chosen": -1.3913843631744385, + "rewards/margins": 2.7355737686157227, + "rewards/rejected": -4.126958847045898, + "step": 1008 + }, + { + "epoch": 0.2852575241537499, + "grad_norm": 21.775924682617188, + "learning_rate": 4.88802925589283e-05, + "logits/chosen": -2.705214023590088, + "logits/rejected": -2.342489242553711, + "logps/chosen": -321.12646484375, + "logps/rejected": -264.2709045410156, + "loss": 0.4525, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5450568199157715, + "rewards/margins": 2.7026331424713135, + "rewards/rejected": -4.247690200805664, + "step": 1026 + }, + { + "epoch": 0.2902620421213596, + "grad_norm": 53.91025924682617, + "learning_rate": 4.8510317892131084e-05, + "logits/chosen": -2.709462881088257, + "logits/rejected": -2.368770122528076, + "logps/chosen": -322.87091064453125, + "logps/rejected": -269.23291015625, + "loss": 0.5043, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -2.618788719177246, + "rewards/margins": 2.7418603897094727, + "rewards/rejected": -5.360649585723877, + "step": 1044 + }, + { + "epoch": 0.29526656008896923, + "grad_norm": 49.668338775634766, + "learning_rate": 4.813574035158671e-05, + "logits/chosen": -2.7249040603637695, + "logits/rejected": -2.379298210144043, + "logps/chosen": -283.1727600097656, + "logps/rejected": -255.97384643554688, + "loss": 0.4848, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.7925028800964355, + "rewards/margins": 2.7203238010406494, + "rewards/rejected": -4.512826919555664, + "step": 1062 + }, + { + "epoch": 0.3002710780565789, + "grad_norm": 12.902231216430664, + "learning_rate": 4.7756653081731606e-05, + "logits/chosen": -2.719536542892456, + "logits/rejected": -2.4114749431610107, + "logps/chosen": -331.2315673828125, + "logps/rejected": -266.7965087890625, + "loss": 0.4634, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9296529293060303, + "rewards/margins": 2.834092378616333, + "rewards/rejected": -4.7637457847595215, + "step": 1080 + }, + { + "epoch": 0.3052755960241885, + "grad_norm": 56.96560287475586, + "learning_rate": 4.737315034841532e-05, + "logits/chosen": -2.823291540145874, + "logits/rejected": -2.481924533843994, + "logps/chosen": -317.013427734375, + "logps/rejected": -258.55181884765625, + "loss": 0.4776, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": -1.8802590370178223, + "rewards/margins": 2.4822633266448975, + "rewards/rejected": -4.362522602081299, + "step": 1098 + }, + { + "epoch": 0.31028011399179817, + "grad_norm": 49.195804595947266, + "learning_rate": 4.698532751546002e-05, + "logits/chosen": -2.85477352142334, + "logits/rejected": -2.5225324630737305, + "logps/chosen": -312.89569091796875, + "logps/rejected": -258.9193115234375, + "loss": 0.5338, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": -1.4863510131835938, + "rewards/margins": 2.5749833583831787, + "rewards/rejected": -4.061334609985352, + "step": 1116 + }, + { + "epoch": 0.3152846319594078, + "grad_norm": 36.36430740356445, + "learning_rate": 4.659328102094669e-05, + "logits/chosen": -2.7769174575805664, + "logits/rejected": -2.3771677017211914, + "logps/chosen": -304.90191650390625, + "logps/rejected": -235.15464782714844, + "loss": 0.4854, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -1.5702778100967407, + "rewards/margins": 2.770118474960327, + "rewards/rejected": -4.340396404266357, + "step": 1134 + }, + { + "epoch": 0.32028914992701746, + "grad_norm": 35.03776550292969, + "learning_rate": 4.619710835323444e-05, + "logits/chosen": -2.818751335144043, + "logits/rejected": -2.4510741233825684, + "logps/chosen": -336.0886535644531, + "logps/rejected": -268.6121826171875, + "loss": 0.5096, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -2.0941898822784424, + "rewards/margins": 2.8020951747894287, + "rewards/rejected": -4.896285057067871, + "step": 1152 + }, + { + "epoch": 0.3252936678946271, + "grad_norm": 37.2415771484375, + "learning_rate": 4.57969080267184e-05, + "logits/chosen": -2.788228750228882, + "logits/rejected": -2.4594948291778564, + "logps/chosen": -298.81951904296875, + "logps/rejected": -247.10076904296875, + "loss": 0.5616, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -1.624724268913269, + "rewards/margins": 2.3189942836761475, + "rewards/rejected": -3.943718433380127, + "step": 1170 + }, + { + "epoch": 0.33029818586223675, + "grad_norm": 59.08698654174805, + "learning_rate": 4.5392779557332744e-05, + "logits/chosen": -2.8354766368865967, + "logits/rejected": -2.498143196105957, + "logps/chosen": -306.4485778808594, + "logps/rejected": -250.01193237304688, + "loss": 0.425, + "rewards/accuracies": 0.7708333134651184, + "rewards/chosen": -1.5983328819274902, + "rewards/margins": 3.0345067977905273, + "rewards/rejected": -4.632839679718018, + "step": 1188 + }, + { + "epoch": 0.3353027038298464, + "grad_norm": 55.074493408203125, + "learning_rate": 4.498482343780445e-05, + "logits/chosen": -2.785553216934204, + "logits/rejected": -2.6019632816314697, + "logps/chosen": -295.4771423339844, + "logps/rejected": -269.24420166015625, + "loss": 0.4736, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -1.1955021619796753, + "rewards/margins": 2.590818405151367, + "rewards/rejected": -3.786320686340332, + "step": 1206 + }, + { + "epoch": 0.34030722179745604, + "grad_norm": 43.06817626953125, + "learning_rate": 4.457314111266426e-05, + "logits/chosen": -2.8709800243377686, + "logits/rejected": -2.440617084503174, + "logps/chosen": -297.2735595703125, + "logps/rejected": -245.7543182373047, + "loss": 0.4666, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -1.0809131860733032, + "rewards/margins": 3.186699867248535, + "rewards/rejected": -4.267612934112549, + "step": 1224 + }, + { + "epoch": 0.3453117397650657, + "grad_norm": 14.86072826385498, + "learning_rate": 4.4157834953020966e-05, + "logits/chosen": -2.7868971824645996, + "logits/rejected": -2.4189891815185547, + "logps/chosen": -304.0820007324219, + "logps/rejected": -247.03460693359375, + "loss": 0.4693, + "rewards/accuracies": 0.7569444179534912, + "rewards/chosen": -1.3618935346603394, + "rewards/margins": 2.7727527618408203, + "rewards/rejected": -4.134645938873291, + "step": 1242 + }, + { + "epoch": 0.35031625773267533, + "grad_norm": 25.05876350402832, + "learning_rate": 4.373900823110524e-05, + "logits/chosen": -2.8263025283813477, + "logits/rejected": -2.4502522945404053, + "logps/chosen": -322.935546875, + "logps/rejected": -264.3840637207031, + "loss": 0.4631, + "rewards/accuracies": 0.7291666865348816, + "rewards/chosen": -1.761283278465271, + "rewards/margins": 3.0156145095825195, + "rewards/rejected": -4.776896953582764, + "step": 1260 + }, + { + "epoch": 0.355320775700285, + "grad_norm": 64.55970001220703, + "learning_rate": 4.331676509458943e-05, + "logits/chosen": -2.7206239700317383, + "logits/rejected": -2.418269634246826, + "logps/chosen": -305.78369140625, + "logps/rejected": -272.55438232421875, + "loss": 0.5045, + "rewards/accuracies": 0.6736111044883728, + "rewards/chosen": -1.4667495489120483, + "rewards/margins": 3.219045639038086, + "rewards/rejected": -4.685795307159424, + "step": 1278 + }, + { + "epoch": 0.3603252936678946, + "grad_norm": 37.59148025512695, + "learning_rate": 4.289121054068965e-05, + "logits/chosen": -2.6927237510681152, + "logits/rejected": -2.3527231216430664, + "logps/chosen": -318.378662109375, + "logps/rejected": -273.1434326171875, + "loss": 0.3847, + "rewards/accuracies": 0.7847222089767456, + "rewards/chosen": -1.0431514978408813, + "rewards/margins": 3.9186151027679443, + "rewards/rejected": -4.961766719818115, + "step": 1296 + }, + { + "epoch": 0.36532981163550426, + "grad_norm": 29.348026275634766, + "learning_rate": 4.24624503900566e-05, + "logits/chosen": -2.8235292434692383, + "logits/rejected": -2.5070762634277344, + "logps/chosen": -307.8551940917969, + "logps/rejected": -262.47198486328125, + "loss": 0.4807, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -1.3642220497131348, + "rewards/margins": 3.129309892654419, + "rewards/rejected": -4.493532180786133, + "step": 1314 + }, + { + "epoch": 0.3703343296031139, + "grad_norm": 13.151726722717285, + "learning_rate": 4.203059126046166e-05, + "logits/chosen": -2.7534186840057373, + "logits/rejected": -2.427877426147461, + "logps/chosen": -303.9079895019531, + "logps/rejected": -262.9033508300781, + "loss": 0.4799, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6105862855911255, + "rewards/margins": 3.6808276176452637, + "rewards/rejected": -5.2914137840271, + "step": 1332 + }, + { + "epoch": 0.37533884757072355, + "grad_norm": 52.22329330444336, + "learning_rate": 4.1595740540284744e-05, + "logits/chosen": -2.821272134780884, + "logits/rejected": -2.4599618911743164, + "logps/chosen": -299.4214172363281, + "logps/rejected": -252.08761596679688, + "loss": 0.3859, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": -1.7845611572265625, + "rewards/margins": 3.4010486602783203, + "rewards/rejected": -5.185609817504883, + "step": 1350 + }, + { + "epoch": 0.3803433655383332, + "grad_norm": 47.97208023071289, + "learning_rate": 4.1158006361810534e-05, + "logits/chosen": -2.7893478870391846, + "logits/rejected": -2.5072147846221924, + "logps/chosen": -302.7026062011719, + "logps/rejected": -257.9830322265625, + "loss": 0.5735, + "rewards/accuracies": 0.6736111044883728, + "rewards/chosen": -2.1573987007141113, + "rewards/margins": 3.0353074073791504, + "rewards/rejected": -5.192706108093262, + "step": 1368 + }, + { + "epoch": 0.38534788350594285, + "grad_norm": 32.802825927734375, + "learning_rate": 4.071749757433974e-05, + "logits/chosen": -2.8340954780578613, + "logits/rejected": -2.516873359680176, + "logps/chosen": -331.8294677734375, + "logps/rejected": -307.69000244140625, + "loss": 0.5361, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -2.872102737426758, + "rewards/margins": 3.509077787399292, + "rewards/rejected": -6.381181240081787, + "step": 1386 + }, + { + "epoch": 0.3903524014735525, + "grad_norm": 31.112899780273438, + "learning_rate": 4.027432371712202e-05, + "logits/chosen": -2.781808853149414, + "logits/rejected": -2.506070613861084, + "logps/chosen": -302.9484558105469, + "logps/rejected": -271.6375732421875, + "loss": 0.4943, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -1.8420933485031128, + "rewards/margins": 3.083890199661255, + "rewards/rejected": -4.925983905792236, + "step": 1404 + }, + { + "epoch": 0.39535691944116214, + "grad_norm": 39.01347351074219, + "learning_rate": 3.982859499211743e-05, + "logits/chosen": -2.8581554889678955, + "logits/rejected": -2.522110939025879, + "logps/chosen": -300.3226623535156, + "logps/rejected": -248.32310485839844, + "loss": 0.4146, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -1.0712960958480835, + "rewards/margins": 3.049311637878418, + "rewards/rejected": -4.120607852935791, + "step": 1422 + }, + { + "epoch": 0.4003614374087718, + "grad_norm": 47.431541442871094, + "learning_rate": 3.9380422236593e-05, + "logits/chosen": -2.8269362449645996, + "logits/rejected": -2.476876735687256, + "logps/chosen": -325.6385498046875, + "logps/rejected": -271.75482177734375, + "loss": 0.4796, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -0.9542015790939331, + "rewards/margins": 2.940296173095703, + "rewards/rejected": -3.894497871398926, + "step": 1440 + }, + { + "epoch": 0.4053659553763815, + "grad_norm": 45.60956954956055, + "learning_rate": 3.8929916895561286e-05, + "logits/chosen": -2.788536548614502, + "logits/rejected": -2.517867088317871, + "logps/chosen": -306.3862609863281, + "logps/rejected": -260.1548767089844, + "loss": 0.5601, + "rewards/accuracies": 0.6736111044883728, + "rewards/chosen": -1.4891281127929688, + "rewards/margins": 2.7551279067993164, + "rewards/rejected": -4.244256496429443, + "step": 1458 + }, + { + "epoch": 0.4103704733439911, + "grad_norm": 37.900238037109375, + "learning_rate": 3.8477190994067985e-05, + "logits/chosen": -2.8547329902648926, + "logits/rejected": -2.391421318054199, + "logps/chosen": -288.4437561035156, + "logps/rejected": -235.8981170654297, + "loss": 0.4304, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -1.1458418369293213, + "rewards/margins": 3.2426905632019043, + "rewards/rejected": -4.388532638549805, + "step": 1476 + }, + { + "epoch": 0.41537499131160077, + "grad_norm": 28.052438735961914, + "learning_rate": 3.802235710933511e-05, + "logits/chosen": -2.7829697132110596, + "logits/rejected": -2.4879438877105713, + "logps/chosen": -297.0559997558594, + "logps/rejected": -274.1761474609375, + "loss": 0.3739, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -1.3499208688735962, + "rewards/margins": 3.3596606254577637, + "rewards/rejected": -4.70958137512207, + "step": 1494 + }, + { + "epoch": 0.41704316396747065, + "eval_logits/chosen": -2.818539619445801, + "eval_logits/rejected": -2.4813146591186523, + "eval_logps/chosen": -303.57452392578125, + "eval_logps/rejected": -255.39651489257812, + "eval_loss": 0.47764262557029724, + "eval_rewards/accuracies": 0.7150565385818481, + "eval_rewards/chosen": -1.050990343093872, + "eval_rewards/margins": 2.9618752002716064, + "eval_rewards/rejected": -4.01286506652832, + "eval_runtime": 2890.0039, + "eval_samples_per_second": 1.346, + "eval_steps_per_second": 0.673, + "step": 1500 + }, + { + "epoch": 0.4203795092792104, + "grad_norm": 39.25442886352539, + "learning_rate": 3.75655283427669e-05, + "logits/chosen": -2.75813627243042, + "logits/rejected": -2.4602572917938232, + "logps/chosen": -304.7848815917969, + "logps/rejected": -265.5973205566406, + "loss": 0.4398, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -1.1466350555419922, + "rewards/margins": 3.0892410278320312, + "rewards/rejected": -4.235876083374023, + "step": 1512 + }, + { + "epoch": 0.42538402724682006, + "grad_norm": 23.674659729003906, + "learning_rate": 3.710681829182556e-05, + "logits/chosen": -2.8328258991241455, + "logits/rejected": -2.46769118309021, + "logps/chosen": -293.65582275390625, + "logps/rejected": -246.4385528564453, + "loss": 0.4921, + "rewards/accuracies": 0.7291666865348816, + "rewards/chosen": -1.0869251489639282, + "rewards/margins": 2.648738384246826, + "rewards/rejected": -3.7356631755828857, + "step": 1530 + }, + { + "epoch": 0.4303885452144297, + "grad_norm": 51.24037551879883, + "learning_rate": 3.66463410217834e-05, + "logits/chosen": -2.8156230449676514, + "logits/rejected": -2.488452672958374, + "logps/chosen": -317.01177978515625, + "logps/rejected": -262.98040771484375, + "loss": 0.4375, + "rewards/accuracies": 0.7569444179534912, + "rewards/chosen": -2.3397586345672607, + "rewards/margins": 2.8332090377807617, + "rewards/rejected": -5.172967910766602, + "step": 1548 + }, + { + "epoch": 0.43539306318203935, + "grad_norm": 48.07794952392578, + "learning_rate": 3.618421103735881e-05, + "logits/chosen": -2.732844591140747, + "logits/rejected": -2.342611074447632, + "logps/chosen": -333.2563781738281, + "logps/rejected": -285.69580078125, + "loss": 0.4959, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -3.5013587474823, + "rewards/margins": 3.6540586948394775, + "rewards/rejected": -7.155416488647461, + "step": 1566 + }, + { + "epoch": 0.440397581149649, + "grad_norm": 56.5835075378418, + "learning_rate": 3.572054325424288e-05, + "logits/chosen": -2.719848155975342, + "logits/rejected": -2.3940811157226562, + "logps/chosen": -292.4789733886719, + "logps/rejected": -255.9777069091797, + "loss": 0.5122, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8631534576416016, + "rewards/margins": 2.864680051803589, + "rewards/rejected": -4.7278337478637695, + "step": 1584 + }, + { + "epoch": 0.44540209911725864, + "grad_norm": 11.798177719116211, + "learning_rate": 3.525545297052389e-05, + "logits/chosen": -2.724909782409668, + "logits/rejected": -2.322401762008667, + "logps/chosen": -354.0780944824219, + "logps/rejected": -285.12091064453125, + "loss": 0.3962, + "rewards/accuracies": 0.7986111044883728, + "rewards/chosen": -1.9553453922271729, + "rewards/margins": 3.6202268600463867, + "rewards/rejected": -5.5755720138549805, + "step": 1602 + }, + { + "epoch": 0.4504066170848683, + "grad_norm": 39.22509002685547, + "learning_rate": 3.478905583801666e-05, + "logits/chosen": -2.7018537521362305, + "logits/rejected": -2.2796404361724854, + "logps/chosen": -336.0934753417969, + "logps/rejected": -263.1202087402344, + "loss": 0.4417, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -1.7801519632339478, + "rewards/margins": 3.4768850803375244, + "rewards/rejected": -5.2570366859436035, + "step": 1620 + }, + { + "epoch": 0.45541113505247793, + "grad_norm": 29.088903427124023, + "learning_rate": 3.432146783350394e-05, + "logits/chosen": -2.7141613960266113, + "logits/rejected": -2.482086181640625, + "logps/chosen": -304.6131286621094, + "logps/rejected": -287.9849853515625, + "loss": 0.5199, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -1.4703941345214844, + "rewards/margins": 2.9347872734069824, + "rewards/rejected": -4.405181407928467, + "step": 1638 + }, + { + "epoch": 0.4604156530200876, + "grad_norm": 36.439937591552734, + "learning_rate": 3.3852805229897016e-05, + "logits/chosen": -2.632075071334839, + "logits/rejected": -2.3536500930786133, + "logps/chosen": -296.20025634765625, + "logps/rejected": -265.7930603027344, + "loss": 0.478, + "rewards/accuracies": 0.7083333134651184, + "rewards/chosen": -1.968884825706482, + "rewards/margins": 3.341083288192749, + "rewards/rejected": -5.309967994689941, + "step": 1656 + }, + { + "epoch": 0.4654201709876972, + "grad_norm": 16.566734313964844, + "learning_rate": 3.3383184567322724e-05, + "logits/chosen": -2.746687889099121, + "logits/rejected": -2.3198723793029785, + "logps/chosen": -309.37493896484375, + "logps/rejected": -270.9768371582031, + "loss": 0.3965, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": -1.8034520149230957, + "rewards/margins": 3.4195146560668945, + "rewards/rejected": -5.22296667098999, + "step": 1674 + }, + { + "epoch": 0.47042468895530687, + "grad_norm": 34.576419830322266, + "learning_rate": 3.2912722624143885e-05, + "logits/chosen": -2.7036242485046387, + "logits/rejected": -2.358764886856079, + "logps/chosen": -302.0757141113281, + "logps/rejected": -251.74368286132812, + "loss": 0.53, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -1.2215288877487183, + "rewards/margins": 2.953450918197632, + "rewards/rejected": -4.174980163574219, + "step": 1692 + }, + { + "epoch": 0.4754292069229165, + "grad_norm": 28.038728713989258, + "learning_rate": 3.244153638792062e-05, + "logits/chosen": -2.7422502040863037, + "logits/rejected": -2.337440013885498, + "logps/chosen": -325.7433166503906, + "logps/rejected": -265.1995849609375, + "loss": 0.4173, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1670732498168945, + "rewards/margins": 3.3245160579681396, + "rewards/rejected": -5.4915900230407715, + "step": 1710 + }, + { + "epoch": 0.48043372489052616, + "grad_norm": 18.412261962890625, + "learning_rate": 3.1969743026319595e-05, + "logits/chosen": -2.7886910438537598, + "logits/rejected": -2.3111114501953125, + "logps/chosen": -301.91400146484375, + "logps/rejected": -247.4539337158203, + "loss": 0.4836, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -1.845736026763916, + "rewards/margins": 3.18009614944458, + "rewards/rejected": -5.025832176208496, + "step": 1728 + }, + { + "epoch": 0.4854382428581358, + "grad_norm": 17.696378707885742, + "learning_rate": 3.14974598579785e-05, + "logits/chosen": -2.731029510498047, + "logits/rejected": -2.39078688621521, + "logps/chosen": -325.11383056640625, + "logps/rejected": -290.14825439453125, + "loss": 0.4634, + "rewards/accuracies": 0.7291666865348816, + "rewards/chosen": -2.235804557800293, + "rewards/margins": 3.482968330383301, + "rewards/rejected": -5.718773365020752, + "step": 1746 + }, + { + "epoch": 0.49044276082574545, + "grad_norm": 26.636018753051758, + "learning_rate": 3.1024804323332946e-05, + "logits/chosen": -2.7128775119781494, + "logits/rejected": -2.340599536895752, + "logps/chosen": -319.22149658203125, + "logps/rejected": -282.0538330078125, + "loss": 0.4392, + "rewards/accuracies": 0.7708333134651184, + "rewards/chosen": -2.816331624984741, + "rewards/margins": 3.1326615810394287, + "rewards/rejected": -5.948994159698486, + "step": 1764 + }, + { + "epoch": 0.4954472787933551, + "grad_norm": 57.90165710449219, + "learning_rate": 3.0551893955413146e-05, + "logits/chosen": -2.7226057052612305, + "logits/rejected": -2.3010740280151367, + "logps/chosen": -322.99603271484375, + "logps/rejected": -277.3245849609375, + "loss": 0.515, + "rewards/accuracies": 0.7569444179534912, + "rewards/chosen": -1.6943638324737549, + "rewards/margins": 3.3747987747192383, + "rewards/rejected": -5.069162368774414, + "step": 1782 + }, + { + "epoch": 0.5004517967609647, + "grad_norm": 27.138952255249023, + "learning_rate": 3.0078846350617494e-05, + "logits/chosen": -2.7158679962158203, + "logits/rejected": -2.421780824661255, + "logps/chosen": -305.9707336425781, + "logps/rejected": -288.4151916503906, + "loss": 0.3972, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -1.6311149597167969, + "rewards/margins": 3.504594564437866, + "rewards/rejected": -5.135709762573242, + "step": 1800 + }, + { + "epoch": 0.5054563147285744, + "grad_norm": 28.006305694580078, + "learning_rate": 2.9605779139470432e-05, + "logits/chosen": -2.727426290512085, + "logits/rejected": -2.442085027694702, + "logps/chosen": -319.677734375, + "logps/rejected": -284.64154052734375, + "loss": 0.4361, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -2.300724506378174, + "rewards/margins": 3.6030399799346924, + "rewards/rejected": -5.903764724731445, + "step": 1818 + }, + { + "epoch": 0.510460832696184, + "grad_norm": 30.317636489868164, + "learning_rate": 2.9132809957371824e-05, + "logits/chosen": -2.725233793258667, + "logits/rejected": -2.399477958679199, + "logps/chosen": -327.3699035644531, + "logps/rejected": -273.03228759765625, + "loss": 0.3842, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -2.5508806705474854, + "rewards/margins": 3.668443441390991, + "rewards/rejected": -6.219324111938477, + "step": 1836 + }, + { + "epoch": 0.5154653506637937, + "grad_norm": 41.09893798828125, + "learning_rate": 2.86600564153451e-05, + "logits/chosen": -2.826190710067749, + "logits/rejected": -2.542912244796753, + "logps/chosen": -312.857421875, + "logps/rejected": -283.3468017578125, + "loss": 0.4551, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -2.158759355545044, + "rewards/margins": 3.1043689250946045, + "rewards/rejected": -5.263128280639648, + "step": 1854 + }, + { + "epoch": 0.5204698686314033, + "grad_norm": 28.608667373657227, + "learning_rate": 2.818763607079151e-05, + "logits/chosen": -2.8934526443481445, + "logits/rejected": -2.500380516052246, + "logps/chosen": -324.0126037597656, + "logps/rejected": -251.84005737304688, + "loss": 0.4401, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.5546883344650269, + "rewards/margins": 3.3671345710754395, + "rewards/rejected": -4.921822547912598, + "step": 1872 + }, + { + "epoch": 0.525474386599013, + "grad_norm": 30.46738052368164, + "learning_rate": 2.7715666398257643e-05, + "logits/chosen": -2.8571200370788574, + "logits/rejected": -2.5232975482940674, + "logps/chosen": -305.3577575683594, + "logps/rejected": -263.7037353515625, + "loss": 0.4215, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.7066543102264404, + "rewards/margins": 3.4266107082366943, + "rewards/rejected": -5.133265018463135, + "step": 1890 + }, + { + "epoch": 0.5304789045666226, + "grad_norm": 35.60761260986328, + "learning_rate": 2.7244264760223567e-05, + "logits/chosen": -2.8389198780059814, + "logits/rejected": -2.4737510681152344, + "logps/chosen": -305.3123474121094, + "logps/rejected": -270.08349609375, + "loss": 0.4825, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -2.4751136302948, + "rewards/margins": 3.4185471534729004, + "rewards/rejected": -5.893660545349121, + "step": 1908 + }, + { + "epoch": 0.5354834225342323, + "grad_norm": 18.354534149169922, + "learning_rate": 2.677354837791895e-05, + "logits/chosen": -2.8203418254852295, + "logits/rejected": -2.4711532592773438, + "logps/chosen": -310.1116638183594, + "logps/rejected": -271.8264465332031, + "loss": 0.3957, + "rewards/accuracies": 0.8194444179534912, + "rewards/chosen": -2.1739501953125, + "rewards/margins": 3.923552989959717, + "rewards/rejected": -6.097503662109375, + "step": 1926 + }, + { + "epoch": 0.5404879405018419, + "grad_norm": 33.30379104614258, + "learning_rate": 2.6303634302174067e-05, + "logits/chosen": -2.8017544746398926, + "logits/rejected": -2.5004069805145264, + "logps/chosen": -327.4877624511719, + "logps/rejected": -286.92364501953125, + "loss": 0.4031, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": -2.1811113357543945, + "rewards/margins": 3.1494991779327393, + "rewards/rejected": -5.330610752105713, + "step": 1944 + }, + { + "epoch": 0.5454924584694516, + "grad_norm": 59.1282844543457, + "learning_rate": 2.5834639384313426e-05, + "logits/chosen": -2.8410487174987793, + "logits/rejected": -2.506558656692505, + "logps/chosen": -330.2596740722656, + "logps/rejected": -291.4815368652344, + "loss": 0.5108, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -1.4231274127960205, + "rewards/margins": 3.291710376739502, + "rewards/rejected": -4.714838027954102, + "step": 1962 + }, + { + "epoch": 0.5504969764370612, + "grad_norm": 32.75850296020508, + "learning_rate": 2.536668024709884e-05, + "logits/chosen": -2.784243583679199, + "logits/rejected": -2.4926466941833496, + "logps/chosen": -332.8819580078125, + "logps/rejected": -274.2498474121094, + "loss": 0.5484, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -1.9837809801101685, + "rewards/margins": 2.8485584259033203, + "rewards/rejected": -4.832339763641357, + "step": 1980 + }, + { + "epoch": 0.5555014944046709, + "grad_norm": 10.708755493164062, + "learning_rate": 2.4899873255729477e-05, + "logits/chosen": -2.792935609817505, + "logits/rejected": -2.4582226276397705, + "logps/chosen": -302.2508544921875, + "logps/rejected": -274.1002197265625, + "loss": 0.3274, + "rewards/accuracies": 0.8055555820465088, + "rewards/chosen": -1.6284408569335938, + "rewards/margins": 3.9526689052581787, + "rewards/rejected": -5.581110000610352, + "step": 1998 + }, + { + "epoch": 0.5560575519566275, + "eval_logits/chosen": -2.7665135860443115, + "eval_logits/rejected": -2.44524884223938, + "eval_logps/chosen": -309.2798767089844, + "eval_logps/rejected": -264.00555419921875, + "eval_loss": 0.44785141944885254, + "eval_rewards/accuracies": 0.7284172773361206, + "eval_rewards/chosen": -1.6215273141860962, + "eval_rewards/margins": 3.2522411346435547, + "eval_rewards/rejected": -4.873768329620361, + "eval_runtime": 2888.8904, + "eval_samples_per_second": 1.347, + "eval_steps_per_second": 0.674, + "step": 2000 + }, + { + "epoch": 0.5605060123722805, + "grad_norm": 5.3380842208862305, + "learning_rate": 2.443433448890575e-05, + "logits/chosen": -2.793086290359497, + "logits/rejected": -2.4027912616729736, + "logps/chosen": -328.5480041503906, + "logps/rejected": -276.5953369140625, + "loss": 0.4097, + "rewards/accuracies": 0.7708333134651184, + "rewards/chosen": -1.7748560905456543, + "rewards/margins": 3.9136385917663574, + "rewards/rejected": -5.68849515914917, + "step": 2016 + }, + { + "epoch": 0.5655105303398902, + "grad_norm": 24.693368911743164, + "learning_rate": 2.397017970996466e-05, + "logits/chosen": -2.7521121501922607, + "logits/rejected": -2.3789448738098145, + "logps/chosen": -292.8934020996094, + "logps/rejected": -244.35665893554688, + "loss": 0.4414, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -2.16286301612854, + "rewards/margins": 3.7771759033203125, + "rewards/rejected": -5.940038681030273, + "step": 2034 + }, + { + "epoch": 0.5705150483074998, + "grad_norm": 32.282615661621094, + "learning_rate": 2.3507524338093432e-05, + "logits/chosen": -2.715144634246826, + "logits/rejected": -2.503474235534668, + "logps/chosen": -296.28790283203125, + "logps/rejected": -294.35955810546875, + "loss": 0.4622, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -1.4016257524490356, + "rewards/margins": 3.682527780532837, + "rewards/rejected": -5.084153652191162, + "step": 2052 + }, + { + "epoch": 0.5755195662751095, + "grad_norm": 17.326995849609375, + "learning_rate": 2.304648341962881e-05, + "logits/chosen": -2.704878330230713, + "logits/rejected": -2.4385833740234375, + "logps/chosen": -308.77435302734375, + "logps/rejected": -271.3253173828125, + "loss": 0.5039, + "rewards/accuracies": 0.7083333134651184, + "rewards/chosen": -1.7743431329727173, + "rewards/margins": 3.1500954627990723, + "rewards/rejected": -4.9244384765625, + "step": 2070 + }, + { + "epoch": 0.5805240842427192, + "grad_norm": 39.15388488769531, + "learning_rate": 2.2587171599448978e-05, + "logits/chosen": -2.69236421585083, + "logits/rejected": -2.3836355209350586, + "logps/chosen": -309.4457702636719, + "logps/rejected": -266.6226806640625, + "loss": 0.5611, + "rewards/accuracies": 0.6736111044883728, + "rewards/chosen": -1.9411275386810303, + "rewards/margins": 2.748253583908081, + "rewards/rejected": -4.689381122589111, + "step": 2088 + }, + { + "epoch": 0.5855286022103288, + "grad_norm": 24.7508487701416, + "learning_rate": 2.212970309246538e-05, + "logits/chosen": -2.8038172721862793, + "logits/rejected": -2.4395999908447266, + "logps/chosen": -298.9497985839844, + "logps/rejected": -253.84364318847656, + "loss": 0.4794, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9884018898010254, + "rewards/margins": 3.1343109607696533, + "rewards/rejected": -5.1227126121521, + "step": 2106 + }, + { + "epoch": 0.5905331201779385, + "grad_norm": 34.415287017822266, + "learning_rate": 2.1674191655221417e-05, + "logits/chosen": -2.692016363143921, + "logits/rejected": -2.4969379901885986, + "logps/chosen": -343.382568359375, + "logps/rejected": -295.4187927246094, + "loss": 0.5438, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -1.783701777458191, + "rewards/margins": 2.760159969329834, + "rewards/rejected": -4.5438618659973145, + "step": 2124 + }, + { + "epoch": 0.595537638145548, + "grad_norm": 6.282063961029053, + "learning_rate": 2.1220750557605203e-05, + "logits/chosen": -2.750368595123291, + "logits/rejected": -2.3735711574554443, + "logps/chosen": -329.3069152832031, + "logps/rejected": -281.57305908203125, + "loss": 0.3962, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": -2.1854476928710938, + "rewards/margins": 3.8083245754241943, + "rewards/rejected": -5.993772506713867, + "step": 2142 + }, + { + "epoch": 0.6005421561131578, + "grad_norm": 34.97222900390625, + "learning_rate": 2.076949255468317e-05, + "logits/chosen": -2.7288906574249268, + "logits/rejected": -2.40971302986145, + "logps/chosen": -325.5711364746094, + "logps/rejected": -279.8585510253906, + "loss": 0.4488, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7341102361679077, + "rewards/margins": 3.346696615219116, + "rewards/rejected": -5.080807209014893, + "step": 2160 + }, + { + "epoch": 0.6055466740807673, + "grad_norm": 74.58136749267578, + "learning_rate": 2.0320529858661906e-05, + "logits/chosen": -2.7919845581054688, + "logits/rejected": -2.330806255340576, + "logps/chosen": -350.3424987792969, + "logps/rejected": -283.5325622558594, + "loss": 0.458, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.815643548965454, + "rewards/margins": 4.053121566772461, + "rewards/rejected": -5.868765354156494, + "step": 2178 + }, + { + "epoch": 0.610551192048377, + "grad_norm": 27.076066970825195, + "learning_rate": 1.987397411098475e-05, + "logits/chosen": -2.673175573348999, + "logits/rejected": -2.440861701965332, + "logps/chosen": -284.3968505859375, + "logps/rejected": -261.8418884277344, + "loss": 0.5901, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -1.5232903957366943, + "rewards/margins": 3.77292537689209, + "rewards/rejected": -5.296216011047363, + "step": 2196 + }, + { + "epoch": 0.6155557100159866, + "grad_norm": 39.62623596191406, + "learning_rate": 1.9429936354570604e-05, + "logits/chosen": -2.778045177459717, + "logits/rejected": -2.4611825942993164, + "logps/chosen": -315.9211730957031, + "logps/rejected": -260.84832763671875, + "loss": 0.5069, + "rewards/accuracies": 0.7083333134651184, + "rewards/chosen": -1.3145393133163452, + "rewards/margins": 2.574803352355957, + "rewards/rejected": -3.889342784881592, + "step": 2214 + }, + { + "epoch": 0.6205602279835963, + "grad_norm": 49.92598342895508, + "learning_rate": 1.898852700620124e-05, + "logits/chosen": -2.60537052154541, + "logits/rejected": -2.3992955684661865, + "logps/chosen": -291.26409912109375, + "logps/rejected": -269.22711181640625, + "loss": 0.4579, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4664363861083984, + "rewards/margins": 3.583827495574951, + "rewards/rejected": -5.050264358520508, + "step": 2232 + }, + { + "epoch": 0.6255647459512059, + "grad_norm": 48.596107482910156, + "learning_rate": 1.8549855829064594e-05, + "logits/chosen": -2.7216684818267822, + "logits/rejected": -2.540314197540283, + "logps/chosen": -290.99017333984375, + "logps/rejected": -277.0935974121094, + "loss": 0.4279, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4961212873458862, + "rewards/margins": 3.441756010055542, + "rewards/rejected": -4.937877178192139, + "step": 2250 + }, + { + "epoch": 0.6305692639188156, + "grad_norm": 17.144893646240234, + "learning_rate": 1.8114031905460407e-05, + "logits/chosen": -2.826719045639038, + "logits/rejected": -2.5303499698638916, + "logps/chosen": -290.9403076171875, + "logps/rejected": -269.9739685058594, + "loss": 0.5296, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -1.7913404703140259, + "rewards/margins": 2.7407760620117188, + "rewards/rejected": -4.532116413116455, + "step": 2268 + }, + { + "epoch": 0.6355737818864252, + "grad_norm": 32.463714599609375, + "learning_rate": 1.76811636096753e-05, + "logits/chosen": -2.765101432800293, + "logits/rejected": -2.3980343341827393, + "logps/chosen": -325.4449157714844, + "logps/rejected": -255.06222534179688, + "loss": 0.5, + "rewards/accuracies": 0.7083333134651184, + "rewards/chosen": -1.558535099029541, + "rewards/margins": 3.0294504165649414, + "rewards/rejected": -4.587985515594482, + "step": 2286 + }, + { + "epoch": 0.6405782998540349, + "grad_norm": 26.847185134887695, + "learning_rate": 1.7251358581033784e-05, + "logits/chosen": -2.779033660888672, + "logits/rejected": -2.4409353733062744, + "logps/chosen": -321.6474914550781, + "logps/rejected": -267.8830871582031, + "loss": 0.4256, + "rewards/accuracies": 0.7569444179534912, + "rewards/chosen": -1.7168571949005127, + "rewards/margins": 3.4056715965270996, + "rewards/rejected": -5.122529029846191, + "step": 2304 + }, + { + "epoch": 0.6455828178216445, + "grad_norm": 55.49918746948242, + "learning_rate": 1.6824723697132142e-05, + "logits/chosen": -2.7350029945373535, + "logits/rejected": -2.4814822673797607, + "logps/chosen": -323.92486572265625, + "logps/rejected": -291.02362060546875, + "loss": 0.5156, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -2.151139736175537, + "rewards/margins": 3.3088717460632324, + "rewards/rejected": -5.460011959075928, + "step": 2322 + }, + { + "epoch": 0.6505873357892542, + "grad_norm": 46.12184143066406, + "learning_rate": 1.6401365047261673e-05, + "logits/chosen": -2.73891544342041, + "logits/rejected": -2.43554425239563, + "logps/chosen": -288.85528564453125, + "logps/rejected": -265.0264892578125, + "loss": 0.3693, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7020026445388794, + "rewards/margins": 3.5679426193237305, + "rewards/rejected": -5.26994514465332, + "step": 2340 + }, + { + "epoch": 0.6555918537568638, + "grad_norm": 48.02040100097656, + "learning_rate": 1.5981387906027945e-05, + "logits/chosen": -2.7429189682006836, + "logits/rejected": -2.4096295833587646, + "logps/chosen": -353.75726318359375, + "logps/rejected": -290.4449768066406, + "loss": 0.4409, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1648786067962646, + "rewards/margins": 2.6874301433563232, + "rewards/rejected": -4.852308750152588, + "step": 2358 + }, + { + "epoch": 0.6605963717244735, + "grad_norm": 65.87178802490234, + "learning_rate": 1.5564896707172673e-05, + "logits/chosen": -2.6645004749298096, + "logits/rejected": -2.403073787689209, + "logps/chosen": -302.0002136230469, + "logps/rejected": -261.3192138671875, + "loss": 0.5171, + "rewards/accuracies": 0.6736111044883728, + "rewards/chosen": -1.543456792831421, + "rewards/margins": 2.984374761581421, + "rewards/rejected": -4.527831554412842, + "step": 2376 + }, + { + "epoch": 0.6656008896920831, + "grad_norm": 24.06062126159668, + "learning_rate": 1.5151995017604648e-05, + "logits/chosen": -2.7314882278442383, + "logits/rejected": -2.4527502059936523, + "logps/chosen": -287.94683837890625, + "logps/rejected": -271.3494873046875, + "loss": 0.4172, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -1.6124606132507324, + "rewards/margins": 3.9703075885772705, + "rewards/rejected": -5.582768440246582, + "step": 2394 + }, + { + "epoch": 0.6706054076596928, + "grad_norm": 16.180158615112305, + "learning_rate": 1.4742785511646304e-05, + "logits/chosen": -2.736513376235962, + "logits/rejected": -2.3983187675476074, + "logps/chosen": -333.7608947753906, + "logps/rejected": -273.1593322753906, + "loss": 0.4302, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9379234313964844, + "rewards/margins": 3.50919508934021, + "rewards/rejected": -5.447118282318115, + "step": 2412 + }, + { + "epoch": 0.6756099256273024, + "grad_norm": 34.05958557128906, + "learning_rate": 1.4337369945502085e-05, + "logits/chosen": -2.787388801574707, + "logits/rejected": -2.4262053966522217, + "logps/chosen": -297.0041809082031, + "logps/rejected": -258.23773193359375, + "loss": 0.4191, + "rewards/accuracies": 0.7291666865348816, + "rewards/chosen": -2.011275291442871, + "rewards/margins": 3.1109867095947266, + "rewards/rejected": -5.122262477874756, + "step": 2430 + }, + { + "epoch": 0.6806144435949121, + "grad_norm": 41.906883239746094, + "learning_rate": 1.3935849131955284e-05, + "logits/chosen": -2.801105260848999, + "logits/rejected": -2.496077537536621, + "logps/chosen": -313.3427429199219, + "logps/rejected": -287.7723083496094, + "loss": 0.4061, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -2.1190905570983887, + "rewards/margins": 3.29622220993042, + "rewards/rejected": -5.415312767028809, + "step": 2448 + }, + { + "epoch": 0.6856189615625217, + "grad_norm": 28.44654083251953, + "learning_rate": 1.3538322915299324e-05, + "logits/chosen": -2.739797830581665, + "logits/rejected": -2.368856906890869, + "logps/chosen": -309.02685546875, + "logps/rejected": -274.4028625488281, + "loss": 0.3908, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": -1.9836477041244507, + "rewards/margins": 4.328744888305664, + "rewards/rejected": -6.312392711639404, + "step": 2466 + }, + { + "epoch": 0.6906234795301314, + "grad_norm": 11.482656478881836, + "learning_rate": 1.314489014651006e-05, + "logits/chosen": -2.7526888847351074, + "logits/rejected": -2.470857620239258, + "logps/chosen": -299.3045654296875, + "logps/rejected": -265.3349609375, + "loss": 0.5829, + "rewards/accuracies": 0.6736111044883728, + "rewards/chosen": -2.428022623062134, + "rewards/margins": 2.994326114654541, + "rewards/rejected": -5.422348499298096, + "step": 2484 + }, + { + "epoch": 0.6950719399457844, + "eval_logits/chosen": -2.73663330078125, + "eval_logits/rejected": -2.4175937175750732, + "eval_logps/chosen": -315.6849060058594, + "eval_logps/rejected": -272.02569580078125, + "eval_loss": 0.4398120939731598, + "eval_rewards/accuracies": 0.7307296991348267, + "eval_rewards/chosen": -2.262033224105835, + "eval_rewards/margins": 3.4137518405914307, + "eval_rewards/rejected": -5.675785064697266, + "eval_runtime": 2891.4419, + "eval_samples_per_second": 1.346, + "eval_steps_per_second": 0.673, + "step": 2500 + }, + { + "epoch": 0.6956279974977411, + "grad_norm": 15.894915580749512, + "learning_rate": 1.275564865866487e-05, + "logits/chosen": -2.640007972717285, + "logits/rejected": -2.460934638977051, + "logps/chosen": -300.5777587890625, + "logps/rejected": -265.5496826171875, + "loss": 0.5483, + "rewards/accuracies": 0.6736111044883728, + "rewards/chosen": -2.134329319000244, + "rewards/margins": 3.097062349319458, + "rewards/rejected": -5.231391429901123, + "step": 2502 + }, + { + "epoch": 0.7006325154653507, + "grad_norm": 35.01946258544922, + "learning_rate": 1.2370695242615e-05, + "logits/chosen": -2.683218002319336, + "logits/rejected": -2.3299059867858887, + "logps/chosen": -314.0257263183594, + "logps/rejected": -278.9731750488281, + "loss": 0.3886, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -2.5734806060791016, + "rewards/margins": 4.0837554931640625, + "rewards/rejected": -6.657236576080322, + "step": 2520 + }, + { + "epoch": 0.7056370334329604, + "grad_norm": 54.64206314086914, + "learning_rate": 1.1990125622917034e-05, + "logits/chosen": -2.647454261779785, + "logits/rejected": -2.355983257293701, + "logps/chosen": -317.1181945800781, + "logps/rejected": -273.063720703125, + "loss": 0.4489, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -2.29117751121521, + "rewards/margins": 2.928903579711914, + "rewards/rejected": -5.220081329345703, + "step": 2538 + }, + { + "epoch": 0.71064155140057, + "grad_norm": 45.05056381225586, + "learning_rate": 1.161403443402955e-05, + "logits/chosen": -2.708786964416504, + "logits/rejected": -2.3760673999786377, + "logps/chosen": -315.9114685058594, + "logps/rejected": -280.67047119140625, + "loss": 0.3791, + "rewards/accuracies": 0.7708333134651184, + "rewards/chosen": -1.7743655443191528, + "rewards/margins": 4.166933059692383, + "rewards/rejected": -5.941298484802246, + "step": 2556 + }, + { + "epoch": 0.7156460693681796, + "grad_norm": 18.02887725830078, + "learning_rate": 1.1242515196780759e-05, + "logits/chosen": -2.7464094161987305, + "logits/rejected": -2.3915271759033203, + "logps/chosen": -341.52777099609375, + "logps/rejected": -278.60736083984375, + "loss": 0.4795, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": -2.121417999267578, + "rewards/margins": 3.1194028854370117, + "rewards/rejected": -5.24082088470459, + "step": 2574 + }, + { + "epoch": 0.7206505873357892, + "grad_norm": 38.05537796020508, + "learning_rate": 1.0875660295113154e-05, + "logits/chosen": -2.745269775390625, + "logits/rejected": -2.325620412826538, + "logps/chosen": -335.1282043457031, + "logps/rejected": -277.2452087402344, + "loss": 0.4032, + "rewards/accuracies": 0.7986111044883728, + "rewards/chosen": -1.916572093963623, + "rewards/margins": 3.6244702339172363, + "rewards/rejected": -5.541042327880859, + "step": 2592 + }, + { + "epoch": 0.7256551053033989, + "grad_norm": 22.900930404663086, + "learning_rate": 1.051356095311081e-05, + "logits/chosen": -2.7446491718292236, + "logits/rejected": -2.427164316177368, + "logps/chosen": -320.9158935546875, + "logps/rejected": -280.6097412109375, + "loss": 0.3997, + "rewards/accuracies": 0.7847222089767456, + "rewards/chosen": -1.4083409309387207, + "rewards/margins": 3.733079195022583, + "rewards/rejected": -5.141420364379883, + "step": 2610 + }, + { + "epoch": 0.7306596232710085, + "grad_norm": 45.608585357666016, + "learning_rate": 1.0156307212315172e-05, + "logits/chosen": -2.7047512531280518, + "logits/rejected": -2.3908822536468506, + "logps/chosen": -291.3263854980469, + "logps/rejected": -252.54534912109375, + "loss": 0.4405, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.8275411128997803, + "rewards/margins": 3.8801584243774414, + "rewards/rejected": -5.707700252532959, + "step": 2628 + }, + { + "epoch": 0.7356641412386182, + "grad_norm": 37.82722473144531, + "learning_rate": 9.803987909334771e-06, + "logits/chosen": -2.677929639816284, + "logits/rejected": -2.4055581092834473, + "logps/chosen": -307.4137878417969, + "logps/rejected": -269.66644287109375, + "loss": 0.3928, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": -1.88889741897583, + "rewards/margins": 3.8182270526885986, + "rewards/rejected": -5.707124710083008, + "step": 2646 + }, + { + "epoch": 0.7406686592062278, + "grad_norm": 14.627944946289062, + "learning_rate": 9.456690653754689e-06, + "logits/chosen": -2.7908084392547607, + "logits/rejected": -2.32016921043396, + "logps/chosen": -336.2296142578125, + "logps/rejected": -259.0232238769531, + "loss": 0.3361, + "rewards/accuracies": 0.7638888955116272, + "rewards/chosen": -1.1844143867492676, + "rewards/margins": 3.5488576889038086, + "rewards/rejected": -4.73327112197876, + "step": 2664 + }, + { + "epoch": 0.7456731771738375, + "grad_norm": 44.05823516845703, + "learning_rate": 9.114501806351085e-06, + "logits/chosen": -2.7528133392333984, + "logits/rejected": -2.548713207244873, + "logps/chosen": -306.82275390625, + "logps/rejected": -282.6586608886719, + "loss": 0.5731, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5416101217269897, + "rewards/margins": 3.034811496734619, + "rewards/rejected": -4.576421737670898, + "step": 2682 + }, + { + "epoch": 0.7506776951414471, + "grad_norm": 13.164937973022461, + "learning_rate": 8.777506457616279e-06, + "logits/chosen": -2.656193494796753, + "logits/rejected": -2.365365743637085, + "logps/chosen": -302.3138427734375, + "logps/rejected": -259.0361633300781, + "loss": 0.4363, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8880829811096191, + "rewards/margins": 3.280186891555786, + "rewards/rejected": -5.168270111083984, + "step": 2700 + }, + { + "epoch": 0.7556822131090568, + "grad_norm": 46.482601165771484, + "learning_rate": 8.44578840659967e-06, + "logits/chosen": -2.710949659347534, + "logits/rejected": -2.394054651260376, + "logps/chosen": -311.81170654296875, + "logps/rejected": -274.8843994140625, + "loss": 0.4529, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.32057523727417, + "rewards/margins": 3.8870952129364014, + "rewards/rejected": -5.20767068862915, + "step": 2718 + }, + { + "epoch": 0.7606867310766664, + "grad_norm": 25.94777488708496, + "learning_rate": 8.119430140069845e-06, + "logits/chosen": -2.748070240020752, + "logits/rejected": -2.385664939880371, + "logps/chosen": -302.9721984863281, + "logps/rejected": -262.8243713378906, + "loss": 0.4365, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.314685344696045, + "rewards/margins": 2.9653894901275635, + "rewards/rejected": -4.280074596405029, + "step": 2736 + }, + { + "epoch": 0.7656912490442761, + "grad_norm": 35.650943756103516, + "learning_rate": 7.798512812002993e-06, + "logits/chosen": -2.7013843059539795, + "logits/rejected": -2.3474152088165283, + "logps/chosen": -296.6829833984375, + "logps/rejected": -246.14752197265625, + "loss": 0.4185, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5774956941604614, + "rewards/margins": 3.504556179046631, + "rewards/rejected": -5.0820512771606445, + "step": 2754 + }, + { + "epoch": 0.7706957670118857, + "grad_norm": 11.78795337677002, + "learning_rate": 7.483116223402681e-06, + "logits/chosen": -2.7927751541137695, + "logits/rejected": -2.434936761856079, + "logps/chosen": -327.77276611328125, + "logps/rejected": -267.130859375, + "loss": 0.4826, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -1.440108060836792, + "rewards/margins": 2.941638469696045, + "rewards/rejected": -4.381746292114258, + "step": 2772 + }, + { + "epoch": 0.7757002849794954, + "grad_norm": 49.560516357421875, + "learning_rate": 7.173318802456157e-06, + "logits/chosen": -2.7469167709350586, + "logits/rejected": -2.3771026134490967, + "logps/chosen": -328.4552001953125, + "logps/rejected": -294.0330810546875, + "loss": 0.3957, + "rewards/accuracies": 0.7708333134651184, + "rewards/chosen": -1.2726744413375854, + "rewards/margins": 4.180568218231201, + "rewards/rejected": -5.453242301940918, + "step": 2790 + }, + { + "epoch": 0.780704802947105, + "grad_norm": 26.68621253967285, + "learning_rate": 6.869197585031959e-06, + "logits/chosen": -2.662766218185425, + "logits/rejected": -2.436544895172119, + "logps/chosen": -300.9196472167969, + "logps/rejected": -273.5084228515625, + "loss": 0.4148, + "rewards/accuracies": 0.7569444179534912, + "rewards/chosen": -1.5453829765319824, + "rewards/margins": 3.941492795944214, + "rewards/rejected": -5.486876487731934, + "step": 2808 + }, + { + "epoch": 0.7857093209147147, + "grad_norm": 35.615352630615234, + "learning_rate": 6.570828195523787e-06, + "logits/chosen": -2.677241325378418, + "logits/rejected": -2.470067024230957, + "logps/chosen": -286.5127258300781, + "logps/rejected": -267.2671813964844, + "loss": 0.5251, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3681201934814453, + "rewards/margins": 2.601278305053711, + "rewards/rejected": -4.9693989753723145, + "step": 2826 + }, + { + "epoch": 0.7907138388823243, + "grad_norm": 24.046659469604492, + "learning_rate": 6.278284828045317e-06, + "logits/chosen": -2.8378920555114746, + "logits/rejected": -2.430727243423462, + "logps/chosen": -290.444091796875, + "logps/rejected": -250.02720642089844, + "loss": 0.4247, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5597851276397705, + "rewards/margins": 3.6326475143432617, + "rewards/rejected": -5.192432403564453, + "step": 2844 + }, + { + "epoch": 0.795718356849934, + "grad_norm": 45.197017669677734, + "learning_rate": 5.991640227980711e-06, + "logits/chosen": -2.7528531551361084, + "logits/rejected": -2.4077205657958984, + "logps/chosen": -313.0735168457031, + "logps/rejected": -259.5475158691406, + "loss": 0.376, + "rewards/accuracies": 0.7569444179534912, + "rewards/chosen": -1.7554352283477783, + "rewards/margins": 3.788660764694214, + "rewards/rejected": -5.544095993041992, + "step": 2862 + }, + { + "epoch": 0.8007228748175436, + "grad_norm": 48.10368347167969, + "learning_rate": 5.710965673895368e-06, + "logits/chosen": -2.7433524131774902, + "logits/rejected": -2.4919819831848145, + "logps/chosen": -299.95904541015625, + "logps/rejected": -284.4269714355469, + "loss": 0.4149, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3762539625167847, + "rewards/margins": 3.7069790363311768, + "rewards/rejected": -5.083233833312988, + "step": 2880 + }, + { + "epoch": 0.8057273927851533, + "grad_norm": 35.14622116088867, + "learning_rate": 5.4363309598114145e-06, + "logits/chosen": -2.7596993446350098, + "logits/rejected": -2.3708279132843018, + "logps/chosen": -298.2368469238281, + "logps/rejected": -249.60655212402344, + "loss": 0.4001, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -1.6462877988815308, + "rewards/margins": 3.60522723197937, + "rewards/rejected": -5.251514434814453, + "step": 2898 + }, + { + "epoch": 0.810731910752763, + "grad_norm": 36.663394927978516, + "learning_rate": 5.167804377852343e-06, + "logits/chosen": -2.746026039123535, + "logits/rejected": -2.4985361099243164, + "logps/chosen": -322.4928283691406, + "logps/rejected": -294.7042541503906, + "loss": 0.4623, + "rewards/accuracies": 0.7291666865348816, + "rewards/chosen": -1.8921818733215332, + "rewards/margins": 3.328840732574463, + "rewards/rejected": -5.221022605895996, + "step": 2916 + }, + { + "epoch": 0.8157364287203726, + "grad_norm": 18.040237426757812, + "learning_rate": 4.905452701261157e-06, + "logits/chosen": -2.7748162746429443, + "logits/rejected": -2.3919949531555176, + "logps/chosen": -318.8956298828125, + "logps/rejected": -273.5999755859375, + "loss": 0.4515, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": -2.0273349285125732, + "rewards/margins": 3.6705000400543213, + "rewards/rejected": -5.697835445404053, + "step": 2934 + }, + { + "epoch": 0.8207409466879823, + "grad_norm": 12.170748710632324, + "learning_rate": 4.649341167796142e-06, + "logits/chosen": -2.713855504989624, + "logits/rejected": -2.373263359069824, + "logps/chosen": -333.3356628417969, + "logps/rejected": -281.8557434082031, + "loss": 0.4246, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": -1.9140756130218506, + "rewards/margins": 3.825533866882324, + "rewards/rejected": -5.7396087646484375, + "step": 2952 + }, + { + "epoch": 0.8257454646555918, + "grad_norm": 40.6090202331543, + "learning_rate": 4.399533463508556e-06, + "logits/chosen": -2.7735509872436523, + "logits/rejected": -2.385148763656616, + "logps/chosen": -320.83404541015625, + "logps/rejected": -263.9934997558594, + "loss": 0.3941, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.4935352802276611, + "rewards/margins": 3.724492311477661, + "rewards/rejected": -5.218027591705322, + "step": 2970 + }, + { + "epoch": 0.8307499826232015, + "grad_norm": 25.662874221801758, + "learning_rate": 4.156091706906054e-06, + "logits/chosen": -2.7486495971679688, + "logits/rejected": -2.3925347328186035, + "logps/chosen": -299.8246154785156, + "logps/rejected": -265.2615051269531, + "loss": 0.4669, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -1.317674160003662, + "rewards/margins": 3.691737651824951, + "rewards/rejected": -5.0094122886657715, + "step": 2988 + }, + { + "epoch": 0.8340863279349413, + "eval_logits/chosen": -2.752748966217041, + "eval_logits/rejected": -2.431671380996704, + "eval_logps/chosen": -309.14166259765625, + "eval_logps/rejected": -266.5367431640625, + "eval_loss": 0.4303225874900818, + "eval_rewards/accuracies": 0.740750253200531, + "eval_rewards/chosen": -1.6077065467834473, + "eval_rewards/margins": 3.5191824436187744, + "eval_rewards/rejected": -5.126889228820801, + "eval_runtime": 2888.6018, + "eval_samples_per_second": 1.347, + "eval_steps_per_second": 0.674, + "step": 3000 + }, + { + "epoch": 0.8357545005908111, + "grad_norm": 20.970500946044922, + "learning_rate": 3.919076433506019e-06, + "logits/chosen": -2.73111629486084, + "logits/rejected": -2.3597335815429688, + "logps/chosen": -333.4314880371094, + "logps/rejected": -288.4777526855469, + "loss": 0.4186, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5794798135757446, + "rewards/margins": 4.338929653167725, + "rewards/rejected": -5.9184088706970215, + "step": 3006 + }, + { + "epoch": 0.8407590185584208, + "grad_norm": 19.71300506591797, + "learning_rate": 3.6885465807824483e-06, + "logits/chosen": -2.7255218029022217, + "logits/rejected": -2.5009584426879883, + "logps/chosen": -290.33990478515625, + "logps/rejected": -273.0076904296875, + "loss": 0.4672, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -1.5376120805740356, + "rewards/margins": 3.201075315475464, + "rewards/rejected": -4.738687515258789, + "step": 3024 + }, + { + "epoch": 0.8457635365260304, + "grad_norm": 30.243356704711914, + "learning_rate": 3.4645594735102815e-06, + "logits/chosen": -2.7144222259521484, + "logits/rejected": -2.390984058380127, + "logps/chosen": -334.29058837890625, + "logps/rejected": -274.4745788574219, + "loss": 0.4527, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.2189850807189941, + "rewards/margins": 3.407120704650879, + "rewards/rejected": -4.626105308532715, + "step": 3042 + }, + { + "epoch": 0.8507680544936401, + "grad_norm": 9.763965606689453, + "learning_rate": 3.247170809510688e-06, + "logits/chosen": -2.672239303588867, + "logits/rejected": -2.3834152221679688, + "logps/chosen": -327.905029296875, + "logps/rejected": -289.80035400390625, + "loss": 0.4095, + "rewards/accuracies": 0.7708333134651184, + "rewards/chosen": -1.8000273704528809, + "rewards/margins": 3.705233335494995, + "rewards/rejected": -5.505261421203613, + "step": 3060 + }, + { + "epoch": 0.8557725724612497, + "grad_norm": 40.514625549316406, + "learning_rate": 3.0364346458009595e-06, + "logits/chosen": -2.700105667114258, + "logits/rejected": -2.4108972549438477, + "logps/chosen": -299.8682861328125, + "logps/rejected": -272.5063781738281, + "loss": 0.5383, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -1.90571129322052, + "rewards/margins": 3.280506134033203, + "rewards/rejected": -5.186217308044434, + "step": 3078 + }, + { + "epoch": 0.8607770904288594, + "grad_norm": 20.45553970336914, + "learning_rate": 2.8324033851524156e-06, + "logits/chosen": -2.819423198699951, + "logits/rejected": -2.410477638244629, + "logps/chosen": -309.5467529296875, + "logps/rejected": -256.22113037109375, + "loss": 0.432, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -1.6138743162155151, + "rewards/margins": 3.463773488998413, + "rewards/rejected": -5.077648162841797, + "step": 3096 + }, + { + "epoch": 0.865781608396469, + "grad_norm": 19.980777740478516, + "learning_rate": 2.6351277630596417e-06, + "logits/chosen": -2.7615110874176025, + "logits/rejected": -2.5163559913635254, + "logps/chosen": -297.98760986328125, + "logps/rejected": -287.3669738769531, + "loss": 0.4199, + "rewards/accuracies": 0.7708333134651184, + "rewards/chosen": -1.6370689868927002, + "rewards/margins": 4.460258960723877, + "rewards/rejected": -6.09732723236084, + "step": 3114 + }, + { + "epoch": 0.8707861263640787, + "grad_norm": 15.881637573242188, + "learning_rate": 2.4446568351243447e-06, + "logits/chosen": -2.6440608501434326, + "logits/rejected": -2.3766634464263916, + "logps/chosen": -315.4722900390625, + "logps/rejected": -295.0150146484375, + "loss": 0.4, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": -1.3497464656829834, + "rewards/margins": 3.961167573928833, + "rewards/rejected": -5.310914039611816, + "step": 3132 + }, + { + "epoch": 0.8757906443316883, + "grad_norm": 49.29906463623047, + "learning_rate": 2.2610379648569023e-06, + "logits/chosen": -2.7368593215942383, + "logits/rejected": -2.466759204864502, + "logps/chosen": -326.05938720703125, + "logps/rejected": -280.38568115234375, + "loss": 0.499, + "rewards/accuracies": 0.7291666865348816, + "rewards/chosen": -1.4365172386169434, + "rewards/margins": 3.243601083755493, + "rewards/rejected": -4.680118083953857, + "step": 3150 + }, + { + "epoch": 0.880795162299298, + "grad_norm": 41.694732666015625, + "learning_rate": 2.084316811898742e-06, + "logits/chosen": -2.81174373626709, + "logits/rejected": -2.4293229579925537, + "logps/chosen": -307.6359558105469, + "logps/rejected": -265.67962646484375, + "loss": 0.4365, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -1.5170542001724243, + "rewards/margins": 3.6684091091156006, + "rewards/rejected": -5.185462951660156, + "step": 3168 + }, + { + "epoch": 0.8857996802669076, + "grad_norm": 30.265064239501953, + "learning_rate": 1.9145373206683313e-06, + "logits/chosen": -2.7658839225769043, + "logits/rejected": -2.447499990463257, + "logps/chosen": -316.4184875488281, + "logps/rejected": -271.330810546875, + "loss": 0.4503, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.9047456979751587, + "rewards/margins": 3.7025341987609863, + "rewards/rejected": -5.6072797775268555, + "step": 3186 + }, + { + "epoch": 0.8908041982345173, + "grad_norm": 29.625341415405273, + "learning_rate": 1.7517417094337517e-06, + "logits/chosen": -2.7008306980133057, + "logits/rejected": -2.401216983795166, + "logps/chosen": -324.0052490234375, + "logps/rejected": -280.17156982421875, + "loss": 0.4115, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": -1.634941577911377, + "rewards/margins": 3.371777057647705, + "rewards/rejected": -5.00671911239624, + "step": 3204 + }, + { + "epoch": 0.8958087162021269, + "grad_norm": 44.7132682800293, + "learning_rate": 1.5959704598144632e-06, + "logits/chosen": -2.7645680904388428, + "logits/rejected": -2.538956880569458, + "logps/chosen": -310.97369384765625, + "logps/rejected": -279.1435241699219, + "loss": 0.5964, + "rewards/accuracies": 0.7013888955116272, + "rewards/chosen": -1.7079379558563232, + "rewards/margins": 2.9834136962890625, + "rewards/rejected": -4.691351413726807, + "step": 3222 + }, + { + "epoch": 0.9008132341697366, + "grad_norm": 25.25443458557129, + "learning_rate": 1.4472623067149493e-06, + "logits/chosen": -2.7592039108276367, + "logits/rejected": -2.4473934173583984, + "logps/chosen": -318.0968933105469, + "logps/rejected": -276.5232849121094, + "loss": 0.4552, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4127529859542847, + "rewards/margins": 3.2837536334991455, + "rewards/rejected": -4.696506977081299, + "step": 3240 + }, + { + "epoch": 0.9058177521373462, + "grad_norm": 39.80801773071289, + "learning_rate": 1.3056542286926799e-06, + "logits/chosen": -2.7281241416931152, + "logits/rejected": -2.513171672821045, + "logps/chosen": -327.438232421875, + "logps/rejected": -298.3660888671875, + "loss": 0.4166, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": -1.3070768117904663, + "rewards/margins": 3.489137887954712, + "rewards/rejected": -4.7962141036987305, + "step": 3258 + }, + { + "epoch": 0.9108222701049559, + "grad_norm": 36.14491653442383, + "learning_rate": 1.1711814387628183e-06, + "logits/chosen": -2.7403101921081543, + "logits/rejected": -2.4477198123931885, + "logps/chosen": -311.3759765625, + "logps/rejected": -271.148681640625, + "loss": 0.4525, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.6331597566604614, + "rewards/margins": 3.1790125370025635, + "rewards/rejected": -4.812172889709473, + "step": 3276 + }, + { + "epoch": 0.9158267880725655, + "grad_norm": 18.024696350097656, + "learning_rate": 1.0438773756419773e-06, + "logits/chosen": -2.799095630645752, + "logits/rejected": -2.42502498626709, + "logps/chosen": -340.5537109375, + "logps/rejected": -290.1743469238281, + "loss": 0.3969, + "rewards/accuracies": 0.7916666865348816, + "rewards/chosen": -1.8523305654525757, + "rewards/margins": 3.7434768676757812, + "rewards/rejected": -5.5958075523376465, + "step": 3294 + }, + { + "epoch": 0.9208313060401752, + "grad_norm": 18.034805297851562, + "learning_rate": 9.237736954331855e-07, + "logits/chosen": -2.7686593532562256, + "logits/rejected": -2.432816505432129, + "logps/chosen": -302.0719909667969, + "logps/rejected": -280.1247253417969, + "loss": 0.3519, + "rewards/accuracies": 0.7569444179534912, + "rewards/chosen": -1.650702714920044, + "rewards/margins": 4.313185691833496, + "rewards/rejected": -5.963888168334961, + "step": 3312 + }, + { + "epoch": 0.9258358240077849, + "grad_norm": 18.73773765563965, + "learning_rate": 8.109002637540686e-07, + "logits/chosen": -2.811631441116333, + "logits/rejected": -2.403079032897949, + "logps/chosen": -340.51214599609375, + "logps/rejected": -269.27496337890625, + "loss": 0.4734, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -1.6071511507034302, + "rewards/margins": 3.59777569770813, + "rewards/rejected": -5.20492696762085, + "step": 3330 + }, + { + "epoch": 0.9308403419753944, + "grad_norm": 53.7007942199707, + "learning_rate": 7.052851483103595e-07, + "logits/chosen": -2.747727632522583, + "logits/rejected": -2.432584285736084, + "logps/chosen": -278.6109619140625, + "logps/rejected": -253.11199951171875, + "loss": 0.4279, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.596574306488037, + "rewards/margins": 4.288656711578369, + "rewards/rejected": -5.8852314949035645, + "step": 3348 + }, + { + "epoch": 0.9358448599430041, + "grad_norm": 12.700357437133789, + "learning_rate": 6.069546119163883e-07, + "logits/chosen": -2.7545533180236816, + "logits/rejected": -2.3816208839416504, + "logps/chosen": -294.5799865722656, + "logps/rejected": -242.094970703125, + "loss": 0.3494, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": -1.1281059980392456, + "rewards/margins": 3.8597989082336426, + "rewards/rejected": -4.987905025482178, + "step": 3366 + }, + { + "epoch": 0.9408493779106137, + "grad_norm": 51.807857513427734, + "learning_rate": 5.159331059644723e-07, + "logits/chosen": -2.7963879108428955, + "logits/rejected": -2.4854278564453125, + "logps/chosen": -328.6551208496094, + "logps/rejected": -298.92779541015625, + "loss": 0.4834, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -1.6172294616699219, + "rewards/margins": 3.515500068664551, + "rewards/rejected": -5.132729530334473, + "step": 3384 + }, + { + "epoch": 0.9458538958782234, + "grad_norm": 23.846389770507812, + "learning_rate": 4.3224326434467056e-07, + "logits/chosen": -2.745059013366699, + "logits/rejected": -2.4258298873901367, + "logps/chosen": -312.1297302246094, + "logps/rejected": -265.2500305175781, + "loss": 0.4627, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -1.4351582527160645, + "rewards/margins": 3.231187343597412, + "rewards/rejected": -4.666345596313477, + "step": 3402 + }, + { + "epoch": 0.950858413845833, + "grad_norm": 40.21766662597656, + "learning_rate": 3.5590589781653193e-07, + "logits/chosen": -2.783703088760376, + "logits/rejected": -2.483260154724121, + "logps/chosen": -288.5788269042969, + "logps/rejected": -254.65548706054688, + "loss": 0.5498, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": -1.9071305990219116, + "rewards/margins": 2.9087460041046143, + "rewards/rejected": -4.815876483917236, + "step": 3420 + }, + { + "epoch": 0.9558629318134427, + "grad_norm": 41.0620002746582, + "learning_rate": 2.869399888341884e-07, + "logits/chosen": -2.753574848175049, + "logits/rejected": -2.44853138923645, + "logps/chosen": -321.7188415527344, + "logps/rejected": -281.68896484375, + "loss": 0.4242, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.794075608253479, + "rewards/margins": 3.2132554054260254, + "rewards/rejected": -5.007330417633057, + "step": 3438 + }, + { + "epoch": 0.9608674497810523, + "grad_norm": 44.541236877441406, + "learning_rate": 2.2536268682607053e-07, + "logits/chosen": -2.808333396911621, + "logits/rejected": -2.399050712585449, + "logps/chosen": -313.6329650878906, + "logps/rejected": -254.74417114257812, + "loss": 0.394, + "rewards/accuracies": 0.7708333134651184, + "rewards/chosen": -1.3611148595809937, + "rewards/margins": 3.667389392852783, + "rewards/rejected": -5.028504371643066, + "step": 3456 + }, + { + "epoch": 0.965871967748662, + "grad_norm": 28.37739372253418, + "learning_rate": 1.711893039304391e-07, + "logits/chosen": -2.787020683288574, + "logits/rejected": -2.3685710430145264, + "logps/chosen": -310.705810546875, + "logps/rejected": -252.1896209716797, + "loss": 0.4114, + "rewards/accuracies": 0.7152777910232544, + "rewards/chosen": -1.4525500535964966, + "rewards/margins": 3.6481471061706543, + "rewards/rejected": -5.100697994232178, + "step": 3474 + }, + { + "epoch": 0.9708764857162716, + "grad_norm": 35.66070556640625, + "learning_rate": 1.2443331118779044e-07, + "logits/chosen": -2.736020088195801, + "logits/rejected": -2.4357709884643555, + "logps/chosen": -312.6650390625, + "logps/rejected": -272.03125, + "loss": 0.4593, + "rewards/accuracies": 0.7430555820465088, + "rewards/chosen": -1.8849661350250244, + "rewards/margins": 3.159604072570801, + "rewards/rejected": -5.044570446014404, + "step": 3492 + }, + { + "epoch": 0.9731007159240982, + "eval_logits/chosen": -2.759683132171631, + "eval_logits/rejected": -2.438039541244507, + "eval_logps/chosen": -308.1109313964844, + "eval_logps/rejected": -265.2192687988281, + "eval_loss": 0.42871206998825073, + "eval_rewards/accuracies": 0.7386947870254517, + "eval_rewards/chosen": -1.5046364068984985, + "eval_rewards/margins": 3.490506410598755, + "eval_rewards/rejected": -4.995142459869385, + "eval_runtime": 2889.0688, + "eval_samples_per_second": 1.347, + "eval_steps_per_second": 0.674, + "step": 3500 + } + ], + "logging_steps": 18, + "max_steps": 3596, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}