{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.985781990521327, "eval_steps": 50, "global_step": 315, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0947867298578199, "grad_norm": 50.81444347836179, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.8273773193359375, "logits/rejected": -2.573636054992676, "logps/chosen": -369.3688049316406, "logps/rejected": -693.6748046875, "loss": 0.6858, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0030312505550682545, "rewards/margins": 0.015196545049548149, "rewards/rejected": -0.012165295891463757, "step": 10 }, { "epoch": 0.1895734597156398, "grad_norm": 17.76028531231973, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.8040361404418945, "logits/rejected": -2.5344460010528564, "logps/chosen": -356.6896057128906, "logps/rejected": -714.5941162109375, "loss": 0.4666, "rewards/accuracies": 1.0, "rewards/chosen": 0.1258755326271057, "rewards/margins": 0.69224613904953, "rewards/rejected": -0.5663706064224243, "step": 20 }, { "epoch": 0.2843601895734597, "grad_norm": 3.027708678861818, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.8109562397003174, "logits/rejected": -2.5492990016937256, "logps/chosen": -299.7785949707031, "logps/rejected": -1061.5032958984375, "loss": 0.124, "rewards/accuracies": 1.0, "rewards/chosen": 0.4659281373023987, "rewards/margins": 4.801316261291504, "rewards/rejected": -4.33538818359375, "step": 30 }, { "epoch": 0.3791469194312796, "grad_norm": 4.339445107844336, "learning_rate": 4.990147841143461e-07, "logits/chosen": -2.6756181716918945, "logits/rejected": -2.3431499004364014, "logps/chosen": -309.88055419921875, "logps/rejected": -2287.322265625, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": 0.4024983048439026, "rewards/margins": 16.896318435668945, "rewards/rejected": -16.493820190429688, "step": 40 }, { "epoch": 0.47393364928909953, "grad_norm": 1.4582859394909966, "learning_rate": 4.950256493879794e-07, "logits/chosen": -2.350309371948242, "logits/rejected": -1.7835966348648071, "logps/chosen": -439.63720703125, "logps/rejected": -3742.43408203125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -0.5562213659286499, "rewards/margins": 29.479211807250977, "rewards/rejected": -30.035430908203125, "step": 50 }, { "epoch": 0.47393364928909953, "eval_logits/chosen": -1.910264015197754, "eval_logits/rejected": -1.2144216299057007, "eval_logps/chosen": -476.05950927734375, "eval_logps/rejected": -4036.952880859375, "eval_loss": 0.00275122607126832, "eval_rewards/accuracies": 0.9979838728904724, "eval_rewards/chosen": -1.0907776355743408, "eval_rewards/margins": 32.87086868286133, "eval_rewards/rejected": -33.96164321899414, "eval_runtime": 197.0098, "eval_samples_per_second": 19.821, "eval_steps_per_second": 0.315, "step": 50 }, { "epoch": 0.5687203791469194, "grad_norm": 0.2408524061043023, "learning_rate": 4.88020090697132e-07, "logits/chosen": -1.8117077350616455, "logits/rejected": -0.6155702471733093, "logps/chosen": -491.74359130859375, "logps/rejected": -4422.6455078125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.4004383087158203, "rewards/margins": 36.08842086791992, "rewards/rejected": -37.48885726928711, "step": 60 }, { "epoch": 0.6635071090047393, "grad_norm": 0.047142917188209066, "learning_rate": 4.780843509929904e-07, "logits/chosen": -1.8627986907958984, "logits/rejected": 0.22284331917762756, "logps/chosen": -509.3787536621094, "logps/rejected": -4690.046875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.4054739475250244, "rewards/margins": 39.368186950683594, "rewards/rejected": -40.773658752441406, "step": 70 }, { "epoch": 0.7582938388625592, "grad_norm": 9.120624486054943, "learning_rate": 4.6534074564712217e-07, "logits/chosen": -0.7828740477561951, "logits/rejected": 1.5713117122650146, "logps/chosen": -559.362548828125, "logps/rejected": -5071.544921875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.678344488143921, "rewards/margins": 42.42586135864258, "rewards/rejected": -44.10420227050781, "step": 80 }, { "epoch": 0.8530805687203792, "grad_norm": 0.11081628927270178, "learning_rate": 4.4994615667026846e-07, "logits/chosen": -1.3702471256256104, "logits/rejected": 2.505129337310791, "logps/chosen": -544.5760498046875, "logps/rejected": -5206.58203125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.5564204454421997, "rewards/margins": 44.48157501220703, "rewards/rejected": -46.03799057006836, "step": 90 }, { "epoch": 0.9478672985781991, "grad_norm": 0.3080840911945339, "learning_rate": 4.320901013934887e-07, "logits/chosen": -2.1306087970733643, "logits/rejected": 1.8308042287826538, "logps/chosen": -503.85980224609375, "logps/rejected": -5392.4296875, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -1.653719186782837, "rewards/margins": 45.56684112548828, "rewards/rejected": -47.22056198120117, "step": 100 }, { "epoch": 0.9478672985781991, "eval_logits/chosen": -2.1299259662628174, "eval_logits/rejected": 1.4562028646469116, "eval_logps/chosen": -528.1521606445312, "eval_logps/rejected": -5036.19775390625, "eval_loss": 0.0005923541029915214, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.6117043495178223, "eval_rewards/margins": 42.342384338378906, "eval_rewards/rejected": -43.95408630371094, "eval_runtime": 193.3087, "eval_samples_per_second": 20.201, "eval_steps_per_second": 0.321, "step": 100 }, { "epoch": 1.042654028436019, "grad_norm": 18.444713565032494, "learning_rate": 4.119923993874379e-07, "logits/chosen": -1.6922178268432617, "logits/rejected": 2.0268759727478027, "logps/chosen": -523.6668090820312, "logps/rejected": -5431.6923828125, "loss": 0.0056, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.692284345626831, "rewards/margins": 45.80804443359375, "rewards/rejected": -47.500328063964844, "step": 110 }, { "epoch": 1.1374407582938388, "grad_norm": 0.667964087575959, "learning_rate": 3.899004663415083e-07, "logits/chosen": -1.945476770401001, "logits/rejected": 2.063563346862793, "logps/chosen": -512.0524291992188, "logps/rejected": -5017.99755859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.6633787155151367, "rewards/margins": 41.92264175415039, "rewards/rejected": -43.586021423339844, "step": 120 }, { "epoch": 1.2322274881516588, "grad_norm": 0.07609835769363717, "learning_rate": 3.6608626821692824e-07, "logits/chosen": -1.3765870332717896, "logits/rejected": 2.4521493911743164, "logps/chosen": -511.48992919921875, "logps/rejected": -5102.3095703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.574892282485962, "rewards/margins": 42.903465270996094, "rewards/rejected": -44.47835922241211, "step": 130 }, { "epoch": 1.3270142180094786, "grad_norm": 0.056132145026876815, "learning_rate": 3.408429731701635e-07, "logits/chosen": -1.673305869102478, "logits/rejected": 2.8528292179107666, "logps/chosen": -515.4207763671875, "logps/rejected": -5185.7666015625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.6433398723602295, "rewards/margins": 44.202003479003906, "rewards/rejected": -45.84534454345703, "step": 140 }, { "epoch": 1.4218009478672986, "grad_norm": 0.23322034769023395, "learning_rate": 3.144813424636031e-07, "logits/chosen": -2.0590405464172363, "logits/rejected": 2.5171058177948, "logps/chosen": -564.9078369140625, "logps/rejected": -5421.3935546875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.7184299230575562, "rewards/margins": 46.220848083496094, "rewards/rejected": -47.93927764892578, "step": 150 }, { "epoch": 1.4218009478672986, "eval_logits/chosen": -1.983699083328247, "eval_logits/rejected": 1.6969449520111084, "eval_logps/chosen": -539.4231567382812, "eval_logps/rejected": -5257.45166015625, "eval_loss": 0.00044810696272179484, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.7244139909744263, "eval_rewards/margins": 44.44221878051758, "eval_rewards/rejected": -46.166629791259766, "eval_runtime": 194.364, "eval_samples_per_second": 20.091, "eval_steps_per_second": 0.319, "step": 150 }, { "epoch": 1.5165876777251186, "grad_norm": 0.12179595633717599, "learning_rate": 2.8732590479375165e-07, "logits/chosen": -1.6932016611099243, "logits/rejected": 3.00923752784729, "logps/chosen": -561.7379150390625, "logps/rejected": -5284.3369140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.8306872844696045, "rewards/margins": 45.2327995300293, "rewards/rejected": -47.06348419189453, "step": 160 }, { "epoch": 1.6113744075829384, "grad_norm": 0.0933485824938723, "learning_rate": 2.597109611334169e-07, "logits/chosen": -1.3871994018554688, "logits/rejected": 3.261793613433838, "logps/chosen": -551.7293090820312, "logps/rejected": -5060.6259765625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.971683144569397, "rewards/margins": 42.35912322998047, "rewards/rejected": -44.33080291748047, "step": 170 }, { "epoch": 1.7061611374407581, "grad_norm": 3.1974087952825787, "learning_rate": 2.3197646927086694e-07, "logits/chosen": -1.4737141132354736, "logits/rejected": 3.022137403488159, "logps/chosen": -511.197509765625, "logps/rejected": -5292.6005859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.733986258506775, "rewards/margins": 45.09421157836914, "rewards/rejected": -46.82819366455078, "step": 180 }, { "epoch": 1.8009478672985781, "grad_norm": 0.027408737628248046, "learning_rate": 2.0446385870993467e-07, "logits/chosen": -2.2061755657196045, "logits/rejected": 2.45582914352417, "logps/chosen": -533.607177734375, "logps/rejected": -5150.46044921875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.7912607192993164, "rewards/margins": 43.65951919555664, "rewards/rejected": -45.45077896118164, "step": 190 }, { "epoch": 1.8957345971563981, "grad_norm": 0.030194576861770926, "learning_rate": 1.775118274523545e-07, "logits/chosen": -2.1220927238464355, "logits/rejected": 2.6763927936553955, "logps/chosen": -580.6775512695312, "logps/rejected": -5746.35791015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.894126534461975, "rewards/margins": 48.489967346191406, "rewards/rejected": -50.38408660888672, "step": 200 }, { "epoch": 1.8957345971563981, "eval_logits/chosen": -2.0032970905303955, "eval_logits/rejected": 2.1634280681610107, "eval_logps/chosen": -542.7340698242188, "eval_logps/rejected": -5115.28857421875, "eval_loss": 0.0005020965472795069, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.7575234174728394, "eval_rewards/margins": 42.98747634887695, "eval_rewards/rejected": -44.744998931884766, "eval_runtime": 192.9625, "eval_samples_per_second": 20.237, "eval_steps_per_second": 0.321, "step": 200 }, { "epoch": 1.9905213270142181, "grad_norm": 0.649351349290712, "learning_rate": 1.514521724066537e-07, "logits/chosen": -1.6789367198944092, "logits/rejected": 3.430915355682373, "logps/chosen": -524.8860473632812, "logps/rejected": -4889.97998046875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.7625246047973633, "rewards/margins": 40.506309509277344, "rewards/rejected": -42.268829345703125, "step": 210 }, { "epoch": 2.085308056872038, "grad_norm": 0.04035446049031933, "learning_rate": 1.266057047539568e-07, "logits/chosen": -1.4461164474487305, "logits/rejected": 3.0876190662384033, "logps/chosen": -525.4357299804688, "logps/rejected": -4884.6865234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.8402020931243896, "rewards/margins": 40.36837387084961, "rewards/rejected": -42.208580017089844, "step": 220 }, { "epoch": 2.1800947867298577, "grad_norm": 0.015128682487430115, "learning_rate": 1.032783005551884e-07, "logits/chosen": -1.137474775314331, "logits/rejected": 3.6721444129943848, "logps/chosen": -568.20751953125, "logps/rejected": -5129.24658203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.940818190574646, "rewards/margins": 42.981773376464844, "rewards/rejected": -44.92259216308594, "step": 230 }, { "epoch": 2.2748815165876777, "grad_norm": 0.037321954682957494, "learning_rate": 8.175713521924976e-08, "logits/chosen": -1.189206838607788, "logits/rejected": 3.206519603729248, "logps/chosen": -552.6511840820312, "logps/rejected": -5748.79541015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.8525272607803345, "rewards/margins": 48.330223083496094, "rewards/rejected": -50.1827507019043, "step": 240 }, { "epoch": 2.3696682464454977, "grad_norm": 0.006059762174940937, "learning_rate": 6.230714818829733e-08, "logits/chosen": -0.9478242993354797, "logits/rejected": 3.6084961891174316, "logps/chosen": -529.2486572265625, "logps/rejected": -5455.07421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.7955585718154907, "rewards/margins": 46.23430252075195, "rewards/rejected": -48.02985382080078, "step": 250 }, { "epoch": 2.3696682464454977, "eval_logits/chosen": -1.2428650856018066, "eval_logits/rejected": 2.7113900184631348, "eval_logps/chosen": -556.8338623046875, "eval_logps/rejected": -5293.04052734375, "eval_loss": 0.00038583340938203037, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.8985214233398438, "eval_rewards/margins": 44.623992919921875, "eval_rewards/rejected": -46.522518157958984, "eval_runtime": 194.5012, "eval_samples_per_second": 20.077, "eval_steps_per_second": 0.319, "step": 250 }, { "epoch": 2.4644549763033177, "grad_norm": 0.03581328161307903, "learning_rate": 4.516778136213037e-08, "logits/chosen": -0.6857299208641052, "logits/rejected": 4.016716003417969, "logps/chosen": -546.9486083984375, "logps/rejected": -6013.9873046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.029759407043457, "rewards/margins": 51.3958740234375, "rewards/rejected": -53.425636291503906, "step": 260 }, { "epoch": 2.5592417061611377, "grad_norm": 0.022468211975921346, "learning_rate": 3.055003141378948e-08, "logits/chosen": -1.1639906167984009, "logits/rejected": 3.84511137008667, "logps/chosen": -576.4962158203125, "logps/rejected": -5156.3203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.8559118509292603, "rewards/margins": 43.48760223388672, "rewards/rejected": -45.34351348876953, "step": 270 }, { "epoch": 2.654028436018957, "grad_norm": 0.04077948274793423, "learning_rate": 1.8633852284264508e-08, "logits/chosen": -0.8976588249206543, "logits/rejected": 3.54345440864563, "logps/chosen": -536.1121826171875, "logps/rejected": -5684.75048828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.8762388229370117, "rewards/margins": 48.596927642822266, "rewards/rejected": -50.473167419433594, "step": 280 }, { "epoch": 2.748815165876777, "grad_norm": 0.025754086632460663, "learning_rate": 9.56593983327919e-09, "logits/chosen": -1.061200737953186, "logits/rejected": 3.624175548553467, "logps/chosen": -544.1278076171875, "logps/rejected": -5561.8818359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.8096107244491577, "rewards/margins": 47.35784149169922, "rewards/rejected": -49.16745376586914, "step": 290 }, { "epoch": 2.843601895734597, "grad_norm": 0.00972787744038685, "learning_rate": 3.4579259185321398e-09, "logits/chosen": -0.9289520978927612, "logits/rejected": 3.9043102264404297, "logps/chosen": -559.0933837890625, "logps/rejected": -5231.408203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.9514116048812866, "rewards/margins": 43.472190856933594, "rewards/rejected": -45.423606872558594, "step": 300 }, { "epoch": 2.843601895734597, "eval_logits/chosen": -1.2322728633880615, "eval_logits/rejected": 2.752917766571045, "eval_logps/chosen": -555.4259033203125, "eval_logps/rejected": -5321.5576171875, "eval_loss": 0.0003751559997908771, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.8844420909881592, "eval_rewards/margins": 44.92324447631836, "eval_rewards/rejected": -46.80768585205078, "eval_runtime": 193.0161, "eval_samples_per_second": 20.231, "eval_steps_per_second": 0.321, "step": 300 }, { "epoch": 2.938388625592417, "grad_norm": 0.07731963429515382, "learning_rate": 3.850041354441502e-10, "logits/chosen": -0.9526262283325195, "logits/rejected": 3.7660250663757324, "logps/chosen": -527.6392822265625, "logps/rejected": -5758.5341796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.9126548767089844, "rewards/margins": 49.44573211669922, "rewards/rejected": -51.3583869934082, "step": 310 }, { "epoch": 2.985781990521327, "step": 315, "total_flos": 0.0, "train_loss": 0.04266955489773185, "train_runtime": 9090.3989, "train_samples_per_second": 4.455, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 315, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }