{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 16.144156476964564, "learning_rate": 3.125e-08, "logits/chosen": -1.517999529838562, "logits/rejected": -1.427964687347412, "logps/chosen": -138.13075256347656, "logps/rejected": -139.19334411621094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "grad_norm": 16.23747042290227, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.8612151145935059, "logits/rejected": -1.837838053703308, "logps/chosen": -165.77423095703125, "logps/rejected": -167.00115966796875, "loss": 0.6932, "rewards/accuracies": 0.4097222089767456, "rewards/chosen": 0.00027224430232308805, "rewards/margins": -0.0006895032711327076, "rewards/rejected": 0.0009617475443519652, "step": 10 }, { "epoch": 0.13, "grad_norm": 16.626258757403523, "learning_rate": 4.989935734988097e-07, "logits/chosen": -1.765881896018982, "logits/rejected": -1.758798360824585, "logps/chosen": -146.74258422851562, "logps/rejected": -150.7355499267578, "loss": 0.6929, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.013516152277588844, "rewards/margins": 0.0023722327314317226, "rewards/rejected": -0.015888383612036705, "step": 20 }, { "epoch": 0.19, "grad_norm": 15.590271696523773, "learning_rate": 4.877641290737883e-07, "logits/chosen": -1.6886975765228271, "logits/rejected": -1.7255547046661377, "logps/chosen": -171.6064453125, "logps/rejected": -175.8733367919922, "loss": 0.6936, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.10228633880615234, "rewards/margins": -0.003102194517850876, "rewards/rejected": -0.09918414056301117, "step": 30 }, { "epoch": 0.26, "grad_norm": 15.611757303970759, "learning_rate": 4.646121984004665e-07, "logits/chosen": -1.6890376806259155, "logits/rejected": -1.6796882152557373, "logps/chosen": -175.96786499023438, "logps/rejected": -174.4598846435547, "loss": 0.6962, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": -0.1981964409351349, "rewards/margins": -0.007378303911536932, "rewards/rejected": -0.19081811606884003, "step": 40 }, { "epoch": 0.32, "grad_norm": 15.152656997490112, "learning_rate": 4.3069871595684787e-07, "logits/chosen": -1.8153579235076904, "logits/rejected": -1.864524483680725, "logps/chosen": -173.35693359375, "logps/rejected": -175.8848114013672, "loss": 0.6916, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09905640780925751, "rewards/margins": 0.00832386501133442, "rewards/rejected": -0.10738028585910797, "step": 50 }, { "epoch": 0.38, "grad_norm": 17.022534458266477, "learning_rate": 3.877242453630256e-07, "logits/chosen": -1.8550310134887695, "logits/rejected": -1.8582820892333984, "logps/chosen": -192.25155639648438, "logps/rejected": -197.19004821777344, "loss": 0.6912, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.2872371971607208, "rewards/margins": 0.02158377319574356, "rewards/rejected": -0.3088209331035614, "step": 60 }, { "epoch": 0.45, "grad_norm": 16.571326277539317, "learning_rate": 3.378437060203357e-07, "logits/chosen": -1.8471654653549194, "logits/rejected": -1.8384788036346436, "logps/chosen": -192.4368896484375, "logps/rejected": -195.90115356445312, "loss": 0.6888, "rewards/accuracies": 0.515625, "rewards/chosen": -0.3420848548412323, "rewards/margins": 0.01330840028822422, "rewards/rejected": -0.35539326071739197, "step": 70 }, { "epoch": 0.51, "grad_norm": 13.438756015994942, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -1.8971712589263916, "logits/rejected": -1.9240859746932983, "logps/chosen": -172.50802612304688, "logps/rejected": -181.87484741210938, "loss": 0.6872, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.19086754322052002, "rewards/margins": 0.0247647762298584, "rewards/rejected": -0.21563228964805603, "step": 80 }, { "epoch": 0.58, "grad_norm": 15.835235204252495, "learning_rate": 2.2759017277414164e-07, "logits/chosen": -1.803915023803711, "logits/rejected": -1.7513706684112549, "logps/chosen": -196.07321166992188, "logps/rejected": -192.37071228027344, "loss": 0.683, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.3334119915962219, "rewards/margins": 0.01788436248898506, "rewards/rejected": -0.3512963652610779, "step": 90 }, { "epoch": 0.64, "grad_norm": 20.64165185557883, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -1.8089154958724976, "logits/rejected": -1.7900733947753906, "logps/chosen": -193.5570831298828, "logps/rejected": -195.0845489501953, "loss": 0.6909, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.3730206787586212, "rewards/margins": 0.006149230990558863, "rewards/rejected": -0.37916988134384155, "step": 100 }, { "epoch": 0.7, "grad_norm": 14.009691318366311, "learning_rate": 1.2177518064852348e-07, "logits/chosen": -1.735630750656128, "logits/rejected": -1.7698205709457397, "logps/chosen": -179.77761840820312, "logps/rejected": -190.6461944580078, "loss": 0.6853, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.3420836329460144, "rewards/margins": 0.045021723955869675, "rewards/rejected": -0.38710540533065796, "step": 110 }, { "epoch": 0.77, "grad_norm": 15.208350623463305, "learning_rate": 7.723433775328384e-08, "logits/chosen": -1.8086316585540771, "logits/rejected": -1.8518075942993164, "logps/chosen": -198.0193634033203, "logps/rejected": -197.88845825195312, "loss": 0.6845, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.36722415685653687, "rewards/margins": 0.019394848495721817, "rewards/rejected": -0.386618971824646, "step": 120 }, { "epoch": 0.83, "grad_norm": 20.27285082098782, "learning_rate": 4.1356686569674335e-08, "logits/chosen": -1.6780946254730225, "logits/rejected": -1.6494309902191162, "logps/chosen": -181.09170532226562, "logps/rejected": -182.8488311767578, "loss": 0.6846, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.29602062702178955, "rewards/margins": 0.0265937689691782, "rewards/rejected": -0.3226144015789032, "step": 130 }, { "epoch": 0.9, "grad_norm": 16.756793414757478, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -1.8084728717803955, "logits/rejected": -1.767961859703064, "logps/chosen": -189.6092071533203, "logps/rejected": -193.9722900390625, "loss": 0.6818, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.3253946006298065, "rewards/margins": 0.02885589934885502, "rewards/rejected": -0.3542505204677582, "step": 140 }, { "epoch": 0.96, "grad_norm": 15.87524427688872, "learning_rate": 2.2625595580163247e-09, "logits/chosen": -1.70889413356781, "logits/rejected": -1.748719573020935, "logps/chosen": -189.88803100585938, "logps/rejected": -193.5582733154297, "loss": 0.6852, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.38032156229019165, "rewards/margins": 0.030102457851171494, "rewards/rejected": -0.41042399406433105, "step": 150 }, { "epoch": 1.0, "step": 156, "total_flos": 0.0, "train_loss": 0.688686496936358, "train_runtime": 18523.9468, "train_samples_per_second": 1.08, "train_steps_per_second": 0.008 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }