{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9874476987447699, "eval_steps": 500, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016736401673640166, "grad_norm": 60.54649919183787, "learning_rate": 8.333333333333333e-08, "logits/chosen": -2.7982423305511475, "logits/rejected": -2.8120927810668945, "logps/chosen": -280.23602294921875, "logps/pi_response": -160.69113159179688, "logps/ref_response": -160.69113159179688, "logps/rejected": -531.5305786132812, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16736401673640167, "grad_norm": 30.670173445737255, "learning_rate": 4.930057285201027e-07, "logits/chosen": -2.647188901901245, "logits/rejected": -2.564857244491577, "logps/chosen": -261.201416015625, "logps/pi_response": -134.0703125, "logps/ref_response": -117.64765167236328, "logps/rejected": -430.36309814453125, "loss": 0.6601, "rewards/accuracies": 0.6041666865348816, "rewards/chosen": -0.2441195845603943, "rewards/margins": 0.09843622148036957, "rewards/rejected": -0.3425557613372803, "step": 10 }, { "epoch": 0.33472803347280333, "grad_norm": 43.23351074591614, "learning_rate": 4.187457503795526e-07, "logits/chosen": -2.5144124031066895, "logits/rejected": -2.4462006092071533, "logps/chosen": -314.9715881347656, "logps/pi_response": -141.60418701171875, "logps/ref_response": -112.05741119384766, "logps/rejected": -561.4404296875, "loss": 0.5925, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9154893755912781, "rewards/margins": 1.0008858442306519, "rewards/rejected": -1.9163751602172852, "step": 20 }, { "epoch": 0.502092050209205, "grad_norm": 26.849454289365735, "learning_rate": 2.8691164100062034e-07, "logits/chosen": -2.409524440765381, "logits/rejected": -2.3526878356933594, "logps/chosen": -336.5103759765625, "logps/pi_response": -145.1826629638672, "logps/ref_response": -115.66377258300781, "logps/rejected": -507.21588134765625, "loss": 0.5309, "rewards/accuracies": 0.71875, "rewards/chosen": -0.896476149559021, "rewards/margins": 0.602551281452179, "rewards/rejected": -1.4990274906158447, "step": 30 }, { "epoch": 0.6694560669456067, "grad_norm": 26.2463400987958, "learning_rate": 1.4248369943086995e-07, "logits/chosen": -2.4155898094177246, "logits/rejected": -2.3113021850585938, "logps/chosen": -315.86505126953125, "logps/pi_response": -140.6067657470703, "logps/ref_response": -117.29170989990234, "logps/rejected": -532.4923706054688, "loss": 0.5073, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6968622803688049, "rewards/margins": 0.7633368372917175, "rewards/rejected": -1.460198998451233, "step": 40 }, { "epoch": 0.8368200836820083, "grad_norm": 27.26219373255879, "learning_rate": 3.473909705816111e-08, "logits/chosen": -2.4362852573394775, "logits/rejected": -2.3505661487579346, "logps/chosen": -314.051513671875, "logps/pi_response": -138.14505004882812, "logps/ref_response": -123.77779388427734, "logps/rejected": -529.0848388671875, "loss": 0.5014, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.6226969957351685, "rewards/margins": 0.7943958640098572, "rewards/rejected": -1.4170929193496704, "step": 50 }, { "epoch": 0.9874476987447699, "step": 59, "total_flos": 0.0, "train_loss": 0.5520036190243091, "train_runtime": 1320.8605, "train_samples_per_second": 11.57, "train_steps_per_second": 0.045 } ], "logging_steps": 10, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }