{ "best_metric": 0.8431944847106934, "best_model_checkpoint": "./output/training_results/C020_random_sample_llama3-8b-base_instruct_20240505_135320/checkpoint-40", "epoch": 4.0, "eval_steps": 20, "global_step": 192, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020833333333333332, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.9042, "step": 1 }, { "epoch": 0.10416666666666667, "grad_norm": 12.011271874481888, "learning_rate": 2.25e-06, "loss": 0.8971, "step": 5 }, { "epoch": 0.20833333333333334, "grad_norm": 4.434846352911407, "learning_rate": 5.25e-06, "loss": 0.8439, "step": 10 }, { "epoch": 0.3125, "grad_norm": 4.931092332966974, "learning_rate": 9e-06, "loss": 0.8244, "step": 15 }, { "epoch": 0.4166666666666667, "grad_norm": 5.012867232413473, "learning_rate": 1.275e-05, "loss": 0.8145, "step": 20 }, { "epoch": 0.4166666666666667, "eval_loss": 0.8468257784843445, "eval_runtime": 1.9881, "eval_samples_per_second": 171.019, "eval_steps_per_second": 1.509, "step": 20 }, { "epoch": 0.5208333333333334, "grad_norm": 4.449941139886805, "learning_rate": 1.3195176200175283e-05, "loss": 0.8159, "step": 25 }, { "epoch": 0.625, "grad_norm": 4.18135091284813, "learning_rate": 9.515676612044427e-06, "loss": 0.8957, "step": 30 }, { "epoch": 0.7291666666666666, "grad_norm": 3.752406031727004, "learning_rate": 6.797580677308734e-06, "loss": 0.8129, "step": 35 }, { "epoch": 0.8333333333333334, "grad_norm": 4.1430378216898704, "learning_rate": 4.808575415542887e-06, "loss": 0.7939, "step": 40 }, { "epoch": 0.8333333333333334, "eval_loss": 0.8431944847106934, "eval_runtime": 1.9703, "eval_samples_per_second": 172.563, "eval_steps_per_second": 1.523, "step": 40 }, { "epoch": 0.9375, "grad_norm": 4.01217841380717, "learning_rate": 3.3676619069852654e-06, "loss": 0.8733, "step": 45 }, { "epoch": 1.0416666666666667, "grad_norm": 3.732637485037027, "learning_rate": 2.334947896124909e-06, "loss": 0.7185, "step": 50 }, { "epoch": 1.1458333333333333, "grad_norm": 3.043619829639132, "learning_rate": 1.603233215095547e-06, "loss": 0.4622, "step": 55 }, { "epoch": 1.25, "grad_norm": 3.636342734483024, "learning_rate": 1.0911174606561334e-06, "loss": 0.4337, "step": 60 }, { "epoch": 1.25, "eval_loss": 0.8652567267417908, "eval_runtime": 1.9663, "eval_samples_per_second": 172.915, "eval_steps_per_second": 1.526, "step": 60 }, { "epoch": 1.3541666666666667, "grad_norm": 3.721014386073381, "learning_rate": 7.373930741131784e-07, "loss": 0.3948, "step": 65 }, { "epoch": 1.4583333333333333, "grad_norm": 4.272747622198974, "learning_rate": 4.965174334325768e-07, "loss": 0.4398, "step": 70 }, { "epoch": 1.5625, "grad_norm": 4.702025195189812, "learning_rate": 3.349849877937343e-07, "loss": 0.4357, "step": 75 }, { "epoch": 1.6666666666666665, "grad_norm": 3.878800809642797, "learning_rate": 2.2844505627726646e-07, "loss": 0.4546, "step": 80 }, { "epoch": 1.6666666666666665, "eval_loss": 0.8523830771446228, "eval_runtime": 1.9683, "eval_samples_per_second": 172.737, "eval_steps_per_second": 1.524, "step": 80 }, { "epoch": 1.7708333333333335, "grad_norm": 3.433258345841423, "learning_rate": 1.594328760942437e-07, "loss": 0.4238, "step": 85 }, { "epoch": 1.875, "grad_norm": 3.4246678351281, "learning_rate": 1.156010161291434e-07, "loss": 0.4194, "step": 90 }, { "epoch": 1.9791666666666665, "grad_norm": 4.120006985739495, "learning_rate": 8.835555547373544e-08, "loss": 0.4784, "step": 95 }, { "epoch": 2.0833333333333335, "grad_norm": 3.440494197786637, "learning_rate": 7.181664349277562e-08, "loss": 0.3886, "step": 100 }, { "epoch": 2.0833333333333335, "eval_loss": 0.8477094769477844, "eval_runtime": 1.9597, "eval_samples_per_second": 173.492, "eval_steps_per_second": 1.531, "step": 100 }, { "epoch": 2.1875, "grad_norm": 3.2905186031662637, "learning_rate": 6.203637972657601e-08, "loss": 0.3689, "step": 105 }, { "epoch": 2.2916666666666665, "grad_norm": 3.2557322300638707, "learning_rate": 5.6418543066491835e-08, "loss": 0.3967, "step": 110 }, { "epoch": 2.3958333333333335, "grad_norm": 3.4533216456969154, "learning_rate": 5.329471712759216e-08, "loss": 0.356, "step": 115 }, { "epoch": 2.5, "grad_norm": 4.435421310367252, "learning_rate": 5.161995210302015e-08, "loss": 0.3963, "step": 120 }, { "epoch": 2.5, "eval_loss": 0.8523402214050293, "eval_runtime": 1.9645, "eval_samples_per_second": 173.073, "eval_steps_per_second": 1.527, "step": 120 }, { "epoch": 2.6041666666666665, "grad_norm": 3.534509862323624, "learning_rate": 5.075841465580837e-08, "loss": 0.3745, "step": 125 }, { "epoch": 2.7083333333333335, "grad_norm": 3.6392331005784455, "learning_rate": 5.033564114946932e-08, "loss": 0.3739, "step": 130 }, { "epoch": 2.8125, "grad_norm": 3.403135268036561, "learning_rate": 5.013915282607116e-08, "loss": 0.375, "step": 135 }, { "epoch": 2.9166666666666665, "grad_norm": 3.2635906832893986, "learning_rate": 5.005343402153039e-08, "loss": 0.3728, "step": 140 }, { "epoch": 2.9166666666666665, "eval_loss": 0.8571116328239441, "eval_runtime": 1.9607, "eval_samples_per_second": 173.407, "eval_steps_per_second": 1.53, "step": 140 }, { "epoch": 3.0208333333333335, "grad_norm": 2.935995500247785, "learning_rate": 5.001872829857116e-08, "loss": 0.3761, "step": 145 }, { "epoch": 3.125, "grad_norm": 3.1843597559668586, "learning_rate": 5.000587713853837e-08, "loss": 0.3705, "step": 150 }, { "epoch": 3.2291666666666665, "grad_norm": 3.189160099244143, "learning_rate": 5.0001608748597456e-08, "loss": 0.3663, "step": 155 }, { "epoch": 3.3333333333333335, "grad_norm": 4.57809262740007, "learning_rate": 5.0000370319656156e-08, "loss": 0.3681, "step": 160 }, { "epoch": 3.3333333333333335, "eval_loss": 0.8608238101005554, "eval_runtime": 1.959, "eval_samples_per_second": 173.559, "eval_steps_per_second": 1.531, "step": 160 }, { "epoch": 3.4375, "grad_norm": 3.2737719408968937, "learning_rate": 5.0000067945715855e-08, "loss": 0.3625, "step": 165 }, { "epoch": 3.5416666666666665, "grad_norm": 3.693883088182916, "learning_rate": 5.0000009144677036e-08, "loss": 0.35, "step": 170 }, { "epoch": 3.6458333333333335, "grad_norm": 3.4068564249987734, "learning_rate": 5.0000000785521776e-08, "loss": 0.3634, "step": 175 }, { "epoch": 3.75, "grad_norm": 3.4105928150946205, "learning_rate": 5.000000003317662e-08, "loss": 0.3621, "step": 180 }, { "epoch": 3.75, "eval_loss": 0.8636972904205322, "eval_runtime": 1.9687, "eval_samples_per_second": 172.707, "eval_steps_per_second": 1.524, "step": 180 }, { "epoch": 3.8541666666666665, "grad_norm": 3.825319205616779, "learning_rate": 5.000000000038355e-08, "loss": 0.3713, "step": 185 }, { "epoch": 3.9583333333333335, "grad_norm": 3.138248738326591, "learning_rate": 5.000000000000018e-08, "loss": 0.3633, "step": 190 }, { "epoch": 4.0, "step": 192, "total_flos": 5373123624960.0, "train_loss": 0.5064623864988486, "train_runtime": 1042.9955, "train_samples_per_second": 11.712, "train_steps_per_second": 0.184 } ], "logging_steps": 5, "max_steps": 192, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 20, "total_flos": 5373123624960.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }