|
{ |
|
"best_metric": 0.8431944847106934, |
|
"best_model_checkpoint": "./output/training_results/C020_random_sample_llama3-8b-base_instruct_20240505_135320/checkpoint-40", |
|
"epoch": 4.0, |
|
"eval_steps": 20, |
|
"global_step": 192, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.020833333333333332, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.9042, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10416666666666667, |
|
"grad_norm": 12.011271874481888, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.8971, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.20833333333333334, |
|
"grad_norm": 4.434846352911407, |
|
"learning_rate": 5.25e-06, |
|
"loss": 0.8439, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 4.931092332966974, |
|
"learning_rate": 9e-06, |
|
"loss": 0.8244, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 5.012867232413473, |
|
"learning_rate": 1.275e-05, |
|
"loss": 0.8145, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"eval_loss": 0.8468257784843445, |
|
"eval_runtime": 1.9881, |
|
"eval_samples_per_second": 171.019, |
|
"eval_steps_per_second": 1.509, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5208333333333334, |
|
"grad_norm": 4.449941139886805, |
|
"learning_rate": 1.3195176200175283e-05, |
|
"loss": 0.8159, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 4.18135091284813, |
|
"learning_rate": 9.515676612044427e-06, |
|
"loss": 0.8957, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.7291666666666666, |
|
"grad_norm": 3.752406031727004, |
|
"learning_rate": 6.797580677308734e-06, |
|
"loss": 0.8129, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 4.1430378216898704, |
|
"learning_rate": 4.808575415542887e-06, |
|
"loss": 0.7939, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"eval_loss": 0.8431944847106934, |
|
"eval_runtime": 1.9703, |
|
"eval_samples_per_second": 172.563, |
|
"eval_steps_per_second": 1.523, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 4.01217841380717, |
|
"learning_rate": 3.3676619069852654e-06, |
|
"loss": 0.8733, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.0416666666666667, |
|
"grad_norm": 3.732637485037027, |
|
"learning_rate": 2.334947896124909e-06, |
|
"loss": 0.7185, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.1458333333333333, |
|
"grad_norm": 3.043619829639132, |
|
"learning_rate": 1.603233215095547e-06, |
|
"loss": 0.4622, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 3.636342734483024, |
|
"learning_rate": 1.0911174606561334e-06, |
|
"loss": 0.4337, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8652567267417908, |
|
"eval_runtime": 1.9663, |
|
"eval_samples_per_second": 172.915, |
|
"eval_steps_per_second": 1.526, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.3541666666666667, |
|
"grad_norm": 3.721014386073381, |
|
"learning_rate": 7.373930741131784e-07, |
|
"loss": 0.3948, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.4583333333333333, |
|
"grad_norm": 4.272747622198974, |
|
"learning_rate": 4.965174334325768e-07, |
|
"loss": 0.4398, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 4.702025195189812, |
|
"learning_rate": 3.349849877937343e-07, |
|
"loss": 0.4357, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 3.878800809642797, |
|
"learning_rate": 2.2844505627726646e-07, |
|
"loss": 0.4546, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"eval_loss": 0.8523830771446228, |
|
"eval_runtime": 1.9683, |
|
"eval_samples_per_second": 172.737, |
|
"eval_steps_per_second": 1.524, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.7708333333333335, |
|
"grad_norm": 3.433258345841423, |
|
"learning_rate": 1.594328760942437e-07, |
|
"loss": 0.4238, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 3.4246678351281, |
|
"learning_rate": 1.156010161291434e-07, |
|
"loss": 0.4194, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.9791666666666665, |
|
"grad_norm": 4.120006985739495, |
|
"learning_rate": 8.835555547373544e-08, |
|
"loss": 0.4784, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 3.440494197786637, |
|
"learning_rate": 7.181664349277562e-08, |
|
"loss": 0.3886, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"eval_loss": 0.8477094769477844, |
|
"eval_runtime": 1.9597, |
|
"eval_samples_per_second": 173.492, |
|
"eval_steps_per_second": 1.531, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 3.2905186031662637, |
|
"learning_rate": 6.203637972657601e-08, |
|
"loss": 0.3689, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.2916666666666665, |
|
"grad_norm": 3.2557322300638707, |
|
"learning_rate": 5.6418543066491835e-08, |
|
"loss": 0.3967, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.3958333333333335, |
|
"grad_norm": 3.4533216456969154, |
|
"learning_rate": 5.329471712759216e-08, |
|
"loss": 0.356, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 4.435421310367252, |
|
"learning_rate": 5.161995210302015e-08, |
|
"loss": 0.3963, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 0.8523402214050293, |
|
"eval_runtime": 1.9645, |
|
"eval_samples_per_second": 173.073, |
|
"eval_steps_per_second": 1.527, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.6041666666666665, |
|
"grad_norm": 3.534509862323624, |
|
"learning_rate": 5.075841465580837e-08, |
|
"loss": 0.3745, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.7083333333333335, |
|
"grad_norm": 3.6392331005784455, |
|
"learning_rate": 5.033564114946932e-08, |
|
"loss": 0.3739, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 3.403135268036561, |
|
"learning_rate": 5.013915282607116e-08, |
|
"loss": 0.375, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"grad_norm": 3.2635906832893986, |
|
"learning_rate": 5.005343402153039e-08, |
|
"loss": 0.3728, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"eval_loss": 0.8571116328239441, |
|
"eval_runtime": 1.9607, |
|
"eval_samples_per_second": 173.407, |
|
"eval_steps_per_second": 1.53, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.0208333333333335, |
|
"grad_norm": 2.935995500247785, |
|
"learning_rate": 5.001872829857116e-08, |
|
"loss": 0.3761, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 3.1843597559668586, |
|
"learning_rate": 5.000587713853837e-08, |
|
"loss": 0.3705, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.2291666666666665, |
|
"grad_norm": 3.189160099244143, |
|
"learning_rate": 5.0001608748597456e-08, |
|
"loss": 0.3663, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 4.57809262740007, |
|
"learning_rate": 5.0000370319656156e-08, |
|
"loss": 0.3681, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"eval_loss": 0.8608238101005554, |
|
"eval_runtime": 1.959, |
|
"eval_samples_per_second": 173.559, |
|
"eval_steps_per_second": 1.531, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 3.2737719408968937, |
|
"learning_rate": 5.0000067945715855e-08, |
|
"loss": 0.3625, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.5416666666666665, |
|
"grad_norm": 3.693883088182916, |
|
"learning_rate": 5.0000009144677036e-08, |
|
"loss": 0.35, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.6458333333333335, |
|
"grad_norm": 3.4068564249987734, |
|
"learning_rate": 5.0000000785521776e-08, |
|
"loss": 0.3634, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 3.4105928150946205, |
|
"learning_rate": 5.000000003317662e-08, |
|
"loss": 0.3621, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 0.8636972904205322, |
|
"eval_runtime": 1.9687, |
|
"eval_samples_per_second": 172.707, |
|
"eval_steps_per_second": 1.524, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.8541666666666665, |
|
"grad_norm": 3.825319205616779, |
|
"learning_rate": 5.000000000038355e-08, |
|
"loss": 0.3713, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 3.9583333333333335, |
|
"grad_norm": 3.138248738326591, |
|
"learning_rate": 5.000000000000018e-08, |
|
"loss": 0.3633, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 192, |
|
"total_flos": 5373123624960.0, |
|
"train_loss": 0.5064623864988486, |
|
"train_runtime": 1042.9955, |
|
"train_samples_per_second": 11.712, |
|
"train_steps_per_second": 0.184 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 192, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 20, |
|
"total_flos": 5373123624960.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|