|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 230, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.043478260869565216, |
|
"grad_norm": 175.07723361554483, |
|
"learning_rate": 8.695652173913044e-07, |
|
"loss": 1.9054, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.21739130434782608, |
|
"grad_norm": 62.962929820729734, |
|
"learning_rate": 4.347826086956522e-06, |
|
"loss": 1.4576, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 8.648281883873826, |
|
"learning_rate": 8.695652173913044e-06, |
|
"loss": 0.6132, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.6521739130434783, |
|
"grad_norm": 16.239431113601718, |
|
"learning_rate": 1.3043478260869566e-05, |
|
"loss": 0.3115, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 3.450808311744288, |
|
"learning_rate": 1.739130434782609e-05, |
|
"loss": 0.203, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.10863249748945236, |
|
"eval_runtime": 67.8349, |
|
"eval_samples_per_second": 21.169, |
|
"eval_steps_per_second": 0.663, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 1.0869565217391304, |
|
"grad_norm": 34.3749757818615, |
|
"learning_rate": 1.9995393663024054e-05, |
|
"loss": 0.1339, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.3043478260869565, |
|
"grad_norm": 2.3841429135265555, |
|
"learning_rate": 1.9943621095573588e-05, |
|
"loss": 0.1249, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.5217391304347827, |
|
"grad_norm": 1.2759644316371075, |
|
"learning_rate": 1.9834617016337424e-05, |
|
"loss": 0.088, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 34.92728113964099, |
|
"learning_rate": 1.9669008809262064e-05, |
|
"loss": 0.0829, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.9565217391304348, |
|
"grad_norm": 2.0028289380489945, |
|
"learning_rate": 1.944774964904754e-05, |
|
"loss": 0.0942, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.07020141929388046, |
|
"eval_runtime": 67.1251, |
|
"eval_samples_per_second": 21.393, |
|
"eval_steps_per_second": 0.67, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.1739130434782608, |
|
"grad_norm": 1.3438276529586066, |
|
"learning_rate": 1.917211301505453e-05, |
|
"loss": 0.0674, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.391304347826087, |
|
"grad_norm": 1142.7739612838905, |
|
"learning_rate": 1.8843685361665724e-05, |
|
"loss": 0.0668, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"grad_norm": 2.660687233406025, |
|
"learning_rate": 1.8464356987288012e-05, |
|
"loss": 0.065, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.8260869565217392, |
|
"grad_norm": 6.723586538182285, |
|
"learning_rate": 1.8036311154549783e-05, |
|
"loss": 0.064, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.057872917503118515, |
|
"eval_runtime": 67.6569, |
|
"eval_samples_per_second": 21.225, |
|
"eval_steps_per_second": 0.665, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 3.0434782608695654, |
|
"grad_norm": 0.7827050845032839, |
|
"learning_rate": 1.7562011524313187e-05, |
|
"loss": 0.0604, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.260869565217391, |
|
"grad_norm": 1.2639368123822572, |
|
"learning_rate": 1.7044187975826126e-05, |
|
"loss": 0.0755, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 3.4782608695652173, |
|
"grad_norm": 0.9047997047917037, |
|
"learning_rate": 1.648582089462756e-05, |
|
"loss": 0.0569, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.6956521739130435, |
|
"grad_norm": 0.5428178712369748, |
|
"learning_rate": 1.589012401863864e-05, |
|
"loss": 0.0577, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 3.9130434782608696, |
|
"grad_norm": 0.6450527453899985, |
|
"learning_rate": 1.526052594117071e-05, |
|
"loss": 0.055, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.05387277901172638, |
|
"eval_runtime": 67.296, |
|
"eval_samples_per_second": 21.339, |
|
"eval_steps_per_second": 0.669, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 4.130434782608695, |
|
"grad_norm": 0.6167289429330038, |
|
"learning_rate": 1.4600650377311523e-05, |
|
"loss": 0.0549, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 0.5274268025844417, |
|
"learning_rate": 1.3914295307268396e-05, |
|
"loss": 0.0535, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 4.565217391304348, |
|
"grad_norm": 4.645394192055986, |
|
"learning_rate": 1.3205411116710973e-05, |
|
"loss": 0.0558, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 4.782608695652174, |
|
"grad_norm": 0.44016634132559596, |
|
"learning_rate": 1.2478077859929e-05, |
|
"loss": 0.0557, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.37871781752696215, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 0.0533, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.05291759595274925, |
|
"eval_runtime": 67.622, |
|
"eval_samples_per_second": 21.236, |
|
"eval_steps_per_second": 0.665, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 5.217391304347826, |
|
"grad_norm": 0.4605215085179153, |
|
"learning_rate": 1.0984891197811686e-05, |
|
"loss": 0.0526, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 5.434782608695652, |
|
"grad_norm": 0.5874492251340868, |
|
"learning_rate": 1.0227631978561057e-05, |
|
"loss": 0.0531, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 5.6521739130434785, |
|
"grad_norm": 0.29510087532653284, |
|
"learning_rate": 9.469062600552509e-06, |
|
"loss": 0.0525, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 5.869565217391305, |
|
"grad_norm": 0.344906161076209, |
|
"learning_rate": 8.71354908617169e-06, |
|
"loss": 0.0525, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.0515441857278347, |
|
"eval_runtime": 67.5553, |
|
"eval_samples_per_second": 21.257, |
|
"eval_steps_per_second": 0.666, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 6.086956521739131, |
|
"grad_norm": 0.36354029610859134, |
|
"learning_rate": 7.965439869473664e-06, |
|
"loss": 0.0514, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 6.304347826086957, |
|
"grad_norm": 0.26428382375759113, |
|
"learning_rate": 7.2290407683331154e-06, |
|
"loss": 0.0515, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 6.521739130434782, |
|
"grad_norm": 0.22822648732081055, |
|
"learning_rate": 6.508590201876317e-06, |
|
"loss": 0.0512, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 6.739130434782608, |
|
"grad_norm": 0.25828737593751994, |
|
"learning_rate": 5.8082347958333625e-06, |
|
"loss": 0.0513, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 6.956521739130435, |
|
"grad_norm": 0.2285799063924952, |
|
"learning_rate": 5.132005516216512e-06, |
|
"loss": 0.0519, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.05051277205348015, |
|
"eval_runtime": 67.5123, |
|
"eval_samples_per_second": 21.27, |
|
"eval_steps_per_second": 0.667, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 7.173913043478261, |
|
"grad_norm": 0.24475626828923558, |
|
"learning_rate": 4.483794468689728e-06, |
|
"loss": 0.0508, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 7.391304347826087, |
|
"grad_norm": 0.2520072015362384, |
|
"learning_rate": 3.867332497162836e-06, |
|
"loss": 0.0504, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 7.608695652173913, |
|
"grad_norm": 0.27382520169651525, |
|
"learning_rate": 3.2861677105440335e-06, |
|
"loss": 0.0504, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 7.826086956521739, |
|
"grad_norm": 0.2716287514584957, |
|
"learning_rate": 2.7436450612420098e-06, |
|
"loss": 0.0505, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.04957514628767967, |
|
"eval_runtime": 67.276, |
|
"eval_samples_per_second": 21.345, |
|
"eval_steps_per_second": 0.669, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 8.043478260869565, |
|
"grad_norm": 0.21787711573464694, |
|
"learning_rate": 2.2428870929558012e-06, |
|
"loss": 0.0496, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 8.26086956521739, |
|
"grad_norm": 0.21445383159071862, |
|
"learning_rate": 1.7867759685603115e-06, |
|
"loss": 0.0494, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 8.478260869565217, |
|
"grad_norm": 0.24405861737397047, |
|
"learning_rate": 1.3779368815278648e-06, |
|
"loss": 0.0491, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 8.695652173913043, |
|
"grad_norm": 0.2697463930100617, |
|
"learning_rate": 1.01872294636304e-06, |
|
"loss": 0.0495, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 8.91304347826087, |
|
"grad_norm": 0.21112878071345867, |
|
"learning_rate": 7.1120165501533e-07, |
|
"loss": 0.0494, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 0.048730239272117615, |
|
"eval_runtime": 66.913, |
|
"eval_samples_per_second": 21.461, |
|
"eval_steps_per_second": 0.673, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 9.130434782608695, |
|
"grad_norm": 0.19010353057716406, |
|
"learning_rate": 4.5714297722121105e-07, |
|
"loss": 0.049, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 9.347826086956522, |
|
"grad_norm": 0.17948234694281104, |
|
"learning_rate": 2.5800917326521013e-07, |
|
"loss": 0.0491, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 9.565217391304348, |
|
"grad_norm": 0.19343675188279383, |
|
"learning_rate": 1.1494637779369766e-07, |
|
"loss": 0.0489, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 9.782608695652174, |
|
"grad_norm": 0.21072621537133224, |
|
"learning_rate": 2.8778003121607834e-08, |
|
"loss": 0.0483, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.18309652115048786, |
|
"learning_rate": 0.0, |
|
"loss": 0.0484, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.0484623983502388, |
|
"eval_runtime": 67.5159, |
|
"eval_samples_per_second": 21.269, |
|
"eval_steps_per_second": 0.667, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 230, |
|
"total_flos": 48157320806400.0, |
|
"train_loss": 0.1130801611620447, |
|
"train_runtime": 3471.4646, |
|
"train_samples_per_second": 4.137, |
|
"train_steps_per_second": 0.066 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 230, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 48157320806400.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|