|
{ |
|
"best_metric": 0.04311952739953995, |
|
"best_model_checkpoint": "t5/checkpoint-58320", |
|
"epoch": 100.0, |
|
"eval_steps": 500, |
|
"global_step": 486000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.14633552730083466, |
|
"learning_rate": 0.00099, |
|
"loss": 0.1231, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.6418053052843314, |
|
"eval_loss": 0.0717623308300972, |
|
"eval_runtime": 1333.7775, |
|
"eval_samples_per_second": 89.456, |
|
"eval_steps_per_second": 0.35, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.10493500530719757, |
|
"learning_rate": 0.00098, |
|
"loss": 0.0712, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.6884046431714369, |
|
"eval_loss": 0.06004703789949417, |
|
"eval_runtime": 1304.0684, |
|
"eval_samples_per_second": 91.494, |
|
"eval_steps_per_second": 0.358, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.09630604088306427, |
|
"learning_rate": 0.0009699999999999999, |
|
"loss": 0.0593, |
|
"step": 14580 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.7176465658131835, |
|
"eval_loss": 0.05390430614352226, |
|
"eval_runtime": 1308.2854, |
|
"eval_samples_per_second": 91.2, |
|
"eval_steps_per_second": 0.357, |
|
"step": 14580 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.07841313630342484, |
|
"learning_rate": 0.00096, |
|
"loss": 0.0519, |
|
"step": 19440 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.730679294304991, |
|
"eval_loss": 0.050438590347766876, |
|
"eval_runtime": 1309.4742, |
|
"eval_samples_per_second": 91.117, |
|
"eval_steps_per_second": 0.357, |
|
"step": 19440 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.07362372428178787, |
|
"learning_rate": 0.00095, |
|
"loss": 0.0464, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.7446423333193647, |
|
"eval_loss": 0.04836108162999153, |
|
"eval_runtime": 1308.1745, |
|
"eval_samples_per_second": 91.207, |
|
"eval_steps_per_second": 0.357, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.05613507702946663, |
|
"learning_rate": 0.00094, |
|
"loss": 0.0422, |
|
"step": 29160 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.7498721870678456, |
|
"eval_loss": 0.04650866985321045, |
|
"eval_runtime": 1310.787, |
|
"eval_samples_per_second": 91.025, |
|
"eval_steps_per_second": 0.356, |
|
"step": 29160 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.06997396796941757, |
|
"learning_rate": 0.00093, |
|
"loss": 0.0385, |
|
"step": 34020 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7611197250974312, |
|
"eval_loss": 0.04569365829229355, |
|
"eval_runtime": 1301.8483, |
|
"eval_samples_per_second": 91.65, |
|
"eval_steps_per_second": 0.359, |
|
"step": 34020 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.054521311074495316, |
|
"learning_rate": 0.00092, |
|
"loss": 0.0354, |
|
"step": 38880 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7642039978208943, |
|
"eval_loss": 0.04475805535912514, |
|
"eval_runtime": 1307.7292, |
|
"eval_samples_per_second": 91.238, |
|
"eval_steps_per_second": 0.357, |
|
"step": 38880 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.06338842958211899, |
|
"learning_rate": 0.00091, |
|
"loss": 0.0328, |
|
"step": 43740 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.771554289066756, |
|
"eval_loss": 0.0442810133099556, |
|
"eval_runtime": 1304.569, |
|
"eval_samples_per_second": 91.459, |
|
"eval_steps_per_second": 0.358, |
|
"step": 43740 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.054538544267416, |
|
"learning_rate": 0.0009000000000000001, |
|
"loss": 0.0304, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.7760130746343712, |
|
"eval_loss": 0.0437050461769104, |
|
"eval_runtime": 1308.322, |
|
"eval_samples_per_second": 91.197, |
|
"eval_steps_per_second": 0.357, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.06947464495897293, |
|
"learning_rate": 0.0008900000000000001, |
|
"loss": 0.0283, |
|
"step": 53460 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.7780664627247202, |
|
"eval_loss": 0.04394479840993881, |
|
"eval_runtime": 1311.6831, |
|
"eval_samples_per_second": 90.963, |
|
"eval_steps_per_second": 0.356, |
|
"step": 53460 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.05090058967471123, |
|
"learning_rate": 0.00088, |
|
"loss": 0.0264, |
|
"step": 58320 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7814943636592214, |
|
"eval_loss": 0.04311952739953995, |
|
"eval_runtime": 1303.6986, |
|
"eval_samples_per_second": 91.52, |
|
"eval_steps_per_second": 0.358, |
|
"step": 58320 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 0.05226626992225647, |
|
"learning_rate": 0.00087, |
|
"loss": 0.0248, |
|
"step": 63180 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.7855257092570087, |
|
"eval_loss": 0.04451437294483185, |
|
"eval_runtime": 1313.511, |
|
"eval_samples_per_second": 90.837, |
|
"eval_steps_per_second": 0.356, |
|
"step": 63180 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.053579073399305344, |
|
"learning_rate": 0.00086, |
|
"loss": 0.0232, |
|
"step": 68040 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.7867158362318233, |
|
"eval_loss": 0.04421268403530121, |
|
"eval_runtime": 1304.7705, |
|
"eval_samples_per_second": 91.445, |
|
"eval_steps_per_second": 0.358, |
|
"step": 68040 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.052565447986125946, |
|
"learning_rate": 0.00085, |
|
"loss": 0.0218, |
|
"step": 72900 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.7881657796588861, |
|
"eval_loss": 0.04461174085736275, |
|
"eval_runtime": 1306.4264, |
|
"eval_samples_per_second": 91.329, |
|
"eval_steps_per_second": 0.357, |
|
"step": 72900 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.05223050341010094, |
|
"learning_rate": 0.00084, |
|
"loss": 0.0206, |
|
"step": 77760 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.7900850689351716, |
|
"eval_loss": 0.046072401106357574, |
|
"eval_runtime": 1314.7849, |
|
"eval_samples_per_second": 90.749, |
|
"eval_steps_per_second": 0.355, |
|
"step": 77760 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.04498209059238434, |
|
"learning_rate": 0.00083, |
|
"loss": 0.0194, |
|
"step": 82620 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.7911410970959225, |
|
"eval_loss": 0.04610202834010124, |
|
"eval_runtime": 1308.5405, |
|
"eval_samples_per_second": 91.182, |
|
"eval_steps_per_second": 0.357, |
|
"step": 82620 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.047790784388780594, |
|
"learning_rate": 0.00082, |
|
"loss": 0.0183, |
|
"step": 87480 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.7914931064828395, |
|
"eval_loss": 0.04640175402164459, |
|
"eval_runtime": 1301.9032, |
|
"eval_samples_per_second": 91.647, |
|
"eval_steps_per_second": 0.359, |
|
"step": 87480 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.04554256424307823, |
|
"learning_rate": 0.0008100000000000001, |
|
"loss": 0.0173, |
|
"step": 92340 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.7922055064325525, |
|
"eval_loss": 0.046802300959825516, |
|
"eval_runtime": 1318.5946, |
|
"eval_samples_per_second": 90.486, |
|
"eval_steps_per_second": 0.354, |
|
"step": 92340 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.051229629665613174, |
|
"learning_rate": 0.0008, |
|
"loss": 0.0166, |
|
"step": 97200 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.7951556803419519, |
|
"eval_loss": 0.04811061546206474, |
|
"eval_runtime": 1304.3101, |
|
"eval_samples_per_second": 91.477, |
|
"eval_steps_per_second": 0.358, |
|
"step": 97200 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 0.04701264947652817, |
|
"learning_rate": 0.00079, |
|
"loss": 0.0158, |
|
"step": 102060 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.7942253698193856, |
|
"eval_loss": 0.04858441650867462, |
|
"eval_runtime": 1303.9515, |
|
"eval_samples_per_second": 91.503, |
|
"eval_steps_per_second": 0.358, |
|
"step": 102060 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.07426326721906662, |
|
"learning_rate": 0.0007800000000000001, |
|
"loss": 0.015, |
|
"step": 106920 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.7949126262414616, |
|
"eval_loss": 0.048401448875665665, |
|
"eval_runtime": 1304.9706, |
|
"eval_samples_per_second": 91.431, |
|
"eval_steps_per_second": 0.358, |
|
"step": 106920 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.05088690295815468, |
|
"learning_rate": 0.0007700000000000001, |
|
"loss": 0.0143, |
|
"step": 111780 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.7971001131458744, |
|
"eval_loss": 0.04964574798941612, |
|
"eval_runtime": 1311.3518, |
|
"eval_samples_per_second": 90.986, |
|
"eval_steps_per_second": 0.356, |
|
"step": 111780 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 0.04760482534766197, |
|
"learning_rate": 0.00076, |
|
"loss": 0.0137, |
|
"step": 116640 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.7959937979298496, |
|
"eval_loss": 0.049800001084804535, |
|
"eval_runtime": 1306.2706, |
|
"eval_samples_per_second": 91.34, |
|
"eval_steps_per_second": 0.358, |
|
"step": 116640 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 0.040201518684625626, |
|
"learning_rate": 0.00075, |
|
"loss": 0.0131, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.7972677366634539, |
|
"eval_loss": 0.0510859489440918, |
|
"eval_runtime": 1307.6635, |
|
"eval_samples_per_second": 91.243, |
|
"eval_steps_per_second": 0.357, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 0.04697073623538017, |
|
"learning_rate": 0.00074, |
|
"loss": 0.0125, |
|
"step": 126360 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.7971336378493903, |
|
"eval_loss": 0.05105246230959892, |
|
"eval_runtime": 1305.0277, |
|
"eval_samples_per_second": 91.427, |
|
"eval_steps_per_second": 0.358, |
|
"step": 126360 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 0.035631682723760605, |
|
"learning_rate": 0.00073, |
|
"loss": 0.012, |
|
"step": 131220 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.7990780706533127, |
|
"eval_loss": 0.051402851939201355, |
|
"eval_runtime": 1309.1883, |
|
"eval_samples_per_second": 91.137, |
|
"eval_steps_per_second": 0.357, |
|
"step": 131220 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.053142111748456955, |
|
"learning_rate": 0.0007199999999999999, |
|
"loss": 0.0116, |
|
"step": 136080 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.7987009177387587, |
|
"eval_loss": 0.05239921808242798, |
|
"eval_runtime": 1306.5799, |
|
"eval_samples_per_second": 91.319, |
|
"eval_steps_per_second": 0.357, |
|
"step": 136080 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"grad_norm": 0.04080447182059288, |
|
"learning_rate": 0.00071, |
|
"loss": 0.0111, |
|
"step": 140940 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.7965804802413778, |
|
"eval_loss": 0.05250364542007446, |
|
"eval_runtime": 1309.3121, |
|
"eval_samples_per_second": 91.128, |
|
"eval_steps_per_second": 0.357, |
|
"step": 140940 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.045455146580934525, |
|
"learning_rate": 0.0007, |
|
"loss": 0.0107, |
|
"step": 145800 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.7984997695176633, |
|
"eval_loss": 0.0543711818754673, |
|
"eval_runtime": 1310.1708, |
|
"eval_samples_per_second": 91.068, |
|
"eval_steps_per_second": 0.356, |
|
"step": 145800 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"grad_norm": 0.03871888667345047, |
|
"learning_rate": 0.00069, |
|
"loss": 0.0104, |
|
"step": 150660 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.7982399530654151, |
|
"eval_loss": 0.053769443184137344, |
|
"eval_runtime": 1306.4928, |
|
"eval_samples_per_second": 91.325, |
|
"eval_steps_per_second": 0.357, |
|
"step": 150660 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 0.048963289707899094, |
|
"learning_rate": 0.00068, |
|
"loss": 0.01, |
|
"step": 155520 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.8012404140300884, |
|
"eval_loss": 0.05514230951666832, |
|
"eval_runtime": 1308.1993, |
|
"eval_samples_per_second": 91.206, |
|
"eval_steps_per_second": 0.357, |
|
"step": 155520 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"grad_norm": 0.03987804055213928, |
|
"learning_rate": 0.00067, |
|
"loss": 0.0097, |
|
"step": 160380 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.8019109081004064, |
|
"eval_loss": 0.055017631500959396, |
|
"eval_runtime": 1312.5155, |
|
"eval_samples_per_second": 90.906, |
|
"eval_steps_per_second": 0.356, |
|
"step": 160380 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"grad_norm": 0.0373803973197937, |
|
"learning_rate": 0.00066, |
|
"loss": 0.0094, |
|
"step": 165240 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.7987009177387587, |
|
"eval_loss": 0.055196575820446014, |
|
"eval_runtime": 1306.0614, |
|
"eval_samples_per_second": 91.355, |
|
"eval_steps_per_second": 0.358, |
|
"step": 165240 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"grad_norm": 0.039517637342214584, |
|
"learning_rate": 0.0006500000000000001, |
|
"loss": 0.0091, |
|
"step": 170100 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 0.8004777270251017, |
|
"eval_loss": 0.05661753937602043, |
|
"eval_runtime": 1306.965, |
|
"eval_samples_per_second": 91.292, |
|
"eval_steps_per_second": 0.357, |
|
"step": 170100 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 0.04135722666978836, |
|
"learning_rate": 0.00064, |
|
"loss": 0.0088, |
|
"step": 174960 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.8019025269245275, |
|
"eval_loss": 0.05708213895559311, |
|
"eval_runtime": 1305.0895, |
|
"eval_samples_per_second": 91.423, |
|
"eval_steps_per_second": 0.358, |
|
"step": 174960 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"grad_norm": 0.04340599477291107, |
|
"learning_rate": 0.00063, |
|
"loss": 0.0085, |
|
"step": 179820 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.8015421363617315, |
|
"eval_loss": 0.0565766803920269, |
|
"eval_runtime": 1303.7543, |
|
"eval_samples_per_second": 91.516, |
|
"eval_steps_per_second": 0.358, |
|
"step": 179820 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"grad_norm": 0.042780667543411255, |
|
"learning_rate": 0.00062, |
|
"loss": 0.0082, |
|
"step": 184680 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.7985919624523321, |
|
"eval_loss": 0.05795786902308464, |
|
"eval_runtime": 1306.8337, |
|
"eval_samples_per_second": 91.301, |
|
"eval_steps_per_second": 0.357, |
|
"step": 184680 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"grad_norm": 0.03298887610435486, |
|
"learning_rate": 0.00061, |
|
"loss": 0.008, |
|
"step": 189540 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_accuracy": 0.8006537317185601, |
|
"eval_loss": 0.05666106194257736, |
|
"eval_runtime": 1303.7751, |
|
"eval_samples_per_second": 91.515, |
|
"eval_steps_per_second": 0.358, |
|
"step": 189540 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 0.03825366497039795, |
|
"learning_rate": 0.0006, |
|
"loss": 0.0077, |
|
"step": 194400 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.8003101035075221, |
|
"eval_loss": 0.05909406766295433, |
|
"eval_runtime": 1304.1065, |
|
"eval_samples_per_second": 91.492, |
|
"eval_steps_per_second": 0.358, |
|
"step": 194400 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"grad_norm": 0.049214523285627365, |
|
"learning_rate": 0.00059, |
|
"loss": 0.0075, |
|
"step": 199260 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_accuracy": 0.8027406445124251, |
|
"eval_loss": 0.0589471310377121, |
|
"eval_runtime": 1305.1945, |
|
"eval_samples_per_second": 91.415, |
|
"eval_steps_per_second": 0.358, |
|
"step": 199260 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"grad_norm": 0.03445366024971008, |
|
"learning_rate": 0.00058, |
|
"loss": 0.0073, |
|
"step": 204120 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.802765788040062, |
|
"eval_loss": 0.05833474174141884, |
|
"eval_runtime": 1304.7043, |
|
"eval_samples_per_second": 91.45, |
|
"eval_steps_per_second": 0.358, |
|
"step": 204120 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"grad_norm": 0.031152933835983276, |
|
"learning_rate": 0.00057, |
|
"loss": 0.007, |
|
"step": 208980 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_accuracy": 0.8017851904622219, |
|
"eval_loss": 0.05955711379647255, |
|
"eval_runtime": 1302.9367, |
|
"eval_samples_per_second": 91.574, |
|
"eval_steps_per_second": 0.358, |
|
"step": 208980 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 0.028931325301527977, |
|
"learning_rate": 0.0005600000000000001, |
|
"loss": 0.0069, |
|
"step": 213840 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.803469806813896, |
|
"eval_loss": 0.05940761789679527, |
|
"eval_runtime": 1302.3209, |
|
"eval_samples_per_second": 91.617, |
|
"eval_steps_per_second": 0.359, |
|
"step": 213840 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"grad_norm": 0.03164521977305412, |
|
"learning_rate": 0.00055, |
|
"loss": 0.0066, |
|
"step": 218700 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_accuracy": 0.803084272723463, |
|
"eval_loss": 0.0604814775288105, |
|
"eval_runtime": 1299.6461, |
|
"eval_samples_per_second": 91.806, |
|
"eval_steps_per_second": 0.359, |
|
"step": 218700 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"grad_norm": 0.09477687627077103, |
|
"learning_rate": 0.00054, |
|
"loss": 0.0065, |
|
"step": 223560 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.803805053849055, |
|
"eval_loss": 0.05929319187998772, |
|
"eval_runtime": 1297.2788, |
|
"eval_samples_per_second": 91.973, |
|
"eval_steps_per_second": 0.36, |
|
"step": 223560 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"grad_norm": 0.032785411924123764, |
|
"learning_rate": 0.0005300000000000001, |
|
"loss": 0.0063, |
|
"step": 228420 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_accuracy": 0.8049784184721116, |
|
"eval_loss": 0.06024543195962906, |
|
"eval_runtime": 1298.8006, |
|
"eval_samples_per_second": 91.866, |
|
"eval_steps_per_second": 0.36, |
|
"step": 228420 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 0.03677200525999069, |
|
"learning_rate": 0.0005200000000000001, |
|
"loss": 0.006, |
|
"step": 233280 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.8040145832460294, |
|
"eval_loss": 0.061811413615942, |
|
"eval_runtime": 1297.6127, |
|
"eval_samples_per_second": 91.95, |
|
"eval_steps_per_second": 0.36, |
|
"step": 233280 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"grad_norm": 0.030352266505360603, |
|
"learning_rate": 0.00051, |
|
"loss": 0.0059, |
|
"step": 238140 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_accuracy": 0.8042241126430039, |
|
"eval_loss": 0.061159055680036545, |
|
"eval_runtime": 1299.1406, |
|
"eval_samples_per_second": 91.841, |
|
"eval_steps_per_second": 0.359, |
|
"step": 238140 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 0.02934379130601883, |
|
"learning_rate": 0.0005, |
|
"loss": 0.0057, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 0.8055148137283661, |
|
"eval_loss": 0.06327831000089645, |
|
"eval_runtime": 1298.5116, |
|
"eval_samples_per_second": 91.886, |
|
"eval_steps_per_second": 0.36, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"grad_norm": 0.023088792338967323, |
|
"learning_rate": 0.00049, |
|
"loss": 0.0055, |
|
"step": 247860 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"eval_accuracy": 0.8067971336378494, |
|
"eval_loss": 0.06312137842178345, |
|
"eval_runtime": 1302.1135, |
|
"eval_samples_per_second": 91.632, |
|
"eval_steps_per_second": 0.359, |
|
"step": 247860 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"grad_norm": 0.03648848831653595, |
|
"learning_rate": 0.00048, |
|
"loss": 0.0053, |
|
"step": 252720 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_accuracy": 0.8061936889745631, |
|
"eval_loss": 0.06350181996822357, |
|
"eval_runtime": 1297.9213, |
|
"eval_samples_per_second": 91.928, |
|
"eval_steps_per_second": 0.36, |
|
"step": 252720 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"grad_norm": 0.03157039359211922, |
|
"learning_rate": 0.00047, |
|
"loss": 0.0051, |
|
"step": 257580 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"eval_accuracy": 0.8065121736579642, |
|
"eval_loss": 0.06361949443817139, |
|
"eval_runtime": 1305.9679, |
|
"eval_samples_per_second": 91.361, |
|
"eval_steps_per_second": 0.358, |
|
"step": 257580 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"grad_norm": 0.026564130559563637, |
|
"learning_rate": 0.00046, |
|
"loss": 0.005, |
|
"step": 262440 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_accuracy": 0.8064535054268114, |
|
"eval_loss": 0.06370926648378372, |
|
"eval_runtime": 1302.6026, |
|
"eval_samples_per_second": 91.597, |
|
"eval_steps_per_second": 0.359, |
|
"step": 262440 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"grad_norm": 0.039359357208013535, |
|
"learning_rate": 0.00045000000000000004, |
|
"loss": 0.0048, |
|
"step": 267300 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"eval_accuracy": 0.8070569500900976, |
|
"eval_loss": 0.0649728775024414, |
|
"eval_runtime": 1301.934, |
|
"eval_samples_per_second": 91.644, |
|
"eval_steps_per_second": 0.359, |
|
"step": 267300 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"grad_norm": 0.02652502991259098, |
|
"learning_rate": 0.00044, |
|
"loss": 0.0047, |
|
"step": 272160 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_accuracy": 0.80695637597955, |
|
"eval_loss": 0.06507979333400726, |
|
"eval_runtime": 1302.3742, |
|
"eval_samples_per_second": 91.613, |
|
"eval_steps_per_second": 0.359, |
|
"step": 272160 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"grad_norm": 0.04170479625463486, |
|
"learning_rate": 0.00043, |
|
"loss": 0.0045, |
|
"step": 277020 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"eval_accuracy": 0.8077442065121737, |
|
"eval_loss": 0.06572364270687103, |
|
"eval_runtime": 1303.9456, |
|
"eval_samples_per_second": 91.503, |
|
"eval_steps_per_second": 0.358, |
|
"step": 277020 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"grad_norm": 0.02988004870712757, |
|
"learning_rate": 0.00042, |
|
"loss": 0.0044, |
|
"step": 281880 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_accuracy": 0.8076687759292629, |
|
"eval_loss": 0.06484715640544891, |
|
"eval_runtime": 1299.0165, |
|
"eval_samples_per_second": 91.85, |
|
"eval_steps_per_second": 0.36, |
|
"step": 281880 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"grad_norm": 0.031678713858127594, |
|
"learning_rate": 0.00041, |
|
"loss": 0.0042, |
|
"step": 286740 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"eval_accuracy": 0.8078950676779952, |
|
"eval_loss": 0.06634358316659927, |
|
"eval_runtime": 1304.0645, |
|
"eval_samples_per_second": 91.495, |
|
"eval_steps_per_second": 0.358, |
|
"step": 286740 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 0.020897777751088142, |
|
"learning_rate": 0.0004, |
|
"loss": 0.0041, |
|
"step": 291600 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_accuracy": 0.8078866865021163, |
|
"eval_loss": 0.0666716918349266, |
|
"eval_runtime": 1302.0029, |
|
"eval_samples_per_second": 91.64, |
|
"eval_steps_per_second": 0.359, |
|
"step": 291600 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"grad_norm": 0.03830067440867424, |
|
"learning_rate": 0.00039000000000000005, |
|
"loss": 0.004, |
|
"step": 296460 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"eval_accuracy": 0.8104597074969618, |
|
"eval_loss": 0.0661536380648613, |
|
"eval_runtime": 1303.993, |
|
"eval_samples_per_second": 91.5, |
|
"eval_steps_per_second": 0.358, |
|
"step": 296460 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"grad_norm": 0.025591198354959488, |
|
"learning_rate": 0.00038, |
|
"loss": 0.0037, |
|
"step": 301320 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_accuracy": 0.809722164019612, |
|
"eval_loss": 0.06793326884508133, |
|
"eval_runtime": 1302.3775, |
|
"eval_samples_per_second": 91.613, |
|
"eval_steps_per_second": 0.359, |
|
"step": 301320 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"grad_norm": 0.029204251244664192, |
|
"learning_rate": 0.00037, |
|
"loss": 0.0036, |
|
"step": 306180 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"eval_accuracy": 0.8103172275070193, |
|
"eval_loss": 0.06969352066516876, |
|
"eval_runtime": 1309.132, |
|
"eval_samples_per_second": 91.141, |
|
"eval_steps_per_second": 0.357, |
|
"step": 306180 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"grad_norm": 0.029880277812480927, |
|
"learning_rate": 0.00035999999999999997, |
|
"loss": 0.0035, |
|
"step": 311040 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_accuracy": 0.810057411054771, |
|
"eval_loss": 0.06940728425979614, |
|
"eval_runtime": 1308.9186, |
|
"eval_samples_per_second": 91.155, |
|
"eval_steps_per_second": 0.357, |
|
"step": 311040 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"grad_norm": 0.030717821791768074, |
|
"learning_rate": 0.00035, |
|
"loss": 0.0034, |
|
"step": 315900 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"eval_accuracy": 0.810082554582408, |
|
"eval_loss": 0.06925758719444275, |
|
"eval_runtime": 1308.5599, |
|
"eval_samples_per_second": 91.18, |
|
"eval_steps_per_second": 0.357, |
|
"step": 315900 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"grad_norm": 0.02681083045899868, |
|
"learning_rate": 0.00034, |
|
"loss": 0.0032, |
|
"step": 320760 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"eval_accuracy": 0.8104597074969618, |
|
"eval_loss": 0.07081950455904007, |
|
"eval_runtime": 1307.5379, |
|
"eval_samples_per_second": 91.252, |
|
"eval_steps_per_second": 0.357, |
|
"step": 320760 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"grad_norm": 0.023366352543234825, |
|
"learning_rate": 0.00033, |
|
"loss": 0.0031, |
|
"step": 325620 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"eval_accuracy": 0.8130411096676864, |
|
"eval_loss": 0.07085347920656204, |
|
"eval_runtime": 1306.1303, |
|
"eval_samples_per_second": 91.35, |
|
"eval_steps_per_second": 0.358, |
|
"step": 325620 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"grad_norm": 0.018703831359744072, |
|
"learning_rate": 0.00032, |
|
"loss": 0.0029, |
|
"step": 330480 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_accuracy": 0.8118090768134769, |
|
"eval_loss": 0.07043693959712982, |
|
"eval_runtime": 1308.3789, |
|
"eval_samples_per_second": 91.193, |
|
"eval_steps_per_second": 0.357, |
|
"step": 330480 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"grad_norm": 0.021384961903095245, |
|
"learning_rate": 0.00031, |
|
"loss": 0.0028, |
|
"step": 335340 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"eval_accuracy": 0.8111721074466748, |
|
"eval_loss": 0.0710659921169281, |
|
"eval_runtime": 1306.3675, |
|
"eval_samples_per_second": 91.333, |
|
"eval_steps_per_second": 0.357, |
|
"step": 335340 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"grad_norm": 0.027229884639382362, |
|
"learning_rate": 0.0003, |
|
"loss": 0.0027, |
|
"step": 340200 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_accuracy": 0.811775552109961, |
|
"eval_loss": 0.07277531921863556, |
|
"eval_runtime": 1306.0312, |
|
"eval_samples_per_second": 91.357, |
|
"eval_steps_per_second": 0.358, |
|
"step": 340200 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"grad_norm": 0.024667974561452866, |
|
"learning_rate": 0.00029, |
|
"loss": 0.0025, |
|
"step": 345060 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"eval_accuracy": 0.8127980555671961, |
|
"eval_loss": 0.07439424097537994, |
|
"eval_runtime": 1304.9752, |
|
"eval_samples_per_second": 91.431, |
|
"eval_steps_per_second": 0.358, |
|
"step": 345060 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"grad_norm": 0.019673120230436325, |
|
"learning_rate": 0.00028000000000000003, |
|
"loss": 0.0024, |
|
"step": 349920 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_accuracy": 0.8129572979088966, |
|
"eval_loss": 0.07484369724988937, |
|
"eval_runtime": 1304.8727, |
|
"eval_samples_per_second": 91.438, |
|
"eval_steps_per_second": 0.358, |
|
"step": 349920 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"grad_norm": 0.023998018354177475, |
|
"learning_rate": 0.00027, |
|
"loss": 0.0023, |
|
"step": 354780 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"eval_accuracy": 0.8130662531953233, |
|
"eval_loss": 0.07576391845941544, |
|
"eval_runtime": 1307.7757, |
|
"eval_samples_per_second": 91.235, |
|
"eval_steps_per_second": 0.357, |
|
"step": 354780 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"grad_norm": 0.04219399765133858, |
|
"learning_rate": 0.00026000000000000003, |
|
"loss": 0.0022, |
|
"step": 359640 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"eval_accuracy": 0.8143485731048066, |
|
"eval_loss": 0.07631613314151764, |
|
"eval_runtime": 1306.0365, |
|
"eval_samples_per_second": 91.357, |
|
"eval_steps_per_second": 0.358, |
|
"step": 359640 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"grad_norm": 0.021946126595139503, |
|
"learning_rate": 0.00025, |
|
"loss": 0.0021, |
|
"step": 364500 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"eval_accuracy": 0.8144994342706282, |
|
"eval_loss": 0.07692206650972366, |
|
"eval_runtime": 1303.767, |
|
"eval_samples_per_second": 91.516, |
|
"eval_steps_per_second": 0.358, |
|
"step": 364500 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"grad_norm": 0.023164469748735428, |
|
"learning_rate": 0.00024, |
|
"loss": 0.0019, |
|
"step": 369360 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_accuracy": 0.8133176884716926, |
|
"eval_loss": 0.0780106782913208, |
|
"eval_runtime": 1305.1212, |
|
"eval_samples_per_second": 91.421, |
|
"eval_steps_per_second": 0.358, |
|
"step": 369360 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"grad_norm": 0.02851826325058937, |
|
"learning_rate": 0.00023, |
|
"loss": 0.0018, |
|
"step": 374220 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"eval_accuracy": 0.8146335330846918, |
|
"eval_loss": 0.0777156725525856, |
|
"eval_runtime": 1304.8456, |
|
"eval_samples_per_second": 91.44, |
|
"eval_steps_per_second": 0.358, |
|
"step": 374220 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"grad_norm": 0.0253597479313612, |
|
"learning_rate": 0.00022, |
|
"loss": 0.0017, |
|
"step": 379080 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"eval_accuracy": 0.8162175753258182, |
|
"eval_loss": 0.07899600267410278, |
|
"eval_runtime": 1303.5402, |
|
"eval_samples_per_second": 91.532, |
|
"eval_steps_per_second": 0.358, |
|
"step": 379080 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"grad_norm": 0.02291404828429222, |
|
"learning_rate": 0.00021, |
|
"loss": 0.0016, |
|
"step": 383940 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"eval_accuracy": 0.8145245777982651, |
|
"eval_loss": 0.08102333545684814, |
|
"eval_runtime": 1304.8976, |
|
"eval_samples_per_second": 91.436, |
|
"eval_steps_per_second": 0.358, |
|
"step": 383940 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 0.029658950865268707, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0015, |
|
"step": 388800 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_accuracy": 0.8161170012152705, |
|
"eval_loss": 0.08235077559947968, |
|
"eval_runtime": 1306.0263, |
|
"eval_samples_per_second": 91.357, |
|
"eval_steps_per_second": 0.358, |
|
"step": 388800 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"grad_norm": 0.02459796331822872, |
|
"learning_rate": 0.00019, |
|
"loss": 0.0014, |
|
"step": 393660 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"eval_accuracy": 0.815940996521812, |
|
"eval_loss": 0.08271630853414536, |
|
"eval_runtime": 1306.1311, |
|
"eval_samples_per_second": 91.35, |
|
"eval_steps_per_second": 0.358, |
|
"step": 393660 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"grad_norm": 0.01461075246334076, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 0.0013, |
|
"step": 398520 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"eval_accuracy": 0.8159745212253279, |
|
"eval_loss": 0.08540969341993332, |
|
"eval_runtime": 1303.2922, |
|
"eval_samples_per_second": 91.549, |
|
"eval_steps_per_second": 0.358, |
|
"step": 398520 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"grad_norm": 0.011922557838261127, |
|
"learning_rate": 0.00017, |
|
"loss": 0.0012, |
|
"step": 403380 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"eval_accuracy": 0.8165025353057034, |
|
"eval_loss": 0.08608754724264145, |
|
"eval_runtime": 1300.8769, |
|
"eval_samples_per_second": 91.719, |
|
"eval_steps_per_second": 0.359, |
|
"step": 403380 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"grad_norm": 0.006066465750336647, |
|
"learning_rate": 0.00016, |
|
"loss": 0.0011, |
|
"step": 408240 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_accuracy": 0.8179943846121611, |
|
"eval_loss": 0.0866456851363182, |
|
"eval_runtime": 1297.1839, |
|
"eval_samples_per_second": 91.98, |
|
"eval_steps_per_second": 0.36, |
|
"step": 408240 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"grad_norm": 0.019887538626790047, |
|
"learning_rate": 0.00015, |
|
"loss": 0.001, |
|
"step": 413100 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"eval_accuracy": 0.8174747517076646, |
|
"eval_loss": 0.08994536101818085, |
|
"eval_runtime": 1297.8948, |
|
"eval_samples_per_second": 91.93, |
|
"eval_steps_per_second": 0.36, |
|
"step": 413100 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"grad_norm": 0.019588502123951912, |
|
"learning_rate": 0.00014000000000000001, |
|
"loss": 0.0009, |
|
"step": 417960 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"eval_accuracy": 0.8186648786824792, |
|
"eval_loss": 0.08895347267389297, |
|
"eval_runtime": 1297.8927, |
|
"eval_samples_per_second": 91.93, |
|
"eval_steps_per_second": 0.36, |
|
"step": 417960 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"grad_norm": 0.019738251343369484, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 0.0008, |
|
"step": 422820 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"eval_accuracy": 0.8184553492855048, |
|
"eval_loss": 0.09202321618795395, |
|
"eval_runtime": 1297.7904, |
|
"eval_samples_per_second": 91.937, |
|
"eval_steps_per_second": 0.36, |
|
"step": 422820 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"grad_norm": 0.0069356439635157585, |
|
"learning_rate": 0.00012, |
|
"loss": 0.0008, |
|
"step": 427680 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_accuracy": 0.8189749821900013, |
|
"eval_loss": 0.09301886707544327, |
|
"eval_runtime": 1297.4851, |
|
"eval_samples_per_second": 91.959, |
|
"eval_steps_per_second": 0.36, |
|
"step": 427680 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"grad_norm": 0.0168699249625206, |
|
"learning_rate": 0.00011, |
|
"loss": 0.0007, |
|
"step": 432540 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"eval_accuracy": 0.8196370950844404, |
|
"eval_loss": 0.09475909918546677, |
|
"eval_runtime": 1297.2344, |
|
"eval_samples_per_second": 91.976, |
|
"eval_steps_per_second": 0.36, |
|
"step": 432540 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"grad_norm": 0.00997143518179655, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0006, |
|
"step": 437400 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"eval_accuracy": 0.8189917445417592, |
|
"eval_loss": 0.09576508402824402, |
|
"eval_runtime": 1296.498, |
|
"eval_samples_per_second": 92.029, |
|
"eval_steps_per_second": 0.36, |
|
"step": 437400 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"grad_norm": 0.017046066001057625, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 0.0005, |
|
"step": 442260 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"eval_accuracy": 0.8195365209738926, |
|
"eval_loss": 0.09832222014665604, |
|
"eval_runtime": 1296.9323, |
|
"eval_samples_per_second": 91.998, |
|
"eval_steps_per_second": 0.36, |
|
"step": 442260 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"grad_norm": 0.01709928549826145, |
|
"learning_rate": 8e-05, |
|
"loss": 0.0005, |
|
"step": 447120 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_accuracy": 0.8208523655868918, |
|
"eval_loss": 0.10070452094078064, |
|
"eval_runtime": 1296.8404, |
|
"eval_samples_per_second": 92.004, |
|
"eval_steps_per_second": 0.36, |
|
"step": 447120 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"grad_norm": 0.014434403739869595, |
|
"learning_rate": 7.000000000000001e-05, |
|
"loss": 0.0004, |
|
"step": 451980 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"eval_accuracy": 0.8202153962200897, |
|
"eval_loss": 0.10308787226676941, |
|
"eval_runtime": 1304.3584, |
|
"eval_samples_per_second": 91.474, |
|
"eval_steps_per_second": 0.358, |
|
"step": 451980 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"grad_norm": 0.014810960739850998, |
|
"learning_rate": 6e-05, |
|
"loss": 0.0004, |
|
"step": 456840 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"eval_accuracy": 0.82111218203914, |
|
"eval_loss": 0.10589364916086197, |
|
"eval_runtime": 1302.0953, |
|
"eval_samples_per_second": 91.633, |
|
"eval_steps_per_second": 0.359, |
|
"step": 456840 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"grad_norm": 0.0034873096738010645, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0003, |
|
"step": 461700 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"eval_accuracy": 0.8215982902401207, |
|
"eval_loss": 0.10970806330442429, |
|
"eval_runtime": 1301.0636, |
|
"eval_samples_per_second": 91.706, |
|
"eval_steps_per_second": 0.359, |
|
"step": 461700 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"grad_norm": 0.002740664640441537, |
|
"learning_rate": 4e-05, |
|
"loss": 0.0003, |
|
"step": 466560 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_accuracy": 0.8221179231446172, |
|
"eval_loss": 0.11456754803657532, |
|
"eval_runtime": 1302.7083, |
|
"eval_samples_per_second": 91.59, |
|
"eval_steps_per_second": 0.358, |
|
"step": 466560 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"grad_norm": 0.0014008020516484976, |
|
"learning_rate": 3e-05, |
|
"loss": 0.0002, |
|
"step": 471420 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"eval_accuracy": 0.8224364078280183, |
|
"eval_loss": 0.11764019727706909, |
|
"eval_runtime": 1302.3991, |
|
"eval_samples_per_second": 91.612, |
|
"eval_steps_per_second": 0.359, |
|
"step": 471420 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"grad_norm": 0.0054730623960494995, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0002, |
|
"step": 476280 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"eval_accuracy": 0.8226543184008717, |
|
"eval_loss": 0.12025844305753708, |
|
"eval_runtime": 1302.1251, |
|
"eval_samples_per_second": 91.631, |
|
"eval_steps_per_second": 0.359, |
|
"step": 476280 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"grad_norm": 0.0020766761153936386, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0002, |
|
"step": 481140 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"eval_accuracy": 0.8226878431043876, |
|
"eval_loss": 0.1223362609744072, |
|
"eval_runtime": 1301.9742, |
|
"eval_samples_per_second": 91.642, |
|
"eval_steps_per_second": 0.359, |
|
"step": 481140 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 0.018789879977703094, |
|
"learning_rate": 0.0, |
|
"loss": 0.0001, |
|
"step": 486000 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_accuracy": 0.8229895654360306, |
|
"eval_loss": 0.12378211319446564, |
|
"eval_runtime": 1301.6752, |
|
"eval_samples_per_second": 91.663, |
|
"eval_steps_per_second": 0.359, |
|
"step": 486000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 486000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"total_flos": 2.2219508552555553e+19, |
|
"train_batch_size": 256, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|