{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.1186113789778207, |
|
"eval_steps": 100, |
|
"global_step": 2900, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003857280617164899, |
|
"grad_norm": 2.45951247215271, |
|
"learning_rate": 2.497749774977498e-05, |
|
"loss": 3.6523, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.007714561234329798, |
|
"grad_norm": 1.4561721086502075, |
|
"learning_rate": 2.4952495249524954e-05, |
|
"loss": 2.7058, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.011571841851494697, |
|
"grad_norm": 1.2352159023284912, |
|
"learning_rate": 2.4927492749274926e-05, |
|
"loss": 2.0281, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.015429122468659595, |
|
"grad_norm": 1.2523480653762817, |
|
"learning_rate": 2.4902490249024905e-05, |
|
"loss": 1.5702, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.019286403085824494, |
|
"grad_norm": 1.407914400100708, |
|
"learning_rate": 2.4877487748774877e-05, |
|
"loss": 1.177, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.023143683702989394, |
|
"grad_norm": 1.1699292659759521, |
|
"learning_rate": 2.4852485248524852e-05, |
|
"loss": 0.9623, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.02700096432015429, |
|
"grad_norm": 1.0662754774093628, |
|
"learning_rate": 2.4827482748274828e-05, |
|
"loss": 1.029, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03085824493731919, |
|
"grad_norm": 0.9690182209014893, |
|
"learning_rate": 2.4802480248024803e-05, |
|
"loss": 0.946, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03471552555448409, |
|
"grad_norm": 0.8241429328918457, |
|
"learning_rate": 2.477747774777478e-05, |
|
"loss": 0.8613, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03857280617164899, |
|
"grad_norm": 0.9876273274421692, |
|
"learning_rate": 2.4752475247524754e-05, |
|
"loss": 0.891, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03857280617164899, |
|
"eval_loss": 0.8584423065185547, |
|
"eval_runtime": 94.247, |
|
"eval_samples_per_second": 55.015, |
|
"eval_steps_per_second": 6.886, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04243008678881389, |
|
"grad_norm": 0.8240203261375427, |
|
"learning_rate": 2.472747274727473e-05, |
|
"loss": 0.8924, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.04628736740597879, |
|
"grad_norm": 0.7671812176704407, |
|
"learning_rate": 2.47024702470247e-05, |
|
"loss": 0.8446, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05014464802314368, |
|
"grad_norm": 0.9588340520858765, |
|
"learning_rate": 2.467746774677468e-05, |
|
"loss": 0.8491, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.05400192864030858, |
|
"grad_norm": 0.9825944304466248, |
|
"learning_rate": 2.4652465246524652e-05, |
|
"loss": 0.8437, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05785920925747348, |
|
"grad_norm": 0.9779114723205566, |
|
"learning_rate": 2.4627462746274628e-05, |
|
"loss": 0.8404, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06171648987463838, |
|
"grad_norm": 0.9949918389320374, |
|
"learning_rate": 2.4602460246024603e-05, |
|
"loss": 0.772, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06557377049180328, |
|
"grad_norm": 0.9132283329963684, |
|
"learning_rate": 2.457745774577458e-05, |
|
"loss": 0.8132, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.06943105110896818, |
|
"grad_norm": 0.8586040735244751, |
|
"learning_rate": 2.4552455245524554e-05, |
|
"loss": 0.8365, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07328833172613308, |
|
"grad_norm": 0.78518146276474, |
|
"learning_rate": 2.452745274527453e-05, |
|
"loss": 0.7607, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.07714561234329798, |
|
"grad_norm": 1.1228320598602295, |
|
"learning_rate": 2.45024502450245e-05, |
|
"loss": 0.8262, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07714561234329798, |
|
"eval_loss": 0.7683142423629761, |
|
"eval_runtime": 94.3304, |
|
"eval_samples_per_second": 54.966, |
|
"eval_steps_per_second": 6.88, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08100289296046287, |
|
"grad_norm": 1.47947096824646, |
|
"learning_rate": 2.447744774477448e-05, |
|
"loss": 0.7629, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.08486017357762778, |
|
"grad_norm": 1.398677110671997, |
|
"learning_rate": 2.4452445244524452e-05, |
|
"loss": 0.7253, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.08871745419479267, |
|
"grad_norm": 0.8628906607627869, |
|
"learning_rate": 2.4427442744274428e-05, |
|
"loss": 0.7691, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.09257473481195758, |
|
"grad_norm": 0.9008379578590393, |
|
"learning_rate": 2.4402440244024403e-05, |
|
"loss": 0.6461, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.09643201542912247, |
|
"grad_norm": 0.6998778581619263, |
|
"learning_rate": 2.437743774377438e-05, |
|
"loss": 0.7174, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.10028929604628736, |
|
"grad_norm": 0.863390326499939, |
|
"learning_rate": 2.4352435243524354e-05, |
|
"loss": 0.6757, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.10414657666345227, |
|
"grad_norm": 1.0060020685195923, |
|
"learning_rate": 2.432743274327433e-05, |
|
"loss": 0.6933, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.10800385728061716, |
|
"grad_norm": 0.8257681727409363, |
|
"learning_rate": 2.4302430243024305e-05, |
|
"loss": 0.7369, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.11186113789778207, |
|
"grad_norm": 0.6368749141693115, |
|
"learning_rate": 2.4277427742774277e-05, |
|
"loss": 0.6612, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.11571841851494696, |
|
"grad_norm": 0.8179033994674683, |
|
"learning_rate": 2.4252425242524256e-05, |
|
"loss": 0.6744, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11571841851494696, |
|
"eval_loss": 0.689677357673645, |
|
"eval_runtime": 94.4186, |
|
"eval_samples_per_second": 54.915, |
|
"eval_steps_per_second": 6.874, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11957569913211186, |
|
"grad_norm": 0.7856632471084595, |
|
"learning_rate": 2.4227422742274228e-05, |
|
"loss": 0.6407, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.12343297974927676, |
|
"grad_norm": 0.788524866104126, |
|
"learning_rate": 2.4202420242024203e-05, |
|
"loss": 0.7115, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.12729026036644167, |
|
"grad_norm": 1.0506746768951416, |
|
"learning_rate": 2.417741774177418e-05, |
|
"loss": 0.6825, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.13114754098360656, |
|
"grad_norm": 1.2924314737319946, |
|
"learning_rate": 2.4152415241524154e-05, |
|
"loss": 0.6627, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.13500482160077146, |
|
"grad_norm": 0.8082237243652344, |
|
"learning_rate": 2.4127412741274126e-05, |
|
"loss": 0.6895, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.13886210221793635, |
|
"grad_norm": 0.787610650062561, |
|
"learning_rate": 2.4102410241024105e-05, |
|
"loss": 0.6839, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.14271938283510124, |
|
"grad_norm": 0.7939244508743286, |
|
"learning_rate": 2.4077407740774077e-05, |
|
"loss": 0.6533, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.14657666345226616, |
|
"grad_norm": 0.7655636668205261, |
|
"learning_rate": 2.4052405240524052e-05, |
|
"loss": 0.6645, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.15043394406943106, |
|
"grad_norm": 0.709829568862915, |
|
"learning_rate": 2.402740274027403e-05, |
|
"loss": 0.6792, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.15429122468659595, |
|
"grad_norm": 0.7088485956192017, |
|
"learning_rate": 2.4002400240024003e-05, |
|
"loss": 0.6568, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.15429122468659595, |
|
"eval_loss": 0.677769124507904, |
|
"eval_runtime": 94.3721, |
|
"eval_samples_per_second": 54.942, |
|
"eval_steps_per_second": 6.877, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.15814850530376084, |
|
"grad_norm": 0.739398717880249, |
|
"learning_rate": 2.397739773977398e-05, |
|
"loss": 0.6802, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.16200578592092574, |
|
"grad_norm": 0.7921575307846069, |
|
"learning_rate": 2.3952395239523954e-05, |
|
"loss": 0.6598, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.16586306653809066, |
|
"grad_norm": 0.9333528280258179, |
|
"learning_rate": 2.392739273927393e-05, |
|
"loss": 0.6543, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.16972034715525555, |
|
"grad_norm": 0.906482994556427, |
|
"learning_rate": 2.39023902390239e-05, |
|
"loss": 0.692, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.17357762777242045, |
|
"grad_norm": 0.8562319278717041, |
|
"learning_rate": 2.387738773877388e-05, |
|
"loss": 0.6963, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.17743490838958534, |
|
"grad_norm": 0.8864608407020569, |
|
"learning_rate": 2.3852385238523852e-05, |
|
"loss": 0.6672, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.18129218900675023, |
|
"grad_norm": 0.7445130944252014, |
|
"learning_rate": 2.3827382738273828e-05, |
|
"loss": 0.6052, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.18514946962391515, |
|
"grad_norm": 0.751557469367981, |
|
"learning_rate": 2.3802380238023803e-05, |
|
"loss": 0.6301, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.18900675024108005, |
|
"grad_norm": 0.6981202960014343, |
|
"learning_rate": 2.377737773777378e-05, |
|
"loss": 0.6423, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.19286403085824494, |
|
"grad_norm": 0.9979777336120605, |
|
"learning_rate": 2.3752375237523754e-05, |
|
"loss": 0.6075, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.19286403085824494, |
|
"eval_loss": 0.6642400622367859, |
|
"eval_runtime": 94.3518, |
|
"eval_samples_per_second": 54.954, |
|
"eval_steps_per_second": 6.879, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.19672131147540983, |
|
"grad_norm": 0.7130064368247986, |
|
"learning_rate": 2.372737273727373e-05, |
|
"loss": 0.6054, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.20057859209257473, |
|
"grad_norm": 0.7771989703178406, |
|
"learning_rate": 2.37023702370237e-05, |
|
"loss": 0.6621, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.20443587270973965, |
|
"grad_norm": 0.8572603464126587, |
|
"learning_rate": 2.3677367736773677e-05, |
|
"loss": 0.6563, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.20829315332690454, |
|
"grad_norm": 0.8305298686027527, |
|
"learning_rate": 2.3652365236523656e-05, |
|
"loss": 0.6658, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.21215043394406943, |
|
"grad_norm": 0.8520190119743347, |
|
"learning_rate": 2.3627362736273628e-05, |
|
"loss": 0.638, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.21600771456123433, |
|
"grad_norm": 0.9404274225234985, |
|
"learning_rate": 2.3602360236023603e-05, |
|
"loss": 0.6067, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.21986499517839922, |
|
"grad_norm": 0.8018991351127625, |
|
"learning_rate": 2.357735773577358e-05, |
|
"loss": 0.6385, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.22372227579556414, |
|
"grad_norm": 0.8628789186477661, |
|
"learning_rate": 2.3552355235523554e-05, |
|
"loss": 0.6554, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.22757955641272903, |
|
"grad_norm": 0.8279526829719543, |
|
"learning_rate": 2.352735273527353e-05, |
|
"loss": 0.6114, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.23143683702989393, |
|
"grad_norm": 0.8158825635910034, |
|
"learning_rate": 2.3502350235023505e-05, |
|
"loss": 0.6149, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.23143683702989393, |
|
"eval_loss": 0.6521801352500916, |
|
"eval_runtime": 94.3458, |
|
"eval_samples_per_second": 54.957, |
|
"eval_steps_per_second": 6.879, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 0.8334428071975708, |
|
"learning_rate": 2.3477347734773477e-05, |
|
"loss": 0.6769, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.2391513982642237, |
|
"grad_norm": 0.9083623886108398, |
|
"learning_rate": 2.3452345234523456e-05, |
|
"loss": 0.6331, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.24300867888138863, |
|
"grad_norm": 1.199766993522644, |
|
"learning_rate": 2.3427342734273428e-05, |
|
"loss": 0.6967, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.24686595949855353, |
|
"grad_norm": 1.2198294401168823, |
|
"learning_rate": 2.3402340234023403e-05, |
|
"loss": 0.6618, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.2507232401157184, |
|
"grad_norm": 0.8489105701446533, |
|
"learning_rate": 2.337733773377338e-05, |
|
"loss": 0.6242, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.25458052073288334, |
|
"grad_norm": 1.0652421712875366, |
|
"learning_rate": 2.3352335233523354e-05, |
|
"loss": 0.6192, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.25843780135004824, |
|
"grad_norm": 0.7928668856620789, |
|
"learning_rate": 2.3327332733273326e-05, |
|
"loss": 0.5706, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.26229508196721313, |
|
"grad_norm": 0.8512901663780212, |
|
"learning_rate": 2.3302330233023305e-05, |
|
"loss": 0.6457, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.266152362584378, |
|
"grad_norm": 0.8443427085876465, |
|
"learning_rate": 2.327732773277328e-05, |
|
"loss": 0.601, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.2700096432015429, |
|
"grad_norm": 0.8724773526191711, |
|
"learning_rate": 2.3252325232523252e-05, |
|
"loss": 0.6476, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2700096432015429, |
|
"eval_loss": 0.6422102451324463, |
|
"eval_runtime": 94.4282, |
|
"eval_samples_per_second": 54.909, |
|
"eval_steps_per_second": 6.873, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2738669238187078, |
|
"grad_norm": 0.8733763098716736, |
|
"learning_rate": 2.322732273227323e-05, |
|
"loss": 0.6523, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.2777242044358727, |
|
"grad_norm": 0.8932089805603027, |
|
"learning_rate": 2.3202320232023203e-05, |
|
"loss": 0.6305, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.2815814850530376, |
|
"grad_norm": 0.9854605197906494, |
|
"learning_rate": 2.317731773177318e-05, |
|
"loss": 0.6358, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.2854387656702025, |
|
"grad_norm": 0.8158785700798035, |
|
"learning_rate": 2.3152315231523154e-05, |
|
"loss": 0.6027, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.2892960462873674, |
|
"grad_norm": 0.9273302555084229, |
|
"learning_rate": 2.312731273127313e-05, |
|
"loss": 0.6431, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.29315332690453233, |
|
"grad_norm": 0.9094042181968689, |
|
"learning_rate": 2.31023102310231e-05, |
|
"loss": 0.5767, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.2970106075216972, |
|
"grad_norm": 0.8175253868103027, |
|
"learning_rate": 2.307730773077308e-05, |
|
"loss": 0.6174, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.3008678881388621, |
|
"grad_norm": 0.8517961502075195, |
|
"learning_rate": 2.3052305230523052e-05, |
|
"loss": 0.6183, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.304725168756027, |
|
"grad_norm": 0.8863179087638855, |
|
"learning_rate": 2.3027302730273028e-05, |
|
"loss": 0.5849, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.3085824493731919, |
|
"grad_norm": 0.9195278882980347, |
|
"learning_rate": 2.3002300230023003e-05, |
|
"loss": 0.6016, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3085824493731919, |
|
"eval_loss": 0.6320463418960571, |
|
"eval_runtime": 94.3592, |
|
"eval_samples_per_second": 54.95, |
|
"eval_steps_per_second": 6.878, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3124397299903568, |
|
"grad_norm": 0.9424280524253845, |
|
"learning_rate": 2.297729772977298e-05, |
|
"loss": 0.5961, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.3162970106075217, |
|
"grad_norm": 1.031079888343811, |
|
"learning_rate": 2.295229522952295e-05, |
|
"loss": 0.6357, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.3201542912246866, |
|
"grad_norm": 0.9320313334465027, |
|
"learning_rate": 2.292729272927293e-05, |
|
"loss": 0.6228, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.3240115718418515, |
|
"grad_norm": 0.9292299747467041, |
|
"learning_rate": 2.2902290229022905e-05, |
|
"loss": 0.6504, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.32786885245901637, |
|
"grad_norm": 0.8377825021743774, |
|
"learning_rate": 2.2877287728772877e-05, |
|
"loss": 0.5952, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.3317261330761813, |
|
"grad_norm": 0.8555241227149963, |
|
"learning_rate": 2.2852285228522856e-05, |
|
"loss": 0.5852, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.3355834136933462, |
|
"grad_norm": 1.0691065788269043, |
|
"learning_rate": 2.2827282728272828e-05, |
|
"loss": 0.5806, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.3394406943105111, |
|
"grad_norm": 1.0052144527435303, |
|
"learning_rate": 2.2802280228022803e-05, |
|
"loss": 0.6592, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.343297974927676, |
|
"grad_norm": 1.000553011894226, |
|
"learning_rate": 2.277727772777278e-05, |
|
"loss": 0.6347, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.3471552555448409, |
|
"grad_norm": 1.13107430934906, |
|
"learning_rate": 2.2752275227522754e-05, |
|
"loss": 0.5989, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3471552555448409, |
|
"eval_loss": 0.6231358647346497, |
|
"eval_runtime": 94.3809, |
|
"eval_samples_per_second": 54.937, |
|
"eval_steps_per_second": 6.876, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3510125361620058, |
|
"grad_norm": 1.0130326747894287, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 0.6227, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.3548698167791707, |
|
"grad_norm": 1.0335384607315063, |
|
"learning_rate": 2.2702270227022705e-05, |
|
"loss": 0.5277, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.35872709739633557, |
|
"grad_norm": 0.9162185788154602, |
|
"learning_rate": 2.2677267726772677e-05, |
|
"loss": 0.5633, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.36258437801350046, |
|
"grad_norm": 0.9492796063423157, |
|
"learning_rate": 2.2652265226522652e-05, |
|
"loss": 0.6536, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.36644165863066536, |
|
"grad_norm": 1.0065137147903442, |
|
"learning_rate": 2.2627262726272628e-05, |
|
"loss": 0.6622, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.3702989392478303, |
|
"grad_norm": 0.917143702507019, |
|
"learning_rate": 2.2602260226022603e-05, |
|
"loss": 0.6391, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.3741562198649952, |
|
"grad_norm": 0.9580853581428528, |
|
"learning_rate": 2.257725772577258e-05, |
|
"loss": 0.6354, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.3780135004821601, |
|
"grad_norm": 1.1998488903045654, |
|
"learning_rate": 2.2552255225522554e-05, |
|
"loss": 0.5885, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.381870781099325, |
|
"grad_norm": 0.9667923450469971, |
|
"learning_rate": 2.252725272527253e-05, |
|
"loss": 0.6199, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.3857280617164899, |
|
"grad_norm": 0.9675014019012451, |
|
"learning_rate": 2.2502250225022505e-05, |
|
"loss": 0.5522, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3857280617164899, |
|
"eval_loss": 0.6154375672340393, |
|
"eval_runtime": 94.3972, |
|
"eval_samples_per_second": 54.927, |
|
"eval_steps_per_second": 6.875, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.38958534233365477, |
|
"grad_norm": 1.035885214805603, |
|
"learning_rate": 2.247724772477248e-05, |
|
"loss": 0.5868, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.39344262295081966, |
|
"grad_norm": 1.1226266622543335, |
|
"learning_rate": 2.2452245224522452e-05, |
|
"loss": 0.5787, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.39729990356798456, |
|
"grad_norm": 1.0908483266830444, |
|
"learning_rate": 2.2427242724272428e-05, |
|
"loss": 0.6161, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.40115718418514945, |
|
"grad_norm": 0.9660767316818237, |
|
"learning_rate": 2.2402240224022403e-05, |
|
"loss": 0.6277, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.40501446480231434, |
|
"grad_norm": 0.9711313843727112, |
|
"learning_rate": 2.237723772377238e-05, |
|
"loss": 0.6062, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.4088717454194793, |
|
"grad_norm": 0.9374969601631165, |
|
"learning_rate": 2.2352235223522354e-05, |
|
"loss": 0.6299, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.4127290260366442, |
|
"grad_norm": 1.0570039749145508, |
|
"learning_rate": 2.232723272327233e-05, |
|
"loss": 0.5965, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.4165863066538091, |
|
"grad_norm": 1.0144932270050049, |
|
"learning_rate": 2.23022302230223e-05, |
|
"loss": 0.5479, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.420443587270974, |
|
"grad_norm": 0.9654034972190857, |
|
"learning_rate": 2.227722772277228e-05, |
|
"loss": 0.5768, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.42430086788813887, |
|
"grad_norm": 0.9580025672912598, |
|
"learning_rate": 2.2252225222522252e-05, |
|
"loss": 0.6023, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.42430086788813887, |
|
"eval_loss": 0.6073106527328491, |
|
"eval_runtime": 94.4156, |
|
"eval_samples_per_second": 54.917, |
|
"eval_steps_per_second": 6.874, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.42815814850530376, |
|
"grad_norm": 1.0227288007736206, |
|
"learning_rate": 2.2227222722272228e-05, |
|
"loss": 0.5824, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.43201542912246865, |
|
"grad_norm": 0.977800726890564, |
|
"learning_rate": 2.2202220222022203e-05, |
|
"loss": 0.5629, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.43587270973963355, |
|
"grad_norm": 0.9433587789535522, |
|
"learning_rate": 2.217721772177218e-05, |
|
"loss": 0.5774, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.43972999035679844, |
|
"grad_norm": 1.0534788370132446, |
|
"learning_rate": 2.2152215221522154e-05, |
|
"loss": 0.6191, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.44358727097396333, |
|
"grad_norm": 0.9741374850273132, |
|
"learning_rate": 2.212721272127213e-05, |
|
"loss": 0.5989, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.4474445515911283, |
|
"grad_norm": 1.1215403079986572, |
|
"learning_rate": 2.2102210221022105e-05, |
|
"loss": 0.6547, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.4513018322082932, |
|
"grad_norm": 1.1161948442459106, |
|
"learning_rate": 2.2077207720772077e-05, |
|
"loss": 0.5999, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.45515911282545807, |
|
"grad_norm": 1.1462429761886597, |
|
"learning_rate": 2.2052205220522055e-05, |
|
"loss": 0.6458, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.45901639344262296, |
|
"grad_norm": 1.0904706716537476, |
|
"learning_rate": 2.2027202720272027e-05, |
|
"loss": 0.5839, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.46287367405978785, |
|
"grad_norm": 1.0991252660751343, |
|
"learning_rate": 2.2002200220022003e-05, |
|
"loss": 0.5403, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.46287367405978785, |
|
"eval_loss": 0.6001272797584534, |
|
"eval_runtime": 94.5589, |
|
"eval_samples_per_second": 54.834, |
|
"eval_steps_per_second": 6.863, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.46673095467695275, |
|
"grad_norm": 1.221454381942749, |
|
"learning_rate": 2.197719771977198e-05, |
|
"loss": 0.6155, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 1.0147477388381958, |
|
"learning_rate": 2.1952195219521954e-05, |
|
"loss": 0.62, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.47444551591128253, |
|
"grad_norm": 1.0702507495880127, |
|
"learning_rate": 2.1927192719271926e-05, |
|
"loss": 0.5605, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.4783027965284474, |
|
"grad_norm": 1.295518398284912, |
|
"learning_rate": 2.1902190219021905e-05, |
|
"loss": 0.5065, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.4821600771456123, |
|
"grad_norm": 1.1323541402816772, |
|
"learning_rate": 2.1877187718771877e-05, |
|
"loss": 0.5726, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.48601735776277727, |
|
"grad_norm": 0.9562482833862305, |
|
"learning_rate": 2.1852185218521852e-05, |
|
"loss": 0.5683, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.48987463837994216, |
|
"grad_norm": 1.129547119140625, |
|
"learning_rate": 2.1827182718271827e-05, |
|
"loss": 0.5732, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.49373191899710706, |
|
"grad_norm": 1.0175765752792358, |
|
"learning_rate": 2.1802180218021803e-05, |
|
"loss": 0.5251, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.49758919961427195, |
|
"grad_norm": 1.1538267135620117, |
|
"learning_rate": 2.177717771777178e-05, |
|
"loss": 0.5798, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.5014464802314368, |
|
"grad_norm": 1.1203854084014893, |
|
"learning_rate": 2.1752175217521754e-05, |
|
"loss": 0.535, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5014464802314368, |
|
"eval_loss": 0.593771755695343, |
|
"eval_runtime": 94.4579, |
|
"eval_samples_per_second": 54.892, |
|
"eval_steps_per_second": 6.871, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5053037608486017, |
|
"grad_norm": 1.158937692642212, |
|
"learning_rate": 2.172717271727173e-05, |
|
"loss": 0.5667, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.5091610414657667, |
|
"grad_norm": 1.1078110933303833, |
|
"learning_rate": 2.17021702170217e-05, |
|
"loss": 0.6097, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.5130183220829315, |
|
"grad_norm": 1.1934500932693481, |
|
"learning_rate": 2.167716771677168e-05, |
|
"loss": 0.5478, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.5168756027000965, |
|
"grad_norm": 1.048662781715393, |
|
"learning_rate": 2.1652165216521652e-05, |
|
"loss": 0.5753, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.5207328833172613, |
|
"grad_norm": 1.0503116846084595, |
|
"learning_rate": 2.1627162716271627e-05, |
|
"loss": 0.5762, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.5245901639344263, |
|
"grad_norm": 1.1861109733581543, |
|
"learning_rate": 2.1602160216021603e-05, |
|
"loss": 0.5808, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.5284474445515911, |
|
"grad_norm": 1.178539752960205, |
|
"learning_rate": 2.1577157715771578e-05, |
|
"loss": 0.5584, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.532304725168756, |
|
"grad_norm": 1.0662671327590942, |
|
"learning_rate": 2.1552155215521554e-05, |
|
"loss": 0.5535, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.5361620057859209, |
|
"grad_norm": 1.1202431917190552, |
|
"learning_rate": 2.152715271527153e-05, |
|
"loss": 0.5555, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.5400192864030858, |
|
"grad_norm": 1.1992982625961304, |
|
"learning_rate": 2.15021502150215e-05, |
|
"loss": 0.5712, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5400192864030858, |
|
"eval_loss": 0.5875272750854492, |
|
"eval_runtime": 94.4312, |
|
"eval_samples_per_second": 54.908, |
|
"eval_steps_per_second": 6.873, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5438765670202508, |
|
"grad_norm": 1.1259962320327759, |
|
"learning_rate": 2.147714771477148e-05, |
|
"loss": 0.5676, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.5477338476374156, |
|
"grad_norm": 1.0652165412902832, |
|
"learning_rate": 2.1452145214521452e-05, |
|
"loss": 0.5551, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.5515911282545806, |
|
"grad_norm": 1.1056393384933472, |
|
"learning_rate": 2.1427142714271427e-05, |
|
"loss": 0.508, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.5554484088717454, |
|
"grad_norm": 1.1506450176239014, |
|
"learning_rate": 2.1402140214021403e-05, |
|
"loss": 0.582, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.5593056894889104, |
|
"grad_norm": 1.4107190370559692, |
|
"learning_rate": 2.1377137713771378e-05, |
|
"loss": 0.5821, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.5631629701060752, |
|
"grad_norm": 1.2830005884170532, |
|
"learning_rate": 2.1352135213521354e-05, |
|
"loss": 0.5451, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.5670202507232401, |
|
"grad_norm": 1.1122502088546753, |
|
"learning_rate": 2.132713271327133e-05, |
|
"loss": 0.5905, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.570877531340405, |
|
"grad_norm": 1.1104683876037598, |
|
"learning_rate": 2.1302130213021305e-05, |
|
"loss": 0.6189, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.5747348119575699, |
|
"grad_norm": 1.2569029331207275, |
|
"learning_rate": 2.1277127712771277e-05, |
|
"loss": 0.5717, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.5785920925747348, |
|
"grad_norm": 1.1278156042099, |
|
"learning_rate": 2.1252125212521255e-05, |
|
"loss": 0.5686, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5785920925747348, |
|
"eval_loss": 0.5797137022018433, |
|
"eval_runtime": 94.4199, |
|
"eval_samples_per_second": 54.914, |
|
"eval_steps_per_second": 6.874, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5824493731918997, |
|
"grad_norm": 1.075393795967102, |
|
"learning_rate": 2.1227122712271227e-05, |
|
"loss": 0.5849, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.5863066538090647, |
|
"grad_norm": 1.2325960397720337, |
|
"learning_rate": 2.1202120212021203e-05, |
|
"loss": 0.5706, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.5901639344262295, |
|
"grad_norm": 1.1058759689331055, |
|
"learning_rate": 2.1177117711771178e-05, |
|
"loss": 0.5706, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.5940212150433944, |
|
"grad_norm": 1.1634057760238647, |
|
"learning_rate": 2.1152115211521154e-05, |
|
"loss": 0.5518, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.5978784956605593, |
|
"grad_norm": 1.0119497776031494, |
|
"learning_rate": 2.1127112711271126e-05, |
|
"loss": 0.5104, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.6017357762777242, |
|
"grad_norm": 1.2648943662643433, |
|
"learning_rate": 2.1102110211021104e-05, |
|
"loss": 0.5261, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.6055930568948891, |
|
"grad_norm": 1.2454555034637451, |
|
"learning_rate": 2.1077107710771077e-05, |
|
"loss": 0.5633, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.609450337512054, |
|
"grad_norm": 1.1793566942214966, |
|
"learning_rate": 2.1052105210521052e-05, |
|
"loss": 0.535, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.6133076181292189, |
|
"grad_norm": 1.5229750871658325, |
|
"learning_rate": 2.102710271027103e-05, |
|
"loss": 0.5559, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.6171648987463838, |
|
"grad_norm": 1.2203059196472168, |
|
"learning_rate": 2.1002100210021003e-05, |
|
"loss": 0.5315, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6171648987463838, |
|
"eval_loss": 0.5732572078704834, |
|
"eval_runtime": 94.4141, |
|
"eval_samples_per_second": 54.918, |
|
"eval_steps_per_second": 6.874, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6210221793635486, |
|
"grad_norm": 1.4130253791809082, |
|
"learning_rate": 2.0977097709770978e-05, |
|
"loss": 0.5521, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.6248794599807136, |
|
"grad_norm": 1.2830981016159058, |
|
"learning_rate": 2.0952095209520954e-05, |
|
"loss": 0.5432, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.6287367405978785, |
|
"grad_norm": 1.1956433057785034, |
|
"learning_rate": 2.092709270927093e-05, |
|
"loss": 0.5746, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.6325940212150434, |
|
"grad_norm": 1.5104076862335205, |
|
"learning_rate": 2.09020902090209e-05, |
|
"loss": 0.5916, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.6364513018322083, |
|
"grad_norm": 1.2112847566604614, |
|
"learning_rate": 2.087708770877088e-05, |
|
"loss": 0.5322, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.6403085824493732, |
|
"grad_norm": 1.1859279870986938, |
|
"learning_rate": 2.0852085208520852e-05, |
|
"loss": 0.599, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.6441658630665381, |
|
"grad_norm": 1.348300576210022, |
|
"learning_rate": 2.0827082708270827e-05, |
|
"loss": 0.605, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.648023143683703, |
|
"grad_norm": 1.3982155323028564, |
|
"learning_rate": 2.0802080208020803e-05, |
|
"loss": 0.5367, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.6518804243008679, |
|
"grad_norm": 1.2189476490020752, |
|
"learning_rate": 2.0777077707770778e-05, |
|
"loss": 0.5855, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.6557377049180327, |
|
"grad_norm": 1.3908072710037231, |
|
"learning_rate": 2.0752075207520754e-05, |
|
"loss": 0.5876, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6557377049180327, |
|
"eval_loss": 0.5670270919799805, |
|
"eval_runtime": 94.4393, |
|
"eval_samples_per_second": 54.903, |
|
"eval_steps_per_second": 6.872, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6595949855351977, |
|
"grad_norm": 1.150038480758667, |
|
"learning_rate": 2.072707270727073e-05, |
|
"loss": 0.5151, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.6634522661523626, |
|
"grad_norm": 1.2351560592651367, |
|
"learning_rate": 2.07020702070207e-05, |
|
"loss": 0.5171, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.6673095467695275, |
|
"grad_norm": 1.2720533609390259, |
|
"learning_rate": 2.0677067706770676e-05, |
|
"loss": 0.5526, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.6711668273866924, |
|
"grad_norm": 1.2330290079116821, |
|
"learning_rate": 2.0652065206520655e-05, |
|
"loss": 0.5516, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.6750241080038573, |
|
"grad_norm": 1.319873571395874, |
|
"learning_rate": 2.0627062706270627e-05, |
|
"loss": 0.5512, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.6788813886210222, |
|
"grad_norm": 1.663527250289917, |
|
"learning_rate": 2.0602060206020603e-05, |
|
"loss": 0.556, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.682738669238187, |
|
"grad_norm": 1.2730813026428223, |
|
"learning_rate": 2.0577057705770578e-05, |
|
"loss": 0.5362, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.686595949855352, |
|
"grad_norm": 1.2985719442367554, |
|
"learning_rate": 2.0552055205520554e-05, |
|
"loss": 0.6448, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.6904532304725168, |
|
"grad_norm": 1.384941577911377, |
|
"learning_rate": 2.052705270527053e-05, |
|
"loss": 0.5767, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.6943105110896818, |
|
"grad_norm": 1.2721012830734253, |
|
"learning_rate": 2.0502050205020504e-05, |
|
"loss": 0.6248, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6943105110896818, |
|
"eval_loss": 0.560900092124939, |
|
"eval_runtime": 94.4846, |
|
"eval_samples_per_second": 54.877, |
|
"eval_steps_per_second": 6.869, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6981677917068466, |
|
"grad_norm": 1.3880654573440552, |
|
"learning_rate": 2.0477047704770476e-05, |
|
"loss": 0.5389, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.7020250723240116, |
|
"grad_norm": 1.2518627643585205, |
|
"learning_rate": 2.0452045204520455e-05, |
|
"loss": 0.566, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 1.4524362087249756, |
|
"learning_rate": 2.0427042704270427e-05, |
|
"loss": 0.5105, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.7097396335583414, |
|
"grad_norm": 1.2816158533096313, |
|
"learning_rate": 2.0402040204020403e-05, |
|
"loss": 0.5308, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.7135969141755063, |
|
"grad_norm": 1.286135196685791, |
|
"learning_rate": 2.0377037703770378e-05, |
|
"loss": 0.5273, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.7174541947926711, |
|
"grad_norm": 1.4501844644546509, |
|
"learning_rate": 2.0352035203520354e-05, |
|
"loss": 0.5622, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.7213114754098361, |
|
"grad_norm": 1.3340784311294556, |
|
"learning_rate": 2.0327032703270326e-05, |
|
"loss": 0.6137, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.7251687560270009, |
|
"grad_norm": 1.439643383026123, |
|
"learning_rate": 2.0302030203020304e-05, |
|
"loss": 0.5846, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.7290260366441659, |
|
"grad_norm": 1.2474430799484253, |
|
"learning_rate": 2.027702770277028e-05, |
|
"loss": 0.5519, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.7328833172613307, |
|
"grad_norm": 1.0996040105819702, |
|
"learning_rate": 2.0252025202520252e-05, |
|
"loss": 0.5481, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7328833172613307, |
|
"eval_loss": 0.5548669695854187, |
|
"eval_runtime": 94.5252, |
|
"eval_samples_per_second": 54.853, |
|
"eval_steps_per_second": 6.866, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7367405978784957, |
|
"grad_norm": 1.5467498302459717, |
|
"learning_rate": 2.022702270227023e-05, |
|
"loss": 0.546, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.7405978784956606, |
|
"grad_norm": 1.4486864805221558, |
|
"learning_rate": 2.0202020202020203e-05, |
|
"loss": 0.5239, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.7444551591128254, |
|
"grad_norm": 1.3535338640213013, |
|
"learning_rate": 2.0177017701770178e-05, |
|
"loss": 0.5733, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.7483124397299904, |
|
"grad_norm": 1.4148615598678589, |
|
"learning_rate": 2.0152015201520154e-05, |
|
"loss": 0.5177, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.7521697203471552, |
|
"grad_norm": 1.5134552717208862, |
|
"learning_rate": 2.012701270127013e-05, |
|
"loss": 0.5643, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.7560270009643202, |
|
"grad_norm": 1.5626767873764038, |
|
"learning_rate": 2.01020102010201e-05, |
|
"loss": 0.5317, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.759884281581485, |
|
"grad_norm": 1.3729217052459717, |
|
"learning_rate": 2.007700770077008e-05, |
|
"loss": 0.5859, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.76374156219865, |
|
"grad_norm": 1.5823298692703247, |
|
"learning_rate": 2.0052005200520052e-05, |
|
"loss": 0.517, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.7675988428158148, |
|
"grad_norm": 1.4126390218734741, |
|
"learning_rate": 2.0027002700270027e-05, |
|
"loss": 0.578, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.7714561234329798, |
|
"grad_norm": 1.5024161338806152, |
|
"learning_rate": 2.0002000200020003e-05, |
|
"loss": 0.4779, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7714561234329798, |
|
"eval_loss": 0.548928439617157, |
|
"eval_runtime": 94.5508, |
|
"eval_samples_per_second": 54.838, |
|
"eval_steps_per_second": 6.864, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7753134040501446, |
|
"grad_norm": 1.4644631147384644, |
|
"learning_rate": 1.9976997699769978e-05, |
|
"loss": 0.545, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.7791706846673095, |
|
"grad_norm": 1.394882082939148, |
|
"learning_rate": 1.995199519951995e-05, |
|
"loss": 0.5502, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.7830279652844745, |
|
"grad_norm": 1.4921457767486572, |
|
"learning_rate": 1.992699269926993e-05, |
|
"loss": 0.6197, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.7868852459016393, |
|
"grad_norm": 1.3136405944824219, |
|
"learning_rate": 1.9901990199019904e-05, |
|
"loss": 0.5296, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.7907425265188043, |
|
"grad_norm": 1.5223480463027954, |
|
"learning_rate": 1.9876987698769876e-05, |
|
"loss": 0.4991, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.7945998071359691, |
|
"grad_norm": 1.4527870416641235, |
|
"learning_rate": 1.9851985198519855e-05, |
|
"loss": 0.5194, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.7984570877531341, |
|
"grad_norm": 1.4777238368988037, |
|
"learning_rate": 1.9826982698269827e-05, |
|
"loss": 0.5511, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.8023143683702989, |
|
"grad_norm": 1.8136184215545654, |
|
"learning_rate": 1.9801980198019803e-05, |
|
"loss": 0.5819, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.8061716489874639, |
|
"grad_norm": 1.7190624475479126, |
|
"learning_rate": 1.9776977697769778e-05, |
|
"loss": 0.5725, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.8100289296046287, |
|
"grad_norm": 1.2566032409667969, |
|
"learning_rate": 1.9751975197519753e-05, |
|
"loss": 0.5471, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8100289296046287, |
|
"eval_loss": 0.5430962443351746, |
|
"eval_runtime": 94.4905, |
|
"eval_samples_per_second": 54.873, |
|
"eval_steps_per_second": 6.868, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8138862102217936, |
|
"grad_norm": 1.1948508024215698, |
|
"learning_rate": 1.9726972697269725e-05, |
|
"loss": 0.5449, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.8177434908389586, |
|
"grad_norm": 1.355807900428772, |
|
"learning_rate": 1.9701970197019704e-05, |
|
"loss": 0.5238, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.8216007714561234, |
|
"grad_norm": 1.4238370656967163, |
|
"learning_rate": 1.9676967696769676e-05, |
|
"loss": 0.5425, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.8254580520732884, |
|
"grad_norm": 1.5667427778244019, |
|
"learning_rate": 1.9651965196519652e-05, |
|
"loss": 0.5571, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.8293153326904532, |
|
"grad_norm": 1.5513569116592407, |
|
"learning_rate": 1.9626962696269627e-05, |
|
"loss": 0.5631, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.8331726133076182, |
|
"grad_norm": 1.3871880769729614, |
|
"learning_rate": 1.9601960196019603e-05, |
|
"loss": 0.5687, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.837029893924783, |
|
"grad_norm": 1.4342153072357178, |
|
"learning_rate": 1.9576957695769578e-05, |
|
"loss": 0.5193, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.840887174541948, |
|
"grad_norm": 1.4925063848495483, |
|
"learning_rate": 1.9551955195519553e-05, |
|
"loss": 0.5548, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.8447444551591128, |
|
"grad_norm": 1.5816041231155396, |
|
"learning_rate": 1.952695269526953e-05, |
|
"loss": 0.5538, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.8486017357762777, |
|
"grad_norm": 1.803604006767273, |
|
"learning_rate": 1.9501950195019504e-05, |
|
"loss": 0.4947, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.8486017357762777, |
|
"eval_loss": 0.5378134846687317, |
|
"eval_runtime": 94.5121, |
|
"eval_samples_per_second": 54.861, |
|
"eval_steps_per_second": 6.867, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.8524590163934426, |
|
"grad_norm": 1.5246657133102417, |
|
"learning_rate": 1.947694769476948e-05, |
|
"loss": 0.5183, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.8563162970106075, |
|
"grad_norm": 1.4470975399017334, |
|
"learning_rate": 1.9451945194519452e-05, |
|
"loss": 0.5197, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.8601735776277725, |
|
"grad_norm": 1.6767865419387817, |
|
"learning_rate": 1.9426942694269427e-05, |
|
"loss": 0.5654, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.8640308582449373, |
|
"grad_norm": 1.5155974626541138, |
|
"learning_rate": 1.9401940194019403e-05, |
|
"loss": 0.6042, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.8678881388621023, |
|
"grad_norm": 1.6148077249526978, |
|
"learning_rate": 1.9376937693769378e-05, |
|
"loss": 0.5055, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.8717454194792671, |
|
"grad_norm": 1.5768954753875732, |
|
"learning_rate": 1.9351935193519353e-05, |
|
"loss": 0.4966, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.875602700096432, |
|
"grad_norm": 1.5010885000228882, |
|
"learning_rate": 1.932693269326933e-05, |
|
"loss": 0.501, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.8794599807135969, |
|
"grad_norm": 1.661967158317566, |
|
"learning_rate": 1.93019301930193e-05, |
|
"loss": 0.5405, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.8833172613307618, |
|
"grad_norm": 1.5393158197402954, |
|
"learning_rate": 1.927692769276928e-05, |
|
"loss": 0.5544, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.8871745419479267, |
|
"grad_norm": 1.7475782632827759, |
|
"learning_rate": 1.9251925192519252e-05, |
|
"loss": 0.6173, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8871745419479267, |
|
"eval_loss": 0.5320296287536621, |
|
"eval_runtime": 94.5369, |
|
"eval_samples_per_second": 54.846, |
|
"eval_steps_per_second": 6.865, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8910318225650916, |
|
"grad_norm": 1.3934800624847412, |
|
"learning_rate": 1.9226922692269227e-05, |
|
"loss": 0.5125, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.8948891031822566, |
|
"grad_norm": 1.6484580039978027, |
|
"learning_rate": 1.9201920192019203e-05, |
|
"loss": 0.494, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.8987463837994214, |
|
"grad_norm": 1.6516157388687134, |
|
"learning_rate": 1.9176917691769178e-05, |
|
"loss": 0.5253, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.9026036644165863, |
|
"grad_norm": 1.5073869228363037, |
|
"learning_rate": 1.9151915191519153e-05, |
|
"loss": 0.5516, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.9064609450337512, |
|
"grad_norm": 1.582481026649475, |
|
"learning_rate": 1.912691269126913e-05, |
|
"loss": 0.5621, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.9103182256509161, |
|
"grad_norm": 1.4449944496154785, |
|
"learning_rate": 1.9101910191019104e-05, |
|
"loss": 0.5494, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.914175506268081, |
|
"grad_norm": 1.7907747030258179, |
|
"learning_rate": 1.9076907690769076e-05, |
|
"loss": 0.5404, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.9180327868852459, |
|
"grad_norm": 1.719509243965149, |
|
"learning_rate": 1.9051905190519055e-05, |
|
"loss": 0.5283, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.9218900675024108, |
|
"grad_norm": 1.5800633430480957, |
|
"learning_rate": 1.9026902690269027e-05, |
|
"loss": 0.5292, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.9257473481195757, |
|
"grad_norm": 1.4846770763397217, |
|
"learning_rate": 1.9001900190019003e-05, |
|
"loss": 0.524, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.9257473481195757, |
|
"eval_loss": 0.5241175889968872, |
|
"eval_runtime": 94.4587, |
|
"eval_samples_per_second": 54.892, |
|
"eval_steps_per_second": 6.871, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.9296046287367405, |
|
"grad_norm": 1.7714641094207764, |
|
"learning_rate": 1.8976897689768978e-05, |
|
"loss": 0.4915, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.9334619093539055, |
|
"grad_norm": 1.964656114578247, |
|
"learning_rate": 1.8951895189518953e-05, |
|
"loss": 0.4874, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.9373191899710704, |
|
"grad_norm": 1.6763602495193481, |
|
"learning_rate": 1.8926892689268925e-05, |
|
"loss": 0.5526, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 1.6096868515014648, |
|
"learning_rate": 1.8901890189018904e-05, |
|
"loss": 0.5101, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.9450337512054002, |
|
"grad_norm": 1.5164107084274292, |
|
"learning_rate": 1.8876887688768876e-05, |
|
"loss": 0.5307, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.9488910318225651, |
|
"grad_norm": 1.4356317520141602, |
|
"learning_rate": 1.885188518851885e-05, |
|
"loss": 0.4733, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.95274831243973, |
|
"grad_norm": 1.6256446838378906, |
|
"learning_rate": 1.8826882688268827e-05, |
|
"loss": 0.5726, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.9566055930568949, |
|
"grad_norm": 1.5358326435089111, |
|
"learning_rate": 1.8801880188018802e-05, |
|
"loss": 0.5134, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.9604628736740598, |
|
"grad_norm": 1.862509846687317, |
|
"learning_rate": 1.8776877687768778e-05, |
|
"loss": 0.5277, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.9643201542912246, |
|
"grad_norm": 1.7659302949905396, |
|
"learning_rate": 1.8751875187518753e-05, |
|
"loss": 0.5523, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.9643201542912246, |
|
"eval_loss": 0.5180462002754211, |
|
"eval_runtime": 94.4176, |
|
"eval_samples_per_second": 54.916, |
|
"eval_steps_per_second": 6.874, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.9681774349083896, |
|
"grad_norm": 1.5947084426879883, |
|
"learning_rate": 1.872687268726873e-05, |
|
"loss": 0.5419, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.9720347155255545, |
|
"grad_norm": 1.829914927482605, |
|
"learning_rate": 1.87018701870187e-05, |
|
"loss": 0.5897, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.9758919961427194, |
|
"grad_norm": 1.3083444833755493, |
|
"learning_rate": 1.867686768676868e-05, |
|
"loss": 0.4932, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.9797492767598843, |
|
"grad_norm": 1.5652191638946533, |
|
"learning_rate": 1.865186518651865e-05, |
|
"loss": 0.4967, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.9836065573770492, |
|
"grad_norm": 1.7959744930267334, |
|
"learning_rate": 1.8626862686268627e-05, |
|
"loss": 0.4934, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.9874638379942141, |
|
"grad_norm": 1.6218141317367554, |
|
"learning_rate": 1.8601860186018602e-05, |
|
"loss": 0.4809, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.991321118611379, |
|
"grad_norm": 1.641104817390442, |
|
"learning_rate": 1.8576857685768578e-05, |
|
"loss": 0.4789, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.9951783992285439, |
|
"grad_norm": 1.732410192489624, |
|
"learning_rate": 1.8551855185518553e-05, |
|
"loss": 0.4998, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.9990356798457087, |
|
"grad_norm": 1.8680731058120728, |
|
"learning_rate": 1.852685268526853e-05, |
|
"loss": 0.5097, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.0028929604628736, |
|
"grad_norm": 1.7208608388900757, |
|
"learning_rate": 1.85018501850185e-05, |
|
"loss": 0.4809, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.0028929604628736, |
|
"eval_loss": 0.5124805569648743, |
|
"eval_runtime": 94.4179, |
|
"eval_samples_per_second": 54.915, |
|
"eval_steps_per_second": 6.874, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.0067502410800386, |
|
"grad_norm": 1.9916785955429077, |
|
"learning_rate": 1.847684768476848e-05, |
|
"loss": 0.4324, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.0106075216972035, |
|
"grad_norm": 1.5762462615966797, |
|
"learning_rate": 1.845184518451845e-05, |
|
"loss": 0.4817, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.0144648023143683, |
|
"grad_norm": 2.0109360218048096, |
|
"learning_rate": 1.8426842684268427e-05, |
|
"loss": 0.441, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.0183220829315334, |
|
"grad_norm": 1.7828129529953003, |
|
"learning_rate": 1.8401840184018402e-05, |
|
"loss": 0.4551, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.0221793635486982, |
|
"grad_norm": 1.7471317052841187, |
|
"learning_rate": 1.8376837683768378e-05, |
|
"loss": 0.3956, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.026036644165863, |
|
"grad_norm": 1.9026498794555664, |
|
"learning_rate": 1.8351835183518353e-05, |
|
"loss": 0.4544, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.0298939247830279, |
|
"grad_norm": 1.9493508338928223, |
|
"learning_rate": 1.832683268326833e-05, |
|
"loss": 0.4609, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.033751205400193, |
|
"grad_norm": 1.8381072282791138, |
|
"learning_rate": 1.8301830183018304e-05, |
|
"loss": 0.4321, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.0376084860173578, |
|
"grad_norm": 1.5527135133743286, |
|
"learning_rate": 1.8276827682768276e-05, |
|
"loss": 0.4112, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.0414657666345226, |
|
"grad_norm": 2.231661319732666, |
|
"learning_rate": 1.8251825182518255e-05, |
|
"loss": 0.4279, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.0414657666345226, |
|
"eval_loss": 0.5103564262390137, |
|
"eval_runtime": 94.417, |
|
"eval_samples_per_second": 54.916, |
|
"eval_steps_per_second": 6.874, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.0453230472516875, |
|
"grad_norm": 3.195507049560547, |
|
"learning_rate": 1.8226822682268227e-05, |
|
"loss": 0.4678, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.0491803278688525, |
|
"grad_norm": 1.8608683347702026, |
|
"learning_rate": 1.8201820182018202e-05, |
|
"loss": 0.4831, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.0530376084860174, |
|
"grad_norm": 2.1820995807647705, |
|
"learning_rate": 1.8176817681768178e-05, |
|
"loss": 0.4, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.0568948891031822, |
|
"grad_norm": 1.7552732229232788, |
|
"learning_rate": 1.8151815181518153e-05, |
|
"loss": 0.4431, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.0607521697203472, |
|
"grad_norm": 2.040696859359741, |
|
"learning_rate": 1.8126812681268125e-05, |
|
"loss": 0.528, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.064609450337512, |
|
"grad_norm": 1.7921245098114014, |
|
"learning_rate": 1.8101810181018104e-05, |
|
"loss": 0.449, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.068466730954677, |
|
"grad_norm": 2.0593929290771484, |
|
"learning_rate": 1.8076807680768076e-05, |
|
"loss": 0.41, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.0723240115718418, |
|
"grad_norm": 2.059739112854004, |
|
"learning_rate": 1.805180518051805e-05, |
|
"loss": 0.4451, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.0761812921890068, |
|
"grad_norm": 2.0607693195343018, |
|
"learning_rate": 1.802680268026803e-05, |
|
"loss": 0.4387, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.0800385728061717, |
|
"grad_norm": 1.7160958051681519, |
|
"learning_rate": 1.8001800180018002e-05, |
|
"loss": 0.4501, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.0800385728061717, |
|
"eval_loss": 0.5034841895103455, |
|
"eval_runtime": 94.5244, |
|
"eval_samples_per_second": 54.854, |
|
"eval_steps_per_second": 6.866, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.0838958534233365, |
|
"grad_norm": 1.879629373550415, |
|
"learning_rate": 1.7976797679767978e-05, |
|
"loss": 0.4553, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.0877531340405016, |
|
"grad_norm": 2.0610523223876953, |
|
"learning_rate": 1.7951795179517953e-05, |
|
"loss": 0.4842, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.0916104146576664, |
|
"grad_norm": 1.8454833030700684, |
|
"learning_rate": 1.792679267926793e-05, |
|
"loss": 0.4288, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.0954676952748312, |
|
"grad_norm": 1.7830801010131836, |
|
"learning_rate": 1.79017901790179e-05, |
|
"loss": 0.4552, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.099324975891996, |
|
"grad_norm": 1.7110368013381958, |
|
"learning_rate": 1.787678767876788e-05, |
|
"loss": 0.4557, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.1031822565091611, |
|
"grad_norm": 2.69413161277771, |
|
"learning_rate": 1.785178517851785e-05, |
|
"loss": 0.5252, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.107039537126326, |
|
"grad_norm": 2.2572829723358154, |
|
"learning_rate": 1.7826782678267827e-05, |
|
"loss": 0.5042, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.1108968177434908, |
|
"grad_norm": 2.144115447998047, |
|
"learning_rate": 1.7801780178017802e-05, |
|
"loss": 0.4615, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.1147540983606556, |
|
"grad_norm": 1.661698818206787, |
|
"learning_rate": 1.7776777677767778e-05, |
|
"loss": 0.429, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.1186113789778207, |
|
"grad_norm": 2.2900257110595703, |
|
"learning_rate": 1.7751775177517753e-05, |
|
"loss": 0.4651, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.1186113789778207, |
|
"eval_loss": 0.4993349611759186, |
|
"eval_runtime": 94.4361, |
|
"eval_samples_per_second": 54.905, |
|
"eval_steps_per_second": 6.872, |
|
"step": 2900 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3606948147288474e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |