|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 687, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004366812227074236, |
|
"grad_norm": 1.8753604454650958, |
|
"learning_rate": 2.898550724637681e-06, |
|
"loss": 3.7085, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.021834061135371178, |
|
"grad_norm": 1.801543636467953, |
|
"learning_rate": 1.4492753623188407e-05, |
|
"loss": 3.628, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.043668122270742356, |
|
"grad_norm": 2.2358499812546344, |
|
"learning_rate": 2.8985507246376814e-05, |
|
"loss": 3.5205, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06550218340611354, |
|
"grad_norm": 3.2738653822216937, |
|
"learning_rate": 4.347826086956522e-05, |
|
"loss": 3.3516, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.08733624454148471, |
|
"grad_norm": 1.2942851623807274, |
|
"learning_rate": 5.797101449275363e-05, |
|
"loss": 2.7683, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1091703056768559, |
|
"grad_norm": 0.6794530467895241, |
|
"learning_rate": 7.246376811594203e-05, |
|
"loss": 2.4019, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.13100436681222707, |
|
"grad_norm": 1.2145036683171, |
|
"learning_rate": 8.695652173913044e-05, |
|
"loss": 2.1421, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.15283842794759825, |
|
"grad_norm": 0.8541800089072681, |
|
"learning_rate": 0.00010144927536231885, |
|
"loss": 1.9309, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.17467248908296942, |
|
"grad_norm": 0.5114546320288847, |
|
"learning_rate": 0.00011594202898550725, |
|
"loss": 1.9047, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1965065502183406, |
|
"grad_norm": 0.4569825196048275, |
|
"learning_rate": 0.00013043478260869567, |
|
"loss": 1.8231, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2183406113537118, |
|
"grad_norm": 0.5192344724636687, |
|
"learning_rate": 0.00014492753623188405, |
|
"loss": 1.7305, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.24017467248908297, |
|
"grad_norm": 0.34840209414620005, |
|
"learning_rate": 0.00015942028985507247, |
|
"loss": 1.7933, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.26200873362445415, |
|
"grad_norm": 0.3678371866239178, |
|
"learning_rate": 0.00017391304347826088, |
|
"loss": 1.6143, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2838427947598253, |
|
"grad_norm": 0.35400819586929, |
|
"learning_rate": 0.00018840579710144927, |
|
"loss": 1.7367, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.3056768558951965, |
|
"grad_norm": 0.38841557534962395, |
|
"learning_rate": 0.00019999870791268066, |
|
"loss": 1.6669, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.32751091703056767, |
|
"grad_norm": 0.4474417485361201, |
|
"learning_rate": 0.00019995348836233516, |
|
"loss": 1.601, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.34934497816593885, |
|
"grad_norm": 0.35698828488835516, |
|
"learning_rate": 0.00019984369783193688, |
|
"loss": 1.5357, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.37117903930131, |
|
"grad_norm": 0.3585527157477228, |
|
"learning_rate": 0.00019966940724729603, |
|
"loss": 1.6221, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3930131004366812, |
|
"grad_norm": 0.34800578812888927, |
|
"learning_rate": 0.0001994307292019204, |
|
"loss": 1.6741, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4148471615720524, |
|
"grad_norm": 0.35237447031832875, |
|
"learning_rate": 0.0001991278178842786, |
|
"loss": 1.5682, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.4366812227074236, |
|
"grad_norm": 0.39076567242886867, |
|
"learning_rate": 0.00019876086897819284, |
|
"loss": 1.5629, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4585152838427948, |
|
"grad_norm": 0.34983112900974217, |
|
"learning_rate": 0.00019833011953642525, |
|
"loss": 1.6151, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.48034934497816595, |
|
"grad_norm": 0.34530341844436674, |
|
"learning_rate": 0.00019783584782753918, |
|
"loss": 1.5494, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5021834061135371, |
|
"grad_norm": 0.38496791747153597, |
|
"learning_rate": 0.00019727837315613504, |
|
"loss": 1.5526, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5240174672489083, |
|
"grad_norm": 0.35057371546794386, |
|
"learning_rate": 0.00019665805565657603, |
|
"loss": 1.5933, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5458515283842795, |
|
"grad_norm": 0.35636722375213375, |
|
"learning_rate": 0.00019597529606033782, |
|
"loss": 1.5726, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5676855895196506, |
|
"grad_norm": 0.375075170102756, |
|
"learning_rate": 0.0001952305354371319, |
|
"loss": 1.572, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5895196506550219, |
|
"grad_norm": 0.38062791914729815, |
|
"learning_rate": 0.00019442425490996988, |
|
"loss": 1.5393, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.611353711790393, |
|
"grad_norm": 0.3465840806200646, |
|
"learning_rate": 0.0001935569753443532, |
|
"loss": 1.502, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6331877729257642, |
|
"grad_norm": 0.3566623409862706, |
|
"learning_rate": 0.00019262925701178866, |
|
"loss": 1.6075, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.6550218340611353, |
|
"grad_norm": 0.3513669246328785, |
|
"learning_rate": 0.00019164169922784716, |
|
"loss": 1.5125, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6768558951965066, |
|
"grad_norm": 0.34755655244144995, |
|
"learning_rate": 0.00019059493996499986, |
|
"loss": 1.6031, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6986899563318777, |
|
"grad_norm": 0.35330261801172397, |
|
"learning_rate": 0.00018948965544048128, |
|
"loss": 1.5862, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.7205240174672489, |
|
"grad_norm": 0.3540773871103764, |
|
"learning_rate": 0.00018832655967944607, |
|
"loss": 1.6557, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.74235807860262, |
|
"grad_norm": 0.3385589855658203, |
|
"learning_rate": 0.00018710640405370145, |
|
"loss": 1.5771, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7641921397379913, |
|
"grad_norm": 0.3948832031761469, |
|
"learning_rate": 0.00018582997679631315, |
|
"loss": 1.5896, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7860262008733624, |
|
"grad_norm": 0.3414806859938428, |
|
"learning_rate": 0.00018449810249239902, |
|
"loss": 1.5278, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8078602620087336, |
|
"grad_norm": 0.3306470223687427, |
|
"learning_rate": 0.00018311164154643836, |
|
"loss": 1.4916, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.8296943231441049, |
|
"grad_norm": 0.37143716375731756, |
|
"learning_rate": 0.00018167148962644193, |
|
"loss": 1.625, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.851528384279476, |
|
"grad_norm": 0.3610023951445998, |
|
"learning_rate": 0.00018017857708534107, |
|
"loss": 1.6859, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.8733624454148472, |
|
"grad_norm": 0.3969039590867808, |
|
"learning_rate": 0.00017863386835997028, |
|
"loss": 1.6366, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8951965065502183, |
|
"grad_norm": 0.3466218548962297, |
|
"learning_rate": 0.00017703836134803105, |
|
"loss": 1.4699, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.9170305676855895, |
|
"grad_norm": 0.35771965160066704, |
|
"learning_rate": 0.00017539308676343973, |
|
"loss": 1.5723, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9388646288209607, |
|
"grad_norm": 0.37691637209544226, |
|
"learning_rate": 0.00017369910747047572, |
|
"loss": 1.584, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.9606986899563319, |
|
"grad_norm": 0.40411027091446045, |
|
"learning_rate": 0.00017195751779716027, |
|
"loss": 1.6019, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.982532751091703, |
|
"grad_norm": 0.3742848841742168, |
|
"learning_rate": 0.00017016944282830933, |
|
"loss": 1.4947, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.0043668122270741, |
|
"grad_norm": 0.38407235492099207, |
|
"learning_rate": 0.00016833603767871713, |
|
"loss": 1.5812, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.0262008733624455, |
|
"grad_norm": 0.37035584561109924, |
|
"learning_rate": 0.0001664584867469403, |
|
"loss": 1.5113, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.0480349344978166, |
|
"grad_norm": 0.4171769928455508, |
|
"learning_rate": 0.0001645380029501641, |
|
"loss": 1.427, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.0698689956331877, |
|
"grad_norm": 0.4014807349448744, |
|
"learning_rate": 0.00016257582694064558, |
|
"loss": 1.488, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.091703056768559, |
|
"grad_norm": 0.41603472494317567, |
|
"learning_rate": 0.00016057322630423935, |
|
"loss": 1.4085, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.1135371179039302, |
|
"grad_norm": 0.41696638930397173, |
|
"learning_rate": 0.00015853149474152423, |
|
"loss": 1.417, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.1353711790393013, |
|
"grad_norm": 0.42289781015455985, |
|
"learning_rate": 0.0001564519512320593, |
|
"loss": 1.4374, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.1572052401746724, |
|
"grad_norm": 0.4136181415806817, |
|
"learning_rate": 0.00015433593918230955, |
|
"loss": 1.5384, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.1790393013100438, |
|
"grad_norm": 0.4304951924614819, |
|
"learning_rate": 0.00015218482555779165, |
|
"loss": 1.4184, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.2008733624454149, |
|
"grad_norm": 0.4567200712003195, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.5038, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.222707423580786, |
|
"grad_norm": 0.418159543142196, |
|
"learning_rate": 0.00014778287392868417, |
|
"loss": 1.4477, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.244541484716157, |
|
"grad_norm": 0.43554491426674474, |
|
"learning_rate": 0.0001455348796300571, |
|
"loss": 1.362, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.2663755458515285, |
|
"grad_norm": 0.4303389630106061, |
|
"learning_rate": 0.0001432574693315238, |
|
"loss": 1.5283, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.2882096069868996, |
|
"grad_norm": 0.4434624782109262, |
|
"learning_rate": 0.0001409521142635272, |
|
"loss": 1.5192, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.3100436681222707, |
|
"grad_norm": 0.4160064760695591, |
|
"learning_rate": 0.0001386203037091183, |
|
"loss": 1.5222, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.3318777292576418, |
|
"grad_norm": 0.44897734162151265, |
|
"learning_rate": 0.00013626354404186404, |
|
"loss": 1.5612, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.3537117903930131, |
|
"grad_norm": 0.4378993469096492, |
|
"learning_rate": 0.00013388335775271467, |
|
"loss": 1.4442, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.3755458515283843, |
|
"grad_norm": 0.4403089794683692, |
|
"learning_rate": 0.0001314812824664585, |
|
"loss": 1.5151, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.3973799126637554, |
|
"grad_norm": 0.44038010399377847, |
|
"learning_rate": 0.000129058869948401, |
|
"loss": 1.5635, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.4192139737991267, |
|
"grad_norm": 0.4640302354831066, |
|
"learning_rate": 0.00012661768510190816, |
|
"loss": 1.4904, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.4410480349344978, |
|
"grad_norm": 0.45432337967027536, |
|
"learning_rate": 0.00012415930495746302, |
|
"loss": 1.5045, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.462882096069869, |
|
"grad_norm": 0.451657842641418, |
|
"learning_rate": 0.00012168531765388755, |
|
"loss": 1.392, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.48471615720524, |
|
"grad_norm": 0.43285118470181244, |
|
"learning_rate": 0.00011919732141238898, |
|
"loss": 1.461, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.5065502183406112, |
|
"grad_norm": 0.4819434243733131, |
|
"learning_rate": 0.00011669692350409223, |
|
"loss": 1.6045, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.5283842794759825, |
|
"grad_norm": 0.440140827625262, |
|
"learning_rate": 0.00011418573921172635, |
|
"loss": 1.5039, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.5502183406113537, |
|
"grad_norm": 0.4475130341158297, |
|
"learning_rate": 0.00011166539078613525, |
|
"loss": 1.5163, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.572052401746725, |
|
"grad_norm": 0.4563951417730544, |
|
"learning_rate": 0.00010913750639828711, |
|
"loss": 1.4896, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.5938864628820961, |
|
"grad_norm": 0.4570029980279654, |
|
"learning_rate": 0.0001066037190874591, |
|
"loss": 1.4318, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.6157205240174672, |
|
"grad_norm": 0.45220265285594413, |
|
"learning_rate": 0.00010406566570627713, |
|
"loss": 1.484, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.6375545851528384, |
|
"grad_norm": 0.43217867151367745, |
|
"learning_rate": 0.0001015249858632926, |
|
"loss": 1.3569, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.6593886462882095, |
|
"grad_norm": 0.45524805316851796, |
|
"learning_rate": 9.898332086377805e-05, |
|
"loss": 1.5309, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.6812227074235808, |
|
"grad_norm": 0.45864967058418454, |
|
"learning_rate": 9.644231264942724e-05, |
|
"loss": 1.5568, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.703056768558952, |
|
"grad_norm": 0.46156320050595717, |
|
"learning_rate": 9.390360273764411e-05, |
|
"loss": 1.5541, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.7248908296943233, |
|
"grad_norm": 0.4424924245408702, |
|
"learning_rate": 9.136883116110542e-05, |
|
"loss": 1.4779, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.7467248908296944, |
|
"grad_norm": 0.4631844693338652, |
|
"learning_rate": 8.88396354082829e-05, |
|
"loss": 1.5041, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.7685589519650655, |
|
"grad_norm": 0.49717848531939474, |
|
"learning_rate": 8.6317649365609e-05, |
|
"loss": 1.4976, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.7903930131004366, |
|
"grad_norm": 0.44110942790393187, |
|
"learning_rate": 8.380450226196925e-05, |
|
"loss": 1.3881, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.8122270742358078, |
|
"grad_norm": 0.5089589581007182, |
|
"learning_rate": 8.130181761620392e-05, |
|
"loss": 1.4779, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.8340611353711789, |
|
"grad_norm": 0.4698355464759574, |
|
"learning_rate": 7.881121218829787e-05, |
|
"loss": 1.4198, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.8558951965065502, |
|
"grad_norm": 0.4659031789449208, |
|
"learning_rate": 7.63342949349373e-05, |
|
"loss": 1.4861, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.8777292576419216, |
|
"grad_norm": 0.44894777329606517, |
|
"learning_rate": 7.387266597010704e-05, |
|
"loss": 1.503, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.8995633187772927, |
|
"grad_norm": 0.4475374854045599, |
|
"learning_rate": 7.142791553140045e-05, |
|
"loss": 1.5077, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.9213973799126638, |
|
"grad_norm": 0.4750409585386964, |
|
"learning_rate": 6.900162295270968e-05, |
|
"loss": 1.515, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.943231441048035, |
|
"grad_norm": 0.4786776337696274, |
|
"learning_rate": 6.659535564395982e-05, |
|
"loss": 1.5167, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.965065502183406, |
|
"grad_norm": 0.5012494244267932, |
|
"learning_rate": 6.421066807854584e-05, |
|
"loss": 1.5364, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.9868995633187772, |
|
"grad_norm": 0.445678005467061, |
|
"learning_rate": 6.184910078912687e-05, |
|
"loss": 1.4215, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.0087336244541483, |
|
"grad_norm": 0.4871967245962684, |
|
"learning_rate": 5.9512179372426325e-05, |
|
"loss": 1.4481, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.03056768558952, |
|
"grad_norm": 0.48019280605703346, |
|
"learning_rate": 5.720141350368072e-05, |
|
"loss": 1.45, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.052401746724891, |
|
"grad_norm": 0.5224256236632303, |
|
"learning_rate": 5.4918295961373923e-05, |
|
"loss": 1.4061, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.074235807860262, |
|
"grad_norm": 0.5108294729537572, |
|
"learning_rate": 5.266430166288705e-05, |
|
"loss": 1.3943, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.096069868995633, |
|
"grad_norm": 0.5371496292909286, |
|
"learning_rate": 5.044088671168644e-05, |
|
"loss": 1.3578, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.1179039301310043, |
|
"grad_norm": 0.5481243225476391, |
|
"learning_rate": 4.824948745666621e-05, |
|
"loss": 1.3686, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.1397379912663754, |
|
"grad_norm": 0.5490865552703573, |
|
"learning_rate": 4.6091519564251793e-05, |
|
"loss": 1.3655, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.1615720524017465, |
|
"grad_norm": 0.5341608991319753, |
|
"learning_rate": 4.3968377103865024e-05, |
|
"loss": 1.3681, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.183406113537118, |
|
"grad_norm": 0.5539995610447883, |
|
"learning_rate": 4.1881431647341054e-05, |
|
"loss": 1.3703, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.2052401746724892, |
|
"grad_norm": 0.5617916414503629, |
|
"learning_rate": 3.9832031382878766e-05, |
|
"loss": 1.3506, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.2270742358078603, |
|
"grad_norm": 0.5457137690561883, |
|
"learning_rate": 3.7821500244097274e-05, |
|
"loss": 1.33, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.2489082969432315, |
|
"grad_norm": 0.5996161081223403, |
|
"learning_rate": 3.585113705476143e-05, |
|
"loss": 1.389, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.2707423580786026, |
|
"grad_norm": 0.5451761149567781, |
|
"learning_rate": 3.392221468972805e-05, |
|
"loss": 1.3908, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.2925764192139737, |
|
"grad_norm": 0.6118020705146557, |
|
"learning_rate": 3.203597925265598e-05, |
|
"loss": 1.3559, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.314410480349345, |
|
"grad_norm": 0.5671066923036526, |
|
"learning_rate": 3.0193649271010095e-05, |
|
"loss": 1.3478, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.3362445414847164, |
|
"grad_norm": 0.6001118016692729, |
|
"learning_rate": 2.8396414908880098e-05, |
|
"loss": 1.3509, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.3580786026200875, |
|
"grad_norm": 0.5894837707963337, |
|
"learning_rate": 2.6645437198122502e-05, |
|
"loss": 1.4214, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.3799126637554586, |
|
"grad_norm": 0.5845209440156467, |
|
"learning_rate": 2.4941847288321797e-05, |
|
"loss": 1.3788, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.4017467248908297, |
|
"grad_norm": 0.5895995554814336, |
|
"learning_rate": 2.328674571605637e-05, |
|
"loss": 1.391, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.423580786026201, |
|
"grad_norm": 0.6008227175127105, |
|
"learning_rate": 2.1681201693940668e-05, |
|
"loss": 1.4373, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.445414847161572, |
|
"grad_norm": 0.6000811157919661, |
|
"learning_rate": 2.0126252419902614e-05, |
|
"loss": 1.4406, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.467248908296943, |
|
"grad_norm": 0.5911207625229586, |
|
"learning_rate": 1.8622902407143394e-05, |
|
"loss": 1.5141, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.489082969432314, |
|
"grad_norm": 0.615377574511787, |
|
"learning_rate": 1.7172122835211337e-05, |
|
"loss": 1.3896, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.5109170305676853, |
|
"grad_norm": 0.5857047240018323, |
|
"learning_rate": 1.577485092261012e-05, |
|
"loss": 1.4468, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.532751091703057, |
|
"grad_norm": 0.6061359574672782, |
|
"learning_rate": 1.4431989321345974e-05, |
|
"loss": 1.4299, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.554585152838428, |
|
"grad_norm": 0.6360146523250017, |
|
"learning_rate": 1.3144405533805138e-05, |
|
"loss": 1.4552, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.576419213973799, |
|
"grad_norm": 0.6147809358329509, |
|
"learning_rate": 1.191293135233844e-05, |
|
"loss": 1.3359, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.5982532751091703, |
|
"grad_norm": 0.6021229536534441, |
|
"learning_rate": 1.0738362321914997e-05, |
|
"loss": 1.3927, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.6200873362445414, |
|
"grad_norm": 0.5961109142620634, |
|
"learning_rate": 9.62145722619182e-06, |
|
"loss": 1.4896, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.641921397379913, |
|
"grad_norm": 0.578602732768201, |
|
"learning_rate": 8.562937597331899e-06, |
|
"loss": 1.4565, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.6637554585152836, |
|
"grad_norm": 0.6458615704409548, |
|
"learning_rate": 7.563487249887024e-06, |
|
"loss": 1.4511, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.685589519650655, |
|
"grad_norm": 0.6022868302443363, |
|
"learning_rate": 6.623751839046455e-06, |
|
"loss": 1.3836, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.7074235807860263, |
|
"grad_norm": 0.5988174075228108, |
|
"learning_rate": 5.744338443537134e-06, |
|
"loss": 1.4891, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.7292576419213974, |
|
"grad_norm": 0.6026020303199385, |
|
"learning_rate": 4.92581517344457e-06, |
|
"loss": 1.3409, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.7510917030567685, |
|
"grad_norm": 0.5919990549417811, |
|
"learning_rate": 4.168710803207865e-06, |
|
"loss": 1.4157, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.7729257641921397, |
|
"grad_norm": 0.6038824964947008, |
|
"learning_rate": 3.473514430026026e-06, |
|
"loss": 1.4138, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.7947598253275108, |
|
"grad_norm": 0.6135553904485512, |
|
"learning_rate": 2.840675157896111e-06, |
|
"loss": 1.411, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.816593886462882, |
|
"grad_norm": 0.6332119474956278, |
|
"learning_rate": 2.2706018074875045e-06, |
|
"loss": 1.2871, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.8384279475982535, |
|
"grad_norm": 0.607982963030175, |
|
"learning_rate": 1.7636626520395105e-06, |
|
"loss": 1.4341, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.8602620087336246, |
|
"grad_norm": 0.6326939787843192, |
|
"learning_rate": 1.3201851794530373e-06, |
|
"loss": 1.3992, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.8820960698689957, |
|
"grad_norm": 0.6624316672369768, |
|
"learning_rate": 9.404558807301067e-07, |
|
"loss": 1.424, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.903930131004367, |
|
"grad_norm": 0.5890404802190057, |
|
"learning_rate": 6.247200648976991e-07, |
|
"loss": 1.3734, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.925764192139738, |
|
"grad_norm": 0.588934601286785, |
|
"learning_rate": 3.7318170053559644e-07, |
|
"loss": 1.413, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.947598253275109, |
|
"grad_norm": 0.6171879119635909, |
|
"learning_rate": 1.8600328401061629e-07, |
|
"loss": 1.3424, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.96943231441048, |
|
"grad_norm": 0.6328278329175181, |
|
"learning_rate": 6.33057345022281e-08, |
|
"loss": 1.3699, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.9912663755458517, |
|
"grad_norm": 0.609229235169678, |
|
"learning_rate": 5.1683158875937e-09, |
|
"loss": 1.3262, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 687, |
|
"total_flos": 1562393690767360.0, |
|
"train_loss": 1.5581738730184898, |
|
"train_runtime": 1339.3383, |
|
"train_samples_per_second": 32.797, |
|
"train_steps_per_second": 0.513 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 687, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1562393690767360.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|