diff --git "a/checkpoint-25500/trainer_state.json" "b/checkpoint-25500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-25500/trainer_state.json" @@ -0,0 +1,18267 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.4742868232097806, + "eval_steps": 500, + "global_step": 25500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009703085581214826, + "grad_norm": 0.5093896998931756, + "learning_rate": 5e-06, + "loss": 1.7158, + "step": 10 + }, + { + "epoch": 0.0019406171162429653, + "grad_norm": 0.5499013987843694, + "learning_rate": 1e-05, + "loss": 1.6625, + "step": 20 + }, + { + "epoch": 0.002910925674364448, + "grad_norm": 0.6551534985295564, + "learning_rate": 1.5e-05, + "loss": 1.6378, + "step": 30 + }, + { + "epoch": 0.0038812342324859306, + "grad_norm": 0.7544385328936806, + "learning_rate": 2e-05, + "loss": 1.5712, + "step": 40 + }, + { + "epoch": 0.004851542790607413, + "grad_norm": 0.6539371672140126, + "learning_rate": 2.5e-05, + "loss": 1.5217, + "step": 50 + }, + { + "epoch": 0.005821851348728896, + "grad_norm": 0.6806376549433046, + "learning_rate": 3e-05, + "loss": 1.4412, + "step": 60 + }, + { + "epoch": 0.0067921599068503785, + "grad_norm": 0.9109066482466388, + "learning_rate": 3.5e-05, + "loss": 1.2303, + "step": 70 + }, + { + "epoch": 0.007762468464971861, + "grad_norm": 0.8941485474193254, + "learning_rate": 4e-05, + "loss": 1.1185, + "step": 80 + }, + { + "epoch": 0.008732777023093344, + "grad_norm": 0.6564948122239989, + "learning_rate": 4.5e-05, + "loss": 1.0481, + "step": 90 + }, + { + "epoch": 0.009703085581214826, + "grad_norm": 0.7749322437088195, + "learning_rate": 5e-05, + "loss": 0.9965, + "step": 100 + }, + { + "epoch": 0.010673394139336309, + "grad_norm": 0.7305812400337627, + "learning_rate": 4.998377571549095e-05, + "loss": 1.0689, + "step": 110 + }, + { + "epoch": 0.011643702697457792, + "grad_norm": 0.7320600025098228, + "learning_rate": 4.9967551430981895e-05, + "loss": 0.9554, + "step": 120 + }, + { + "epoch": 0.012614011255579274, + "grad_norm": 0.7997476828820285, + "learning_rate": 4.9951327146472845e-05, + "loss": 0.9048, + "step": 130 + }, + { + "epoch": 0.013584319813700757, + "grad_norm": 0.9942301355244718, + "learning_rate": 4.993510286196379e-05, + "loss": 0.9736, + "step": 140 + }, + { + "epoch": 0.01455462837182224, + "grad_norm": 0.8990043753777502, + "learning_rate": 4.991887857745474e-05, + "loss": 0.9447, + "step": 150 + }, + { + "epoch": 0.015524936929943722, + "grad_norm": 0.6931013922842434, + "learning_rate": 4.990265429294568e-05, + "loss": 0.8748, + "step": 160 + }, + { + "epoch": 0.016495245488065205, + "grad_norm": 0.8305007689150241, + "learning_rate": 4.988643000843663e-05, + "loss": 0.9467, + "step": 170 + }, + { + "epoch": 0.017465554046186688, + "grad_norm": 0.8993058386814992, + "learning_rate": 4.9870205723927573e-05, + "loss": 0.9394, + "step": 180 + }, + { + "epoch": 0.01843586260430817, + "grad_norm": 0.7486900307747556, + "learning_rate": 4.985398143941852e-05, + "loss": 0.931, + "step": 190 + }, + { + "epoch": 0.019406171162429653, + "grad_norm": 0.8879788388070488, + "learning_rate": 4.983775715490947e-05, + "loss": 0.9217, + "step": 200 + }, + { + "epoch": 0.020376479720551136, + "grad_norm": 0.7351527409279133, + "learning_rate": 4.9821532870400416e-05, + "loss": 0.8438, + "step": 210 + }, + { + "epoch": 0.021346788278672618, + "grad_norm": 0.9651267349255982, + "learning_rate": 4.9805308585891366e-05, + "loss": 0.8806, + "step": 220 + }, + { + "epoch": 0.0223170968367941, + "grad_norm": 0.8358141761944308, + "learning_rate": 4.978908430138231e-05, + "loss": 0.9075, + "step": 230 + }, + { + "epoch": 0.023287405394915583, + "grad_norm": 0.9068073418448239, + "learning_rate": 4.977286001687326e-05, + "loss": 0.9126, + "step": 240 + }, + { + "epoch": 0.024257713953037066, + "grad_norm": 0.8510875679665427, + "learning_rate": 4.97566357323642e-05, + "loss": 0.8509, + "step": 250 + }, + { + "epoch": 0.02522802251115855, + "grad_norm": 0.8371392324883873, + "learning_rate": 4.974041144785515e-05, + "loss": 0.9018, + "step": 260 + }, + { + "epoch": 0.02619833106928003, + "grad_norm": 0.7343155911324605, + "learning_rate": 4.9724187163346094e-05, + "loss": 0.9251, + "step": 270 + }, + { + "epoch": 0.027168639627401514, + "grad_norm": 1.012390858760769, + "learning_rate": 4.9707962878837044e-05, + "loss": 0.8881, + "step": 280 + }, + { + "epoch": 0.028138948185522997, + "grad_norm": 0.8092967868905977, + "learning_rate": 4.9691738594327994e-05, + "loss": 0.8831, + "step": 290 + }, + { + "epoch": 0.02910925674364448, + "grad_norm": 0.9545292755123069, + "learning_rate": 4.967551430981894e-05, + "loss": 0.8871, + "step": 300 + }, + { + "epoch": 0.030079565301765962, + "grad_norm": 0.8451115258584492, + "learning_rate": 4.9659290025309887e-05, + "loss": 0.8913, + "step": 310 + }, + { + "epoch": 0.031049873859887445, + "grad_norm": 0.984138459636415, + "learning_rate": 4.964306574080083e-05, + "loss": 0.9046, + "step": 320 + }, + { + "epoch": 0.03202018241800893, + "grad_norm": 0.9275621393016203, + "learning_rate": 4.962684145629178e-05, + "loss": 0.88, + "step": 330 + }, + { + "epoch": 0.03299049097613041, + "grad_norm": 1.0192818586577115, + "learning_rate": 4.961061717178272e-05, + "loss": 0.8923, + "step": 340 + }, + { + "epoch": 0.03396079953425189, + "grad_norm": 0.9779533854801943, + "learning_rate": 4.959439288727367e-05, + "loss": 0.868, + "step": 350 + }, + { + "epoch": 0.034931108092373375, + "grad_norm": 0.8554421373526296, + "learning_rate": 4.957816860276462e-05, + "loss": 0.8661, + "step": 360 + }, + { + "epoch": 0.03590141665049486, + "grad_norm": 0.7404878078324254, + "learning_rate": 4.9561944318255565e-05, + "loss": 0.8883, + "step": 370 + }, + { + "epoch": 0.03687172520861634, + "grad_norm": 0.9879847832445883, + "learning_rate": 4.9545720033746515e-05, + "loss": 0.8156, + "step": 380 + }, + { + "epoch": 0.03784203376673782, + "grad_norm": 1.0209320862188216, + "learning_rate": 4.952949574923746e-05, + "loss": 0.8212, + "step": 390 + }, + { + "epoch": 0.038812342324859306, + "grad_norm": 1.0823734215432599, + "learning_rate": 4.951327146472841e-05, + "loss": 0.8619, + "step": 400 + }, + { + "epoch": 0.03978265088298079, + "grad_norm": 1.033686091190584, + "learning_rate": 4.949704718021935e-05, + "loss": 0.818, + "step": 410 + }, + { + "epoch": 0.04075295944110227, + "grad_norm": 0.9587642487246072, + "learning_rate": 4.94808228957103e-05, + "loss": 0.8377, + "step": 420 + }, + { + "epoch": 0.041723267999223754, + "grad_norm": 0.8072742668289667, + "learning_rate": 4.946459861120124e-05, + "loss": 0.8043, + "step": 430 + }, + { + "epoch": 0.042693576557345236, + "grad_norm": 1.0090353092499478, + "learning_rate": 4.944837432669219e-05, + "loss": 0.8829, + "step": 440 + }, + { + "epoch": 0.04366388511546672, + "grad_norm": 1.177538600328968, + "learning_rate": 4.943215004218314e-05, + "loss": 0.859, + "step": 450 + }, + { + "epoch": 0.0446341936735882, + "grad_norm": 1.225063127801835, + "learning_rate": 4.9415925757674086e-05, + "loss": 0.8231, + "step": 460 + }, + { + "epoch": 0.045604502231709684, + "grad_norm": 0.8224632705866025, + "learning_rate": 4.9399701473165035e-05, + "loss": 0.8335, + "step": 470 + }, + { + "epoch": 0.04657481078983117, + "grad_norm": 1.153934818941284, + "learning_rate": 4.938347718865598e-05, + "loss": 0.8046, + "step": 480 + }, + { + "epoch": 0.04754511934795265, + "grad_norm": 1.0954567838682858, + "learning_rate": 4.936725290414693e-05, + "loss": 0.8975, + "step": 490 + }, + { + "epoch": 0.04851542790607413, + "grad_norm": 1.2466392758405571, + "learning_rate": 4.935102861963788e-05, + "loss": 0.8361, + "step": 500 + }, + { + "epoch": 0.049485736464195615, + "grad_norm": 0.902356430744448, + "learning_rate": 4.933480433512883e-05, + "loss": 0.8048, + "step": 510 + }, + { + "epoch": 0.0504560450223171, + "grad_norm": 1.01027561407788, + "learning_rate": 4.931858005061977e-05, + "loss": 0.8094, + "step": 520 + }, + { + "epoch": 0.05142635358043858, + "grad_norm": 1.1330239054084805, + "learning_rate": 4.930235576611072e-05, + "loss": 0.8288, + "step": 530 + }, + { + "epoch": 0.05239666213856006, + "grad_norm": 1.19235957479236, + "learning_rate": 4.9286131481601664e-05, + "loss": 0.8358, + "step": 540 + }, + { + "epoch": 0.053366970696681545, + "grad_norm": 1.070882879754836, + "learning_rate": 4.926990719709261e-05, + "loss": 0.783, + "step": 550 + }, + { + "epoch": 0.05433727925480303, + "grad_norm": 0.772829083325615, + "learning_rate": 4.925368291258356e-05, + "loss": 0.7888, + "step": 560 + }, + { + "epoch": 0.05530758781292451, + "grad_norm": 1.0485603174488358, + "learning_rate": 4.9237458628074506e-05, + "loss": 0.8893, + "step": 570 + }, + { + "epoch": 0.05627789637104599, + "grad_norm": 1.1365438528959075, + "learning_rate": 4.9221234343565456e-05, + "loss": 0.8533, + "step": 580 + }, + { + "epoch": 0.057248204929167476, + "grad_norm": 1.1791401079390134, + "learning_rate": 4.92050100590564e-05, + "loss": 0.8208, + "step": 590 + }, + { + "epoch": 0.05821851348728896, + "grad_norm": 1.165054586063552, + "learning_rate": 4.918878577454735e-05, + "loss": 0.7945, + "step": 600 + }, + { + "epoch": 0.05918882204541044, + "grad_norm": 1.0001362775159148, + "learning_rate": 4.917256149003829e-05, + "loss": 0.8426, + "step": 610 + }, + { + "epoch": 0.060159130603531924, + "grad_norm": 1.1955538077863535, + "learning_rate": 4.915633720552924e-05, + "loss": 0.8346, + "step": 620 + }, + { + "epoch": 0.06112943916165341, + "grad_norm": 1.016839980469479, + "learning_rate": 4.9140112921020184e-05, + "loss": 0.8224, + "step": 630 + }, + { + "epoch": 0.06209974771977489, + "grad_norm": 0.9782695489269584, + "learning_rate": 4.9123888636511134e-05, + "loss": 0.849, + "step": 640 + }, + { + "epoch": 0.06307005627789637, + "grad_norm": 1.0768176346339298, + "learning_rate": 4.9107664352002084e-05, + "loss": 0.8519, + "step": 650 + }, + { + "epoch": 0.06404036483601785, + "grad_norm": 1.0636061081219466, + "learning_rate": 4.909144006749303e-05, + "loss": 0.7887, + "step": 660 + }, + { + "epoch": 0.06501067339413934, + "grad_norm": 1.0734895912783853, + "learning_rate": 4.907521578298398e-05, + "loss": 0.8415, + "step": 670 + }, + { + "epoch": 0.06598098195226082, + "grad_norm": 1.0796531470403106, + "learning_rate": 4.905899149847492e-05, + "loss": 0.7512, + "step": 680 + }, + { + "epoch": 0.0669512905103823, + "grad_norm": 0.9361767973637918, + "learning_rate": 4.904276721396587e-05, + "loss": 0.8482, + "step": 690 + }, + { + "epoch": 0.06792159906850379, + "grad_norm": 1.5287022172498188, + "learning_rate": 4.902654292945681e-05, + "loss": 0.8389, + "step": 700 + }, + { + "epoch": 0.06889190762662527, + "grad_norm": 1.129894676066631, + "learning_rate": 4.901031864494776e-05, + "loss": 0.7629, + "step": 710 + }, + { + "epoch": 0.06986221618474675, + "grad_norm": 1.060923634010241, + "learning_rate": 4.8994094360438705e-05, + "loss": 0.8134, + "step": 720 + }, + { + "epoch": 0.07083252474286823, + "grad_norm": 1.121507522572716, + "learning_rate": 4.8977870075929655e-05, + "loss": 0.8295, + "step": 730 + }, + { + "epoch": 0.07180283330098972, + "grad_norm": 1.2375518463265478, + "learning_rate": 4.8961645791420605e-05, + "loss": 0.8006, + "step": 740 + }, + { + "epoch": 0.0727731418591112, + "grad_norm": 1.1957590053535605, + "learning_rate": 4.894542150691155e-05, + "loss": 0.8185, + "step": 750 + }, + { + "epoch": 0.07374345041723268, + "grad_norm": 1.1525450079069435, + "learning_rate": 4.89291972224025e-05, + "loss": 0.8186, + "step": 760 + }, + { + "epoch": 0.07471375897535416, + "grad_norm": 1.2637346291101144, + "learning_rate": 4.891297293789344e-05, + "loss": 0.8094, + "step": 770 + }, + { + "epoch": 0.07568406753347565, + "grad_norm": 1.1118603684784645, + "learning_rate": 4.889674865338439e-05, + "loss": 0.8075, + "step": 780 + }, + { + "epoch": 0.07665437609159713, + "grad_norm": 0.987279065085845, + "learning_rate": 4.888052436887533e-05, + "loss": 0.8088, + "step": 790 + }, + { + "epoch": 0.07762468464971861, + "grad_norm": 1.050050445685608, + "learning_rate": 4.886430008436628e-05, + "loss": 0.8283, + "step": 800 + }, + { + "epoch": 0.0785949932078401, + "grad_norm": 1.1064553828881234, + "learning_rate": 4.8848075799857226e-05, + "loss": 0.7906, + "step": 810 + }, + { + "epoch": 0.07956530176596158, + "grad_norm": 1.0679283923210974, + "learning_rate": 4.8831851515348176e-05, + "loss": 0.7888, + "step": 820 + }, + { + "epoch": 0.08053561032408306, + "grad_norm": 1.325000406776113, + "learning_rate": 4.8815627230839126e-05, + "loss": 0.8573, + "step": 830 + }, + { + "epoch": 0.08150591888220454, + "grad_norm": 1.1430990025005974, + "learning_rate": 4.879940294633007e-05, + "loss": 0.823, + "step": 840 + }, + { + "epoch": 0.08247622744032602, + "grad_norm": 0.9708573314011439, + "learning_rate": 4.878317866182102e-05, + "loss": 0.7812, + "step": 850 + }, + { + "epoch": 0.08344653599844751, + "grad_norm": 1.040516439704035, + "learning_rate": 4.876695437731196e-05, + "loss": 0.8043, + "step": 860 + }, + { + "epoch": 0.08441684455656899, + "grad_norm": 1.4624875951419556, + "learning_rate": 4.875073009280291e-05, + "loss": 0.8054, + "step": 870 + }, + { + "epoch": 0.08538715311469047, + "grad_norm": 1.2973550173157966, + "learning_rate": 4.8734505808293854e-05, + "loss": 0.7693, + "step": 880 + }, + { + "epoch": 0.08635746167281196, + "grad_norm": 1.21645485433679, + "learning_rate": 4.8718281523784804e-05, + "loss": 0.7402, + "step": 890 + }, + { + "epoch": 0.08732777023093344, + "grad_norm": 1.2989905455220712, + "learning_rate": 4.870205723927575e-05, + "loss": 0.7603, + "step": 900 + }, + { + "epoch": 0.08829807878905492, + "grad_norm": 1.1647056182819275, + "learning_rate": 4.86858329547667e-05, + "loss": 0.7576, + "step": 910 + }, + { + "epoch": 0.0892683873471764, + "grad_norm": 1.32821951027441, + "learning_rate": 4.8669608670257646e-05, + "loss": 0.7593, + "step": 920 + }, + { + "epoch": 0.09023869590529789, + "grad_norm": 0.9792995911846096, + "learning_rate": 4.865338438574859e-05, + "loss": 0.7726, + "step": 930 + }, + { + "epoch": 0.09120900446341937, + "grad_norm": 1.1716259071546666, + "learning_rate": 4.863716010123954e-05, + "loss": 0.7585, + "step": 940 + }, + { + "epoch": 0.09217931302154085, + "grad_norm": 1.4206264005961533, + "learning_rate": 4.862093581673048e-05, + "loss": 0.7922, + "step": 950 + }, + { + "epoch": 0.09314962157966233, + "grad_norm": 0.9969780924285648, + "learning_rate": 4.860471153222143e-05, + "loss": 0.8338, + "step": 960 + }, + { + "epoch": 0.09411993013778382, + "grad_norm": 1.2259249945683814, + "learning_rate": 4.8588487247712375e-05, + "loss": 0.7848, + "step": 970 + }, + { + "epoch": 0.0950902386959053, + "grad_norm": 1.304284918297249, + "learning_rate": 4.8572262963203325e-05, + "loss": 0.799, + "step": 980 + }, + { + "epoch": 0.09606054725402678, + "grad_norm": 1.382632512351389, + "learning_rate": 4.8556038678694274e-05, + "loss": 0.7358, + "step": 990 + }, + { + "epoch": 0.09703085581214826, + "grad_norm": 1.1047797334502243, + "learning_rate": 4.853981439418522e-05, + "loss": 0.7175, + "step": 1000 + }, + { + "epoch": 0.09800116437026975, + "grad_norm": 1.298146739590951, + "learning_rate": 4.852359010967617e-05, + "loss": 0.7694, + "step": 1010 + }, + { + "epoch": 0.09897147292839123, + "grad_norm": 1.3449159574026, + "learning_rate": 4.850736582516711e-05, + "loss": 0.7549, + "step": 1020 + }, + { + "epoch": 0.09994178148651271, + "grad_norm": 1.0510958795717098, + "learning_rate": 4.849114154065806e-05, + "loss": 0.7851, + "step": 1030 + }, + { + "epoch": 0.1009120900446342, + "grad_norm": 1.2932499353997113, + "learning_rate": 4.8474917256149e-05, + "loss": 0.7948, + "step": 1040 + }, + { + "epoch": 0.10188239860275568, + "grad_norm": 1.024772482994231, + "learning_rate": 4.845869297163995e-05, + "loss": 0.7551, + "step": 1050 + }, + { + "epoch": 0.10285270716087716, + "grad_norm": 0.9151226850367016, + "learning_rate": 4.8442468687130896e-05, + "loss": 0.8212, + "step": 1060 + }, + { + "epoch": 0.10382301571899864, + "grad_norm": 1.0356064137085648, + "learning_rate": 4.8426244402621846e-05, + "loss": 0.7815, + "step": 1070 + }, + { + "epoch": 0.10479332427712013, + "grad_norm": 1.0454220890712578, + "learning_rate": 4.8410020118112795e-05, + "loss": 0.7813, + "step": 1080 + }, + { + "epoch": 0.10576363283524161, + "grad_norm": 1.0245673186100301, + "learning_rate": 4.839379583360374e-05, + "loss": 0.7759, + "step": 1090 + }, + { + "epoch": 0.10673394139336309, + "grad_norm": 1.1805883290044246, + "learning_rate": 4.837757154909469e-05, + "loss": 0.8016, + "step": 1100 + }, + { + "epoch": 0.10770424995148457, + "grad_norm": 1.305171444801399, + "learning_rate": 4.836134726458563e-05, + "loss": 0.7603, + "step": 1110 + }, + { + "epoch": 0.10867455850960606, + "grad_norm": 1.1334982322569604, + "learning_rate": 4.834512298007658e-05, + "loss": 0.7818, + "step": 1120 + }, + { + "epoch": 0.10964486706772754, + "grad_norm": 1.4897469886835581, + "learning_rate": 4.8328898695567524e-05, + "loss": 0.7391, + "step": 1130 + }, + { + "epoch": 0.11061517562584902, + "grad_norm": 1.12299562877673, + "learning_rate": 4.8312674411058474e-05, + "loss": 0.8129, + "step": 1140 + }, + { + "epoch": 0.1115854841839705, + "grad_norm": 1.2581064725802191, + "learning_rate": 4.8296450126549417e-05, + "loss": 0.7132, + "step": 1150 + }, + { + "epoch": 0.11255579274209199, + "grad_norm": 1.1117033056933057, + "learning_rate": 4.8280225842040366e-05, + "loss": 0.7247, + "step": 1160 + }, + { + "epoch": 0.11352610130021347, + "grad_norm": 1.5072697151276053, + "learning_rate": 4.8264001557531316e-05, + "loss": 0.7832, + "step": 1170 + }, + { + "epoch": 0.11449640985833495, + "grad_norm": 1.3629201153146466, + "learning_rate": 4.824777727302226e-05, + "loss": 0.7894, + "step": 1180 + }, + { + "epoch": 0.11546671841645643, + "grad_norm": 1.3456355263382838, + "learning_rate": 4.823155298851321e-05, + "loss": 0.8145, + "step": 1190 + }, + { + "epoch": 0.11643702697457792, + "grad_norm": 1.3206439343547949, + "learning_rate": 4.821532870400415e-05, + "loss": 0.7802, + "step": 1200 + }, + { + "epoch": 0.1174073355326994, + "grad_norm": 1.0980976013487813, + "learning_rate": 4.81991044194951e-05, + "loss": 0.801, + "step": 1210 + }, + { + "epoch": 0.11837764409082088, + "grad_norm": 1.1370443655089153, + "learning_rate": 4.8182880134986045e-05, + "loss": 0.8006, + "step": 1220 + }, + { + "epoch": 0.11934795264894237, + "grad_norm": 1.4354316240483984, + "learning_rate": 4.8166655850476994e-05, + "loss": 0.8131, + "step": 1230 + }, + { + "epoch": 0.12031826120706385, + "grad_norm": 0.9112243461310537, + "learning_rate": 4.815043156596794e-05, + "loss": 0.8145, + "step": 1240 + }, + { + "epoch": 0.12128856976518533, + "grad_norm": 0.8213091259360801, + "learning_rate": 4.813420728145889e-05, + "loss": 0.7114, + "step": 1250 + }, + { + "epoch": 0.12225887832330681, + "grad_norm": 1.3405078182982422, + "learning_rate": 4.811798299694984e-05, + "loss": 0.7986, + "step": 1260 + }, + { + "epoch": 0.1232291868814283, + "grad_norm": 1.4540711324279514, + "learning_rate": 4.810175871244078e-05, + "loss": 0.7281, + "step": 1270 + }, + { + "epoch": 0.12419949543954978, + "grad_norm": 1.0040222075537582, + "learning_rate": 4.8085534427931737e-05, + "loss": 0.8048, + "step": 1280 + }, + { + "epoch": 0.12516980399767125, + "grad_norm": 1.9264161003194282, + "learning_rate": 4.806931014342268e-05, + "loss": 0.7512, + "step": 1290 + }, + { + "epoch": 0.12614011255579274, + "grad_norm": 1.155430871835739, + "learning_rate": 4.805308585891363e-05, + "loss": 0.7909, + "step": 1300 + }, + { + "epoch": 0.1271104211139142, + "grad_norm": 1.382153914724162, + "learning_rate": 4.803686157440457e-05, + "loss": 0.7613, + "step": 1310 + }, + { + "epoch": 0.1280807296720357, + "grad_norm": 1.1079273142955168, + "learning_rate": 4.802063728989552e-05, + "loss": 0.7747, + "step": 1320 + }, + { + "epoch": 0.12905103823015718, + "grad_norm": 1.4797892743755068, + "learning_rate": 4.8004413005386465e-05, + "loss": 0.7448, + "step": 1330 + }, + { + "epoch": 0.13002134678827867, + "grad_norm": 1.2840858833683126, + "learning_rate": 4.7988188720877415e-05, + "loss": 0.7567, + "step": 1340 + }, + { + "epoch": 0.13099165534640014, + "grad_norm": 1.7212676971427285, + "learning_rate": 4.797196443636836e-05, + "loss": 0.7743, + "step": 1350 + }, + { + "epoch": 0.13196196390452164, + "grad_norm": 1.7283159526025742, + "learning_rate": 4.795574015185931e-05, + "loss": 0.7696, + "step": 1360 + }, + { + "epoch": 0.1329322724626431, + "grad_norm": 1.0739621496825589, + "learning_rate": 4.793951586735026e-05, + "loss": 0.7408, + "step": 1370 + }, + { + "epoch": 0.1339025810207646, + "grad_norm": 1.183199109006057, + "learning_rate": 4.79232915828412e-05, + "loss": 0.7935, + "step": 1380 + }, + { + "epoch": 0.13487288957888607, + "grad_norm": 1.3981562096537532, + "learning_rate": 4.790706729833215e-05, + "loss": 0.7113, + "step": 1390 + }, + { + "epoch": 0.13584319813700757, + "grad_norm": 1.192516841319882, + "learning_rate": 4.789084301382309e-05, + "loss": 0.7786, + "step": 1400 + }, + { + "epoch": 0.13681350669512904, + "grad_norm": 1.9269898255758637, + "learning_rate": 4.787461872931404e-05, + "loss": 0.7262, + "step": 1410 + }, + { + "epoch": 0.13778381525325054, + "grad_norm": 1.1218056549529871, + "learning_rate": 4.7858394444804986e-05, + "loss": 0.6762, + "step": 1420 + }, + { + "epoch": 0.138754123811372, + "grad_norm": 1.3635607352483248, + "learning_rate": 4.7842170160295936e-05, + "loss": 0.7733, + "step": 1430 + }, + { + "epoch": 0.1397244323694935, + "grad_norm": 1.2380674478503626, + "learning_rate": 4.782594587578688e-05, + "loss": 0.7599, + "step": 1440 + }, + { + "epoch": 0.14069474092761497, + "grad_norm": 1.4869366640536255, + "learning_rate": 4.780972159127783e-05, + "loss": 0.7408, + "step": 1450 + }, + { + "epoch": 0.14166504948573647, + "grad_norm": 1.413887084722376, + "learning_rate": 4.779349730676878e-05, + "loss": 0.7792, + "step": 1460 + }, + { + "epoch": 0.14263535804385793, + "grad_norm": 1.1522475316568597, + "learning_rate": 4.777727302225972e-05, + "loss": 0.7494, + "step": 1470 + }, + { + "epoch": 0.14360566660197943, + "grad_norm": 1.5458892686752617, + "learning_rate": 4.776104873775067e-05, + "loss": 0.7151, + "step": 1480 + }, + { + "epoch": 0.1445759751601009, + "grad_norm": 1.8892150029961168, + "learning_rate": 4.7744824453241614e-05, + "loss": 0.7025, + "step": 1490 + }, + { + "epoch": 0.1455462837182224, + "grad_norm": 1.5206314987834826, + "learning_rate": 4.7728600168732564e-05, + "loss": 0.7748, + "step": 1500 + }, + { + "epoch": 0.14651659227634387, + "grad_norm": 1.2234736643633124, + "learning_rate": 4.771237588422351e-05, + "loss": 0.7321, + "step": 1510 + }, + { + "epoch": 0.14748690083446536, + "grad_norm": 1.3582944219822406, + "learning_rate": 4.7696151599714456e-05, + "loss": 0.7524, + "step": 1520 + }, + { + "epoch": 0.14845720939258683, + "grad_norm": 1.3505468356868415, + "learning_rate": 4.76799273152054e-05, + "loss": 0.8032, + "step": 1530 + }, + { + "epoch": 0.14942751795070833, + "grad_norm": 1.161565772908295, + "learning_rate": 4.766370303069635e-05, + "loss": 0.7449, + "step": 1540 + }, + { + "epoch": 0.1503978265088298, + "grad_norm": 1.5018575379494306, + "learning_rate": 4.76474787461873e-05, + "loss": 0.779, + "step": 1550 + }, + { + "epoch": 0.1513681350669513, + "grad_norm": 1.2744144878497465, + "learning_rate": 4.763125446167824e-05, + "loss": 0.7396, + "step": 1560 + }, + { + "epoch": 0.15233844362507276, + "grad_norm": 1.2326350258332728, + "learning_rate": 4.761503017716919e-05, + "loss": 0.7249, + "step": 1570 + }, + { + "epoch": 0.15330875218319426, + "grad_norm": 1.2982918445496714, + "learning_rate": 4.7598805892660135e-05, + "loss": 0.7453, + "step": 1580 + }, + { + "epoch": 0.15427906074131573, + "grad_norm": 1.4792372825641715, + "learning_rate": 4.7582581608151085e-05, + "loss": 0.7573, + "step": 1590 + }, + { + "epoch": 0.15524936929943722, + "grad_norm": 1.2935035055138504, + "learning_rate": 4.756635732364203e-05, + "loss": 0.7125, + "step": 1600 + }, + { + "epoch": 0.1562196778575587, + "grad_norm": 1.2527694639896991, + "learning_rate": 4.755013303913298e-05, + "loss": 0.7406, + "step": 1610 + }, + { + "epoch": 0.1571899864156802, + "grad_norm": 1.0724672098454868, + "learning_rate": 4.753390875462392e-05, + "loss": 0.7108, + "step": 1620 + }, + { + "epoch": 0.15816029497380166, + "grad_norm": 1.1696947872465324, + "learning_rate": 4.751768447011487e-05, + "loss": 0.7636, + "step": 1630 + }, + { + "epoch": 0.15913060353192315, + "grad_norm": 1.216857763890884, + "learning_rate": 4.750146018560582e-05, + "loss": 0.7916, + "step": 1640 + }, + { + "epoch": 0.16010091209004462, + "grad_norm": 1.1371281502973842, + "learning_rate": 4.748523590109676e-05, + "loss": 0.7663, + "step": 1650 + }, + { + "epoch": 0.16107122064816612, + "grad_norm": 1.2599433230373354, + "learning_rate": 4.746901161658771e-05, + "loss": 0.7279, + "step": 1660 + }, + { + "epoch": 0.1620415292062876, + "grad_norm": 1.4580521921521419, + "learning_rate": 4.7452787332078656e-05, + "loss": 0.7241, + "step": 1670 + }, + { + "epoch": 0.16301183776440908, + "grad_norm": 1.2227652156436173, + "learning_rate": 4.7436563047569605e-05, + "loss": 0.7082, + "step": 1680 + }, + { + "epoch": 0.16398214632253055, + "grad_norm": 1.3578429437561153, + "learning_rate": 4.742033876306055e-05, + "loss": 0.6972, + "step": 1690 + }, + { + "epoch": 0.16495245488065205, + "grad_norm": 1.2809095001474842, + "learning_rate": 4.74041144785515e-05, + "loss": 0.7267, + "step": 1700 + }, + { + "epoch": 0.16592276343877352, + "grad_norm": 1.1017366555347645, + "learning_rate": 4.738789019404245e-05, + "loss": 0.7243, + "step": 1710 + }, + { + "epoch": 0.16689307199689501, + "grad_norm": 1.2852463688715783, + "learning_rate": 4.737166590953339e-05, + "loss": 0.7276, + "step": 1720 + }, + { + "epoch": 0.16786338055501648, + "grad_norm": 1.3099425645156408, + "learning_rate": 4.735544162502434e-05, + "loss": 0.7583, + "step": 1730 + }, + { + "epoch": 0.16883368911313798, + "grad_norm": 1.5220953005112245, + "learning_rate": 4.7339217340515284e-05, + "loss": 0.7651, + "step": 1740 + }, + { + "epoch": 0.16980399767125945, + "grad_norm": 1.3251019142596, + "learning_rate": 4.7322993056006233e-05, + "loss": 0.7428, + "step": 1750 + }, + { + "epoch": 0.17077430622938095, + "grad_norm": 1.3275994643711895, + "learning_rate": 4.7306768771497176e-05, + "loss": 0.7552, + "step": 1760 + }, + { + "epoch": 0.17174461478750241, + "grad_norm": 1.1363294732621385, + "learning_rate": 4.7290544486988126e-05, + "loss": 0.7441, + "step": 1770 + }, + { + "epoch": 0.1727149233456239, + "grad_norm": 1.6604556501118164, + "learning_rate": 4.727432020247907e-05, + "loss": 0.7404, + "step": 1780 + }, + { + "epoch": 0.17368523190374538, + "grad_norm": 1.4971063178125654, + "learning_rate": 4.725809591797002e-05, + "loss": 0.707, + "step": 1790 + }, + { + "epoch": 0.17465554046186688, + "grad_norm": 1.296038406932857, + "learning_rate": 4.724187163346097e-05, + "loss": 0.7226, + "step": 1800 + }, + { + "epoch": 0.17562584901998834, + "grad_norm": 0.9900558600646284, + "learning_rate": 4.722564734895191e-05, + "loss": 0.7107, + "step": 1810 + }, + { + "epoch": 0.17659615757810984, + "grad_norm": 0.9589095684989648, + "learning_rate": 4.720942306444286e-05, + "loss": 0.6789, + "step": 1820 + }, + { + "epoch": 0.1775664661362313, + "grad_norm": 1.300487413115222, + "learning_rate": 4.7193198779933805e-05, + "loss": 0.7656, + "step": 1830 + }, + { + "epoch": 0.1785367746943528, + "grad_norm": 1.4678054839136885, + "learning_rate": 4.7176974495424754e-05, + "loss": 0.7559, + "step": 1840 + }, + { + "epoch": 0.17950708325247428, + "grad_norm": 1.1487384302132937, + "learning_rate": 4.71607502109157e-05, + "loss": 0.7218, + "step": 1850 + }, + { + "epoch": 0.18047739181059577, + "grad_norm": 1.5013237178369594, + "learning_rate": 4.714452592640665e-05, + "loss": 0.6618, + "step": 1860 + }, + { + "epoch": 0.18144770036871724, + "grad_norm": 1.3544109774626758, + "learning_rate": 4.712830164189759e-05, + "loss": 0.7348, + "step": 1870 + }, + { + "epoch": 0.18241800892683874, + "grad_norm": 1.7098997535129123, + "learning_rate": 4.711207735738854e-05, + "loss": 0.6721, + "step": 1880 + }, + { + "epoch": 0.1833883174849602, + "grad_norm": 1.439742442692076, + "learning_rate": 4.709585307287949e-05, + "loss": 0.6823, + "step": 1890 + }, + { + "epoch": 0.1843586260430817, + "grad_norm": 1.1240799236385792, + "learning_rate": 4.707962878837043e-05, + "loss": 0.7337, + "step": 1900 + }, + { + "epoch": 0.18532893460120317, + "grad_norm": 1.611378134043144, + "learning_rate": 4.706340450386138e-05, + "loss": 0.7348, + "step": 1910 + }, + { + "epoch": 0.18629924315932467, + "grad_norm": 1.15451740477432, + "learning_rate": 4.7047180219352325e-05, + "loss": 0.6945, + "step": 1920 + }, + { + "epoch": 0.18726955171744614, + "grad_norm": 1.5743486470192276, + "learning_rate": 4.7030955934843275e-05, + "loss": 0.8062, + "step": 1930 + }, + { + "epoch": 0.18823986027556763, + "grad_norm": 1.0248040084556684, + "learning_rate": 4.701473165033422e-05, + "loss": 0.7155, + "step": 1940 + }, + { + "epoch": 0.1892101688336891, + "grad_norm": 1.0310414805904136, + "learning_rate": 4.699850736582517e-05, + "loss": 0.6776, + "step": 1950 + }, + { + "epoch": 0.1901804773918106, + "grad_norm": 1.0785156193356158, + "learning_rate": 4.698228308131611e-05, + "loss": 0.7092, + "step": 1960 + }, + { + "epoch": 0.19115078594993207, + "grad_norm": 1.4378941602091937, + "learning_rate": 4.696605879680706e-05, + "loss": 0.7286, + "step": 1970 + }, + { + "epoch": 0.19212109450805356, + "grad_norm": 1.1595135486156671, + "learning_rate": 4.694983451229801e-05, + "loss": 0.7479, + "step": 1980 + }, + { + "epoch": 0.19309140306617503, + "grad_norm": 1.69348514333408, + "learning_rate": 4.6933610227788953e-05, + "loss": 0.7263, + "step": 1990 + }, + { + "epoch": 0.19406171162429653, + "grad_norm": 1.502953634133657, + "learning_rate": 4.69173859432799e-05, + "loss": 0.7655, + "step": 2000 + }, + { + "epoch": 0.19406171162429653, + "eval_loss": 0.7587813138961792, + "eval_runtime": 2477.8973, + "eval_samples_per_second": 0.723, + "eval_steps_per_second": 0.362, + "step": 2000 + }, + { + "epoch": 0.195032020182418, + "grad_norm": 1.4070334559445785, + "learning_rate": 4.6901161658770846e-05, + "loss": 0.7475, + "step": 2010 + }, + { + "epoch": 0.1960023287405395, + "grad_norm": 0.8673128363877267, + "learning_rate": 4.6884937374261796e-05, + "loss": 0.6747, + "step": 2020 + }, + { + "epoch": 0.19697263729866096, + "grad_norm": 1.428849127809278, + "learning_rate": 4.686871308975274e-05, + "loss": 0.7666, + "step": 2030 + }, + { + "epoch": 0.19794294585678246, + "grad_norm": 1.2749540514563555, + "learning_rate": 4.685248880524369e-05, + "loss": 0.7346, + "step": 2040 + }, + { + "epoch": 0.19891325441490393, + "grad_norm": 1.2400883720583105, + "learning_rate": 4.683626452073464e-05, + "loss": 0.7012, + "step": 2050 + }, + { + "epoch": 0.19988356297302542, + "grad_norm": 1.4035260703119048, + "learning_rate": 4.682004023622559e-05, + "loss": 0.6697, + "step": 2060 + }, + { + "epoch": 0.2008538715311469, + "grad_norm": 1.9046063428505313, + "learning_rate": 4.680381595171653e-05, + "loss": 0.7126, + "step": 2070 + }, + { + "epoch": 0.2018241800892684, + "grad_norm": 1.3281602578294986, + "learning_rate": 4.678759166720748e-05, + "loss": 0.6895, + "step": 2080 + }, + { + "epoch": 0.20279448864738986, + "grad_norm": 1.271644663424638, + "learning_rate": 4.677136738269843e-05, + "loss": 0.7354, + "step": 2090 + }, + { + "epoch": 0.20376479720551136, + "grad_norm": 1.2206696245686643, + "learning_rate": 4.6755143098189374e-05, + "loss": 0.7252, + "step": 2100 + }, + { + "epoch": 0.20473510576363282, + "grad_norm": 1.3032653564716208, + "learning_rate": 4.6738918813680324e-05, + "loss": 0.683, + "step": 2110 + }, + { + "epoch": 0.20570541432175432, + "grad_norm": 1.3212954807490243, + "learning_rate": 4.6722694529171267e-05, + "loss": 0.737, + "step": 2120 + }, + { + "epoch": 0.2066757228798758, + "grad_norm": 1.2586128849417775, + "learning_rate": 4.6706470244662216e-05, + "loss": 0.6841, + "step": 2130 + }, + { + "epoch": 0.20764603143799729, + "grad_norm": 1.3491929764497603, + "learning_rate": 4.669024596015316e-05, + "loss": 0.7637, + "step": 2140 + }, + { + "epoch": 0.20861633999611875, + "grad_norm": 1.5081978528458062, + "learning_rate": 4.667402167564411e-05, + "loss": 0.713, + "step": 2150 + }, + { + "epoch": 0.20958664855424025, + "grad_norm": 1.5788893940918114, + "learning_rate": 4.665779739113505e-05, + "loss": 0.6337, + "step": 2160 + }, + { + "epoch": 0.21055695711236172, + "grad_norm": 1.422646038947752, + "learning_rate": 4.6641573106626e-05, + "loss": 0.6653, + "step": 2170 + }, + { + "epoch": 0.21152726567048322, + "grad_norm": 1.3417475771584986, + "learning_rate": 4.662534882211695e-05, + "loss": 0.7246, + "step": 2180 + }, + { + "epoch": 0.21249757422860469, + "grad_norm": 1.0925391942237144, + "learning_rate": 4.6609124537607895e-05, + "loss": 0.7228, + "step": 2190 + }, + { + "epoch": 0.21346788278672618, + "grad_norm": 1.8433218311262853, + "learning_rate": 4.6592900253098844e-05, + "loss": 0.7507, + "step": 2200 + }, + { + "epoch": 0.21443819134484765, + "grad_norm": 1.792431274692439, + "learning_rate": 4.657667596858979e-05, + "loss": 0.6949, + "step": 2210 + }, + { + "epoch": 0.21540849990296915, + "grad_norm": 1.3809251069016177, + "learning_rate": 4.656045168408074e-05, + "loss": 0.7252, + "step": 2220 + }, + { + "epoch": 0.21637880846109062, + "grad_norm": 1.3641658734062512, + "learning_rate": 4.654422739957168e-05, + "loss": 0.7518, + "step": 2230 + }, + { + "epoch": 0.2173491170192121, + "grad_norm": 1.452315608698053, + "learning_rate": 4.652800311506263e-05, + "loss": 0.6684, + "step": 2240 + }, + { + "epoch": 0.21831942557733358, + "grad_norm": 1.1444356316783801, + "learning_rate": 4.651177883055357e-05, + "loss": 0.6714, + "step": 2250 + }, + { + "epoch": 0.21928973413545508, + "grad_norm": 1.2977358748240138, + "learning_rate": 4.649555454604452e-05, + "loss": 0.7129, + "step": 2260 + }, + { + "epoch": 0.22026004269357655, + "grad_norm": 1.328329389241565, + "learning_rate": 4.647933026153547e-05, + "loss": 0.6292, + "step": 2270 + }, + { + "epoch": 0.22123035125169804, + "grad_norm": 1.2461104595186587, + "learning_rate": 4.6463105977026415e-05, + "loss": 0.7372, + "step": 2280 + }, + { + "epoch": 0.2222006598098195, + "grad_norm": 1.3008437677725404, + "learning_rate": 4.6446881692517365e-05, + "loss": 0.6503, + "step": 2290 + }, + { + "epoch": 0.223170968367941, + "grad_norm": 1.3630765232741517, + "learning_rate": 4.643065740800831e-05, + "loss": 0.657, + "step": 2300 + }, + { + "epoch": 0.22414127692606248, + "grad_norm": 0.9600325037717949, + "learning_rate": 4.641443312349926e-05, + "loss": 0.7117, + "step": 2310 + }, + { + "epoch": 0.22511158548418397, + "grad_norm": 1.733755330685857, + "learning_rate": 4.63982088389902e-05, + "loss": 0.7594, + "step": 2320 + }, + { + "epoch": 0.22608189404230544, + "grad_norm": 1.722193618002385, + "learning_rate": 4.638198455448115e-05, + "loss": 0.6555, + "step": 2330 + }, + { + "epoch": 0.22705220260042694, + "grad_norm": 1.2647254702280388, + "learning_rate": 4.63657602699721e-05, + "loss": 0.6996, + "step": 2340 + }, + { + "epoch": 0.2280225111585484, + "grad_norm": 1.2801225623311008, + "learning_rate": 4.6349535985463043e-05, + "loss": 0.7038, + "step": 2350 + }, + { + "epoch": 0.2289928197166699, + "grad_norm": 1.214420395864136, + "learning_rate": 4.633331170095399e-05, + "loss": 0.6617, + "step": 2360 + }, + { + "epoch": 0.22996312827479137, + "grad_norm": 1.1970250608654163, + "learning_rate": 4.6317087416444936e-05, + "loss": 0.7505, + "step": 2370 + }, + { + "epoch": 0.23093343683291287, + "grad_norm": 1.318541272085749, + "learning_rate": 4.6300863131935886e-05, + "loss": 0.703, + "step": 2380 + }, + { + "epoch": 0.23190374539103434, + "grad_norm": 1.0733620278703964, + "learning_rate": 4.628463884742683e-05, + "loss": 0.7076, + "step": 2390 + }, + { + "epoch": 0.23287405394915583, + "grad_norm": 1.320066160024292, + "learning_rate": 4.626841456291778e-05, + "loss": 0.6845, + "step": 2400 + }, + { + "epoch": 0.2338443625072773, + "grad_norm": 1.3916335472579557, + "learning_rate": 4.625219027840872e-05, + "loss": 0.7213, + "step": 2410 + }, + { + "epoch": 0.2348146710653988, + "grad_norm": 1.6506684734262902, + "learning_rate": 4.623596599389967e-05, + "loss": 0.6995, + "step": 2420 + }, + { + "epoch": 0.23578497962352027, + "grad_norm": 0.8976988021024955, + "learning_rate": 4.621974170939062e-05, + "loss": 0.6994, + "step": 2430 + }, + { + "epoch": 0.23675528818164177, + "grad_norm": 1.7783475330175254, + "learning_rate": 4.6203517424881564e-05, + "loss": 0.7475, + "step": 2440 + }, + { + "epoch": 0.23772559673976323, + "grad_norm": 1.3329956259541478, + "learning_rate": 4.6187293140372514e-05, + "loss": 0.6991, + "step": 2450 + }, + { + "epoch": 0.23869590529788473, + "grad_norm": 1.3282617232306233, + "learning_rate": 4.617106885586346e-05, + "loss": 0.7328, + "step": 2460 + }, + { + "epoch": 0.2396662138560062, + "grad_norm": 1.1394548446385124, + "learning_rate": 4.615484457135441e-05, + "loss": 0.7092, + "step": 2470 + }, + { + "epoch": 0.2406365224141277, + "grad_norm": 1.3304671581253036, + "learning_rate": 4.613862028684535e-05, + "loss": 0.6765, + "step": 2480 + }, + { + "epoch": 0.24160683097224916, + "grad_norm": 1.5514738877871737, + "learning_rate": 4.61223960023363e-05, + "loss": 0.6558, + "step": 2490 + }, + { + "epoch": 0.24257713953037066, + "grad_norm": 1.3668799500092241, + "learning_rate": 4.610617171782724e-05, + "loss": 0.7106, + "step": 2500 + }, + { + "epoch": 0.24257713953037066, + "eval_loss": 0.7454198598861694, + "eval_runtime": 2468.3109, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 2500 + }, + { + "epoch": 0.24354744808849213, + "grad_norm": 1.27051542074557, + "learning_rate": 4.608994743331819e-05, + "loss": 0.7037, + "step": 2510 + }, + { + "epoch": 0.24451775664661363, + "grad_norm": 1.4333699367072212, + "learning_rate": 4.607372314880914e-05, + "loss": 0.7054, + "step": 2520 + }, + { + "epoch": 0.2454880652047351, + "grad_norm": 1.3383562796673616, + "learning_rate": 4.6057498864300085e-05, + "loss": 0.7674, + "step": 2530 + }, + { + "epoch": 0.2464583737628566, + "grad_norm": 1.5405408532446832, + "learning_rate": 4.6041274579791035e-05, + "loss": 0.7541, + "step": 2540 + }, + { + "epoch": 0.24742868232097806, + "grad_norm": 1.4592658544748531, + "learning_rate": 4.602505029528198e-05, + "loss": 0.7286, + "step": 2550 + }, + { + "epoch": 0.24839899087909956, + "grad_norm": 1.4060719508768202, + "learning_rate": 4.600882601077293e-05, + "loss": 0.7722, + "step": 2560 + }, + { + "epoch": 0.24936929943722103, + "grad_norm": 1.2688352607364943, + "learning_rate": 4.599260172626387e-05, + "loss": 0.6851, + "step": 2570 + }, + { + "epoch": 0.2503396079953425, + "grad_norm": 1.1798367463314652, + "learning_rate": 4.597637744175482e-05, + "loss": 0.6897, + "step": 2580 + }, + { + "epoch": 0.251309916553464, + "grad_norm": 1.4210256244438406, + "learning_rate": 4.5960153157245763e-05, + "loss": 0.7005, + "step": 2590 + }, + { + "epoch": 0.2522802251115855, + "grad_norm": 1.3304574917634664, + "learning_rate": 4.594392887273671e-05, + "loss": 0.6878, + "step": 2600 + }, + { + "epoch": 0.25325053366970696, + "grad_norm": 1.4281359203156077, + "learning_rate": 4.592770458822766e-05, + "loss": 0.7519, + "step": 2610 + }, + { + "epoch": 0.2542208422278284, + "grad_norm": 1.351420571440429, + "learning_rate": 4.5911480303718606e-05, + "loss": 0.6939, + "step": 2620 + }, + { + "epoch": 0.25519115078594995, + "grad_norm": 1.3618970623647955, + "learning_rate": 4.5895256019209556e-05, + "loss": 0.669, + "step": 2630 + }, + { + "epoch": 0.2561614593440714, + "grad_norm": 1.1008259683717303, + "learning_rate": 4.58790317347005e-05, + "loss": 0.6331, + "step": 2640 + }, + { + "epoch": 0.2571317679021929, + "grad_norm": 1.690823489066079, + "learning_rate": 4.586280745019145e-05, + "loss": 0.6571, + "step": 2650 + }, + { + "epoch": 0.25810207646031436, + "grad_norm": 1.2328191346656623, + "learning_rate": 4.584658316568239e-05, + "loss": 0.6712, + "step": 2660 + }, + { + "epoch": 0.2590723850184359, + "grad_norm": 1.1997509506925832, + "learning_rate": 4.583035888117334e-05, + "loss": 0.6998, + "step": 2670 + }, + { + "epoch": 0.26004269357655735, + "grad_norm": 1.3726212075390893, + "learning_rate": 4.5814134596664284e-05, + "loss": 0.6577, + "step": 2680 + }, + { + "epoch": 0.2610130021346788, + "grad_norm": 1.4778299478224584, + "learning_rate": 4.5797910312155234e-05, + "loss": 0.7213, + "step": 2690 + }, + { + "epoch": 0.2619833106928003, + "grad_norm": 1.2065482843241282, + "learning_rate": 4.5781686027646184e-05, + "loss": 0.6504, + "step": 2700 + }, + { + "epoch": 0.2629536192509218, + "grad_norm": 1.6950271620933635, + "learning_rate": 4.576546174313713e-05, + "loss": 0.6938, + "step": 2710 + }, + { + "epoch": 0.2639239278090433, + "grad_norm": 1.049429219350124, + "learning_rate": 4.5749237458628077e-05, + "loss": 0.7235, + "step": 2720 + }, + { + "epoch": 0.26489423636716475, + "grad_norm": 1.2856525880832654, + "learning_rate": 4.573301317411902e-05, + "loss": 0.769, + "step": 2730 + }, + { + "epoch": 0.2658645449252862, + "grad_norm": 1.1413973985217811, + "learning_rate": 4.571678888960997e-05, + "loss": 0.6542, + "step": 2740 + }, + { + "epoch": 0.26683485348340774, + "grad_norm": 1.2963652895204112, + "learning_rate": 4.570056460510091e-05, + "loss": 0.6988, + "step": 2750 + }, + { + "epoch": 0.2678051620415292, + "grad_norm": 1.200192916715494, + "learning_rate": 4.568434032059186e-05, + "loss": 0.662, + "step": 2760 + }, + { + "epoch": 0.2687754705996507, + "grad_norm": 1.2418786623251215, + "learning_rate": 4.566811603608281e-05, + "loss": 0.6615, + "step": 2770 + }, + { + "epoch": 0.26974577915777215, + "grad_norm": 1.2920018198618357, + "learning_rate": 4.5651891751573755e-05, + "loss": 0.683, + "step": 2780 + }, + { + "epoch": 0.27071608771589367, + "grad_norm": 1.721771869410589, + "learning_rate": 4.5635667467064705e-05, + "loss": 0.6742, + "step": 2790 + }, + { + "epoch": 0.27168639627401514, + "grad_norm": 1.595428896858632, + "learning_rate": 4.561944318255565e-05, + "loss": 0.6961, + "step": 2800 + }, + { + "epoch": 0.2726567048321366, + "grad_norm": 1.5674989639835977, + "learning_rate": 4.56032188980466e-05, + "loss": 0.6246, + "step": 2810 + }, + { + "epoch": 0.2736270133902581, + "grad_norm": 1.333761320008167, + "learning_rate": 4.558699461353754e-05, + "loss": 0.7142, + "step": 2820 + }, + { + "epoch": 0.2745973219483796, + "grad_norm": 1.175942959739195, + "learning_rate": 4.557077032902849e-05, + "loss": 0.7335, + "step": 2830 + }, + { + "epoch": 0.27556763050650107, + "grad_norm": 1.0433896859463463, + "learning_rate": 4.555454604451944e-05, + "loss": 0.7523, + "step": 2840 + }, + { + "epoch": 0.27653793906462254, + "grad_norm": 1.4484346047275096, + "learning_rate": 4.553832176001039e-05, + "loss": 0.6576, + "step": 2850 + }, + { + "epoch": 0.277508247622744, + "grad_norm": 1.922226387784083, + "learning_rate": 4.552209747550133e-05, + "loss": 0.6797, + "step": 2860 + }, + { + "epoch": 0.27847855618086553, + "grad_norm": 1.4433287579337053, + "learning_rate": 4.550587319099228e-05, + "loss": 0.6787, + "step": 2870 + }, + { + "epoch": 0.279448864738987, + "grad_norm": 1.4997180997318538, + "learning_rate": 4.5489648906483225e-05, + "loss": 0.7091, + "step": 2880 + }, + { + "epoch": 0.28041917329710847, + "grad_norm": 1.347204680869145, + "learning_rate": 4.5473424621974175e-05, + "loss": 0.725, + "step": 2890 + }, + { + "epoch": 0.28138948185522994, + "grad_norm": 1.3046706448190106, + "learning_rate": 4.5457200337465125e-05, + "loss": 0.6669, + "step": 2900 + }, + { + "epoch": 0.28235979041335146, + "grad_norm": 1.1467739204887912, + "learning_rate": 4.544097605295607e-05, + "loss": 0.756, + "step": 2910 + }, + { + "epoch": 0.28333009897147293, + "grad_norm": 1.162771827035537, + "learning_rate": 4.542475176844702e-05, + "loss": 0.7147, + "step": 2920 + }, + { + "epoch": 0.2843004075295944, + "grad_norm": 1.6563795609214405, + "learning_rate": 4.540852748393796e-05, + "loss": 0.6845, + "step": 2930 + }, + { + "epoch": 0.28527071608771587, + "grad_norm": 1.6728193237645246, + "learning_rate": 4.539230319942891e-05, + "loss": 0.673, + "step": 2940 + }, + { + "epoch": 0.2862410246458374, + "grad_norm": 1.194612705131068, + "learning_rate": 4.5376078914919854e-05, + "loss": 0.6942, + "step": 2950 + }, + { + "epoch": 0.28721133320395886, + "grad_norm": 1.439958876835649, + "learning_rate": 4.53598546304108e-05, + "loss": 0.6809, + "step": 2960 + }, + { + "epoch": 0.28818164176208033, + "grad_norm": 1.5873335965261735, + "learning_rate": 4.534363034590175e-05, + "loss": 0.6877, + "step": 2970 + }, + { + "epoch": 0.2891519503202018, + "grad_norm": 1.0997513483559662, + "learning_rate": 4.5327406061392696e-05, + "loss": 0.665, + "step": 2980 + }, + { + "epoch": 0.2901222588783233, + "grad_norm": 1.3952744081716912, + "learning_rate": 4.5311181776883646e-05, + "loss": 0.7524, + "step": 2990 + }, + { + "epoch": 0.2910925674364448, + "grad_norm": 1.1613111987177211, + "learning_rate": 4.529495749237459e-05, + "loss": 0.6659, + "step": 3000 + }, + { + "epoch": 0.2910925674364448, + "eval_loss": 0.7351760268211365, + "eval_runtime": 2466.7648, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 3000 + }, + { + "epoch": 0.29206287599456626, + "grad_norm": 1.404418707672653, + "learning_rate": 4.527873320786554e-05, + "loss": 0.6635, + "step": 3010 + }, + { + "epoch": 0.29303318455268773, + "grad_norm": 1.7333713648105467, + "learning_rate": 4.526250892335648e-05, + "loss": 0.6918, + "step": 3020 + }, + { + "epoch": 0.29400349311080926, + "grad_norm": 1.325947337933012, + "learning_rate": 4.524628463884743e-05, + "loss": 0.708, + "step": 3030 + }, + { + "epoch": 0.2949738016689307, + "grad_norm": 1.420774851884038, + "learning_rate": 4.5230060354338374e-05, + "loss": 0.7173, + "step": 3040 + }, + { + "epoch": 0.2959441102270522, + "grad_norm": 1.1753762610267322, + "learning_rate": 4.5213836069829324e-05, + "loss": 0.6654, + "step": 3050 + }, + { + "epoch": 0.29691441878517366, + "grad_norm": 1.526767780243654, + "learning_rate": 4.5197611785320274e-05, + "loss": 0.6321, + "step": 3060 + }, + { + "epoch": 0.2978847273432952, + "grad_norm": 1.8280158534376985, + "learning_rate": 4.518138750081122e-05, + "loss": 0.6647, + "step": 3070 + }, + { + "epoch": 0.29885503590141665, + "grad_norm": 1.4181228966831005, + "learning_rate": 4.516516321630217e-05, + "loss": 0.7338, + "step": 3080 + }, + { + "epoch": 0.2998253444595381, + "grad_norm": 1.7077549961994072, + "learning_rate": 4.514893893179311e-05, + "loss": 0.6272, + "step": 3090 + }, + { + "epoch": 0.3007956530176596, + "grad_norm": 1.518857601222952, + "learning_rate": 4.513271464728406e-05, + "loss": 0.6184, + "step": 3100 + }, + { + "epoch": 0.3017659615757811, + "grad_norm": 1.5688964933772704, + "learning_rate": 4.5116490362775e-05, + "loss": 0.6904, + "step": 3110 + }, + { + "epoch": 0.3027362701339026, + "grad_norm": 1.346013000056015, + "learning_rate": 4.510026607826595e-05, + "loss": 0.6591, + "step": 3120 + }, + { + "epoch": 0.30370657869202405, + "grad_norm": 1.7289926750048026, + "learning_rate": 4.5084041793756895e-05, + "loss": 0.602, + "step": 3130 + }, + { + "epoch": 0.3046768872501455, + "grad_norm": 1.634884511910698, + "learning_rate": 4.5067817509247845e-05, + "loss": 0.6344, + "step": 3140 + }, + { + "epoch": 0.30564719580826705, + "grad_norm": 1.2072013322253554, + "learning_rate": 4.5051593224738795e-05, + "loss": 0.6441, + "step": 3150 + }, + { + "epoch": 0.3066175043663885, + "grad_norm": 1.4877523069726029, + "learning_rate": 4.503536894022974e-05, + "loss": 0.6711, + "step": 3160 + }, + { + "epoch": 0.30758781292451, + "grad_norm": 1.3820572558751547, + "learning_rate": 4.501914465572069e-05, + "loss": 0.7005, + "step": 3170 + }, + { + "epoch": 0.30855812148263145, + "grad_norm": 1.4545570501775118, + "learning_rate": 4.500292037121163e-05, + "loss": 0.6796, + "step": 3180 + }, + { + "epoch": 0.309528430040753, + "grad_norm": 1.4415846087886384, + "learning_rate": 4.498669608670258e-05, + "loss": 0.6753, + "step": 3190 + }, + { + "epoch": 0.31049873859887445, + "grad_norm": 1.253477394104618, + "learning_rate": 4.497047180219352e-05, + "loss": 0.745, + "step": 3200 + }, + { + "epoch": 0.3114690471569959, + "grad_norm": 1.6938495814472803, + "learning_rate": 4.495424751768447e-05, + "loss": 0.6656, + "step": 3210 + }, + { + "epoch": 0.3124393557151174, + "grad_norm": 1.7041386856543572, + "learning_rate": 4.4938023233175416e-05, + "loss": 0.678, + "step": 3220 + }, + { + "epoch": 0.3134096642732389, + "grad_norm": 1.945535890749437, + "learning_rate": 4.4921798948666366e-05, + "loss": 0.6741, + "step": 3230 + }, + { + "epoch": 0.3143799728313604, + "grad_norm": 1.504845552046309, + "learning_rate": 4.4905574664157316e-05, + "loss": 0.7055, + "step": 3240 + }, + { + "epoch": 0.31535028138948185, + "grad_norm": 1.5218102037928898, + "learning_rate": 4.488935037964826e-05, + "loss": 0.6664, + "step": 3250 + }, + { + "epoch": 0.3163205899476033, + "grad_norm": 1.281958434229701, + "learning_rate": 4.487312609513921e-05, + "loss": 0.689, + "step": 3260 + }, + { + "epoch": 0.31729089850572484, + "grad_norm": 1.4017870385811553, + "learning_rate": 4.485690181063015e-05, + "loss": 0.656, + "step": 3270 + }, + { + "epoch": 0.3182612070638463, + "grad_norm": 1.1312164184452325, + "learning_rate": 4.48406775261211e-05, + "loss": 0.6956, + "step": 3280 + }, + { + "epoch": 0.3192315156219678, + "grad_norm": 1.5553483087810633, + "learning_rate": 4.4824453241612044e-05, + "loss": 0.627, + "step": 3290 + }, + { + "epoch": 0.32020182418008925, + "grad_norm": 1.5414421667820604, + "learning_rate": 4.4808228957102994e-05, + "loss": 0.6567, + "step": 3300 + }, + { + "epoch": 0.32117213273821077, + "grad_norm": 1.3882171728699897, + "learning_rate": 4.479200467259394e-05, + "loss": 0.6478, + "step": 3310 + }, + { + "epoch": 0.32214244129633224, + "grad_norm": 1.2870563704453035, + "learning_rate": 4.477578038808489e-05, + "loss": 0.6258, + "step": 3320 + }, + { + "epoch": 0.3231127498544537, + "grad_norm": 1.2176939882656843, + "learning_rate": 4.4759556103575836e-05, + "loss": 0.6831, + "step": 3330 + }, + { + "epoch": 0.3240830584125752, + "grad_norm": 1.5859704946797195, + "learning_rate": 4.474333181906678e-05, + "loss": 0.6875, + "step": 3340 + }, + { + "epoch": 0.3250533669706967, + "grad_norm": 1.3048860685179964, + "learning_rate": 4.472710753455773e-05, + "loss": 0.6797, + "step": 3350 + }, + { + "epoch": 0.32602367552881817, + "grad_norm": 1.4244729025826581, + "learning_rate": 4.471088325004867e-05, + "loss": 0.6671, + "step": 3360 + }, + { + "epoch": 0.32699398408693964, + "grad_norm": 1.664993378309651, + "learning_rate": 4.469465896553962e-05, + "loss": 0.6946, + "step": 3370 + }, + { + "epoch": 0.3279642926450611, + "grad_norm": 1.4374586076882605, + "learning_rate": 4.4678434681030565e-05, + "loss": 0.7152, + "step": 3380 + }, + { + "epoch": 0.32893460120318263, + "grad_norm": 1.183729655148727, + "learning_rate": 4.4662210396521515e-05, + "loss": 0.6695, + "step": 3390 + }, + { + "epoch": 0.3299049097613041, + "grad_norm": 1.3106066103858482, + "learning_rate": 4.4645986112012464e-05, + "loss": 0.681, + "step": 3400 + }, + { + "epoch": 0.33087521831942557, + "grad_norm": 1.9834642707734547, + "learning_rate": 4.462976182750341e-05, + "loss": 0.6544, + "step": 3410 + }, + { + "epoch": 0.33184552687754704, + "grad_norm": 1.2898353030549436, + "learning_rate": 4.461353754299436e-05, + "loss": 0.6516, + "step": 3420 + }, + { + "epoch": 0.33281583543566856, + "grad_norm": 1.3210516235415775, + "learning_rate": 4.45973132584853e-05, + "loss": 0.6684, + "step": 3430 + }, + { + "epoch": 0.33378614399379003, + "grad_norm": 1.4442553376708276, + "learning_rate": 4.458108897397625e-05, + "loss": 0.6673, + "step": 3440 + }, + { + "epoch": 0.3347564525519115, + "grad_norm": 1.1500209299804536, + "learning_rate": 4.456486468946719e-05, + "loss": 0.7158, + "step": 3450 + }, + { + "epoch": 0.33572676111003297, + "grad_norm": 1.31538694061074, + "learning_rate": 4.454864040495814e-05, + "loss": 0.647, + "step": 3460 + }, + { + "epoch": 0.3366970696681545, + "grad_norm": 1.4081971729681526, + "learning_rate": 4.4532416120449086e-05, + "loss": 0.6964, + "step": 3470 + }, + { + "epoch": 0.33766737822627596, + "grad_norm": 1.4794888005976117, + "learning_rate": 4.4516191835940036e-05, + "loss": 0.6644, + "step": 3480 + }, + { + "epoch": 0.33863768678439743, + "grad_norm": 1.3524205715043236, + "learning_rate": 4.4499967551430985e-05, + "loss": 0.6421, + "step": 3490 + }, + { + "epoch": 0.3396079953425189, + "grad_norm": 1.5105858350763188, + "learning_rate": 4.448374326692193e-05, + "loss": 0.7105, + "step": 3500 + }, + { + "epoch": 0.3396079953425189, + "eval_loss": 0.7246462106704712, + "eval_runtime": 2471.0932, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.363, + "step": 3500 + }, + { + "epoch": 0.3405783039006404, + "grad_norm": 1.4583137688880468, + "learning_rate": 4.446751898241288e-05, + "loss": 0.6087, + "step": 3510 + }, + { + "epoch": 0.3415486124587619, + "grad_norm": 1.4130179418528255, + "learning_rate": 4.445129469790382e-05, + "loss": 0.6628, + "step": 3520 + }, + { + "epoch": 0.34251892101688336, + "grad_norm": 1.3225389415201874, + "learning_rate": 4.443507041339477e-05, + "loss": 0.625, + "step": 3530 + }, + { + "epoch": 0.34348922957500483, + "grad_norm": 1.8587881703474012, + "learning_rate": 4.4418846128885714e-05, + "loss": 0.6416, + "step": 3540 + }, + { + "epoch": 0.34445953813312635, + "grad_norm": 1.2742394540775415, + "learning_rate": 4.4402621844376664e-05, + "loss": 0.632, + "step": 3550 + }, + { + "epoch": 0.3454298466912478, + "grad_norm": 1.64437983962982, + "learning_rate": 4.4386397559867607e-05, + "loss": 0.6383, + "step": 3560 + }, + { + "epoch": 0.3464001552493693, + "grad_norm": 1.3231966805125737, + "learning_rate": 4.4370173275358556e-05, + "loss": 0.6841, + "step": 3570 + }, + { + "epoch": 0.34737046380749076, + "grad_norm": 1.167387762307211, + "learning_rate": 4.4353948990849506e-05, + "loss": 0.616, + "step": 3580 + }, + { + "epoch": 0.3483407723656123, + "grad_norm": 1.4336867980215322, + "learning_rate": 4.433772470634045e-05, + "loss": 0.6195, + "step": 3590 + }, + { + "epoch": 0.34931108092373375, + "grad_norm": 1.403759520909343, + "learning_rate": 4.43215004218314e-05, + "loss": 0.6787, + "step": 3600 + }, + { + "epoch": 0.3502813894818552, + "grad_norm": 1.4695150471417817, + "learning_rate": 4.430527613732234e-05, + "loss": 0.6433, + "step": 3610 + }, + { + "epoch": 0.3512516980399767, + "grad_norm": 1.3322730792588582, + "learning_rate": 4.42890518528133e-05, + "loss": 0.6827, + "step": 3620 + }, + { + "epoch": 0.3522220065980982, + "grad_norm": 1.2541946759332971, + "learning_rate": 4.427282756830424e-05, + "loss": 0.6767, + "step": 3630 + }, + { + "epoch": 0.3531923151562197, + "grad_norm": 1.3220529304919946, + "learning_rate": 4.425660328379519e-05, + "loss": 0.6785, + "step": 3640 + }, + { + "epoch": 0.35416262371434115, + "grad_norm": 1.8159199916167459, + "learning_rate": 4.4240378999286134e-05, + "loss": 0.6504, + "step": 3650 + }, + { + "epoch": 0.3551329322724626, + "grad_norm": 1.5513977697025123, + "learning_rate": 4.4224154714777084e-05, + "loss": 0.668, + "step": 3660 + }, + { + "epoch": 0.35610324083058414, + "grad_norm": 1.9383083203795937, + "learning_rate": 4.420793043026803e-05, + "loss": 0.6314, + "step": 3670 + }, + { + "epoch": 0.3570735493887056, + "grad_norm": 1.5611559659864904, + "learning_rate": 4.419170614575898e-05, + "loss": 0.6617, + "step": 3680 + }, + { + "epoch": 0.3580438579468271, + "grad_norm": 1.2579691148448051, + "learning_rate": 4.4175481861249927e-05, + "loss": 0.6641, + "step": 3690 + }, + { + "epoch": 0.35901416650494855, + "grad_norm": 1.8250727183362423, + "learning_rate": 4.415925757674087e-05, + "loss": 0.628, + "step": 3700 + }, + { + "epoch": 0.3599844750630701, + "grad_norm": 1.8663848290613063, + "learning_rate": 4.414303329223182e-05, + "loss": 0.6819, + "step": 3710 + }, + { + "epoch": 0.36095478362119154, + "grad_norm": 0.998691604867035, + "learning_rate": 4.412680900772276e-05, + "loss": 0.6603, + "step": 3720 + }, + { + "epoch": 0.361925092179313, + "grad_norm": 1.5186010030355703, + "learning_rate": 4.411058472321371e-05, + "loss": 0.6608, + "step": 3730 + }, + { + "epoch": 0.3628954007374345, + "grad_norm": 1.3444105870233574, + "learning_rate": 4.4094360438704655e-05, + "loss": 0.6684, + "step": 3740 + }, + { + "epoch": 0.363865709295556, + "grad_norm": 1.8433609494425311, + "learning_rate": 4.4078136154195605e-05, + "loss": 0.7007, + "step": 3750 + }, + { + "epoch": 0.3648360178536775, + "grad_norm": 1.3736456316572478, + "learning_rate": 4.406191186968655e-05, + "loss": 0.6691, + "step": 3760 + }, + { + "epoch": 0.36580632641179894, + "grad_norm": 1.3845972262001316, + "learning_rate": 4.40456875851775e-05, + "loss": 0.699, + "step": 3770 + }, + { + "epoch": 0.3667766349699204, + "grad_norm": 1.3202439744871353, + "learning_rate": 4.402946330066845e-05, + "loss": 0.6463, + "step": 3780 + }, + { + "epoch": 0.36774694352804194, + "grad_norm": 1.527206134833558, + "learning_rate": 4.401323901615939e-05, + "loss": 0.6261, + "step": 3790 + }, + { + "epoch": 0.3687172520861634, + "grad_norm": 1.85165256113485, + "learning_rate": 4.399701473165034e-05, + "loss": 0.6268, + "step": 3800 + }, + { + "epoch": 0.3696875606442849, + "grad_norm": 1.7774545505998887, + "learning_rate": 4.398079044714128e-05, + "loss": 0.674, + "step": 3810 + }, + { + "epoch": 0.37065786920240634, + "grad_norm": 1.1302938960409563, + "learning_rate": 4.396456616263223e-05, + "loss": 0.6814, + "step": 3820 + }, + { + "epoch": 0.37162817776052787, + "grad_norm": 1.5633030889510193, + "learning_rate": 4.3948341878123176e-05, + "loss": 0.6821, + "step": 3830 + }, + { + "epoch": 0.37259848631864934, + "grad_norm": 1.4226623590722947, + "learning_rate": 4.3932117593614126e-05, + "loss": 0.6721, + "step": 3840 + }, + { + "epoch": 0.3735687948767708, + "grad_norm": 1.3788997600836588, + "learning_rate": 4.391589330910507e-05, + "loss": 0.6484, + "step": 3850 + }, + { + "epoch": 0.3745391034348923, + "grad_norm": 1.9387617060516118, + "learning_rate": 4.389966902459602e-05, + "loss": 0.7059, + "step": 3860 + }, + { + "epoch": 0.3755094119930138, + "grad_norm": 1.4159508230128068, + "learning_rate": 4.388344474008697e-05, + "loss": 0.6763, + "step": 3870 + }, + { + "epoch": 0.37647972055113527, + "grad_norm": 1.4395070409428377, + "learning_rate": 4.386722045557791e-05, + "loss": 0.6441, + "step": 3880 + }, + { + "epoch": 0.37745002910925674, + "grad_norm": 1.5259821305785304, + "learning_rate": 4.385099617106886e-05, + "loss": 0.7342, + "step": 3890 + }, + { + "epoch": 0.3784203376673782, + "grad_norm": 1.58232361711394, + "learning_rate": 4.3834771886559804e-05, + "loss": 0.7058, + "step": 3900 + }, + { + "epoch": 0.37939064622549973, + "grad_norm": 1.2758386103028025, + "learning_rate": 4.3818547602050754e-05, + "loss": 0.6116, + "step": 3910 + }, + { + "epoch": 0.3803609547836212, + "grad_norm": 1.354723087565567, + "learning_rate": 4.38023233175417e-05, + "loss": 0.6709, + "step": 3920 + }, + { + "epoch": 0.38133126334174267, + "grad_norm": 1.5790529961766175, + "learning_rate": 4.3786099033032646e-05, + "loss": 0.7028, + "step": 3930 + }, + { + "epoch": 0.38230157189986413, + "grad_norm": 1.9609521386475108, + "learning_rate": 4.376987474852359e-05, + "loss": 0.5906, + "step": 3940 + }, + { + "epoch": 0.38327188045798566, + "grad_norm": 1.3349118770454718, + "learning_rate": 4.375365046401454e-05, + "loss": 0.6623, + "step": 3950 + }, + { + "epoch": 0.3842421890161071, + "grad_norm": 1.2937167531350466, + "learning_rate": 4.373742617950549e-05, + "loss": 0.6369, + "step": 3960 + }, + { + "epoch": 0.3852124975742286, + "grad_norm": 1.3419705920519438, + "learning_rate": 4.372120189499643e-05, + "loss": 0.7277, + "step": 3970 + }, + { + "epoch": 0.38618280613235006, + "grad_norm": 1.7073814830995901, + "learning_rate": 4.370497761048738e-05, + "loss": 0.6241, + "step": 3980 + }, + { + "epoch": 0.3871531146904716, + "grad_norm": 1.373271354057727, + "learning_rate": 4.3688753325978325e-05, + "loss": 0.6481, + "step": 3990 + }, + { + "epoch": 0.38812342324859306, + "grad_norm": 1.7949929197599008, + "learning_rate": 4.3672529041469275e-05, + "loss": 0.6125, + "step": 4000 + }, + { + "epoch": 0.38812342324859306, + "eval_loss": 0.7137264609336853, + "eval_runtime": 2466.144, + "eval_samples_per_second": 0.727, + "eval_steps_per_second": 0.363, + "step": 4000 + }, + { + "epoch": 0.3890937318067145, + "grad_norm": 0.8329579782452979, + "learning_rate": 4.365630475696022e-05, + "loss": 0.6341, + "step": 4010 + }, + { + "epoch": 0.390064040364836, + "grad_norm": 1.2617089929475087, + "learning_rate": 4.364008047245117e-05, + "loss": 0.671, + "step": 4020 + }, + { + "epoch": 0.3910343489229575, + "grad_norm": 1.5577127482655397, + "learning_rate": 4.362385618794212e-05, + "loss": 0.6793, + "step": 4030 + }, + { + "epoch": 0.392004657481079, + "grad_norm": 1.6386238000694935, + "learning_rate": 4.360763190343306e-05, + "loss": 0.7014, + "step": 4040 + }, + { + "epoch": 0.39297496603920046, + "grad_norm": 1.5648938337659175, + "learning_rate": 4.359140761892401e-05, + "loss": 0.6867, + "step": 4050 + }, + { + "epoch": 0.3939452745973219, + "grad_norm": 1.6294675174253543, + "learning_rate": 4.357518333441495e-05, + "loss": 0.6706, + "step": 4060 + }, + { + "epoch": 0.39491558315544345, + "grad_norm": 1.549167727126926, + "learning_rate": 4.35589590499059e-05, + "loss": 0.6536, + "step": 4070 + }, + { + "epoch": 0.3958858917135649, + "grad_norm": 1.4182199112027882, + "learning_rate": 4.3542734765396846e-05, + "loss": 0.679, + "step": 4080 + }, + { + "epoch": 0.3968562002716864, + "grad_norm": 1.2821446945633657, + "learning_rate": 4.3526510480887795e-05, + "loss": 0.6643, + "step": 4090 + }, + { + "epoch": 0.39782650882980786, + "grad_norm": 1.6858376405816184, + "learning_rate": 4.351028619637874e-05, + "loss": 0.6394, + "step": 4100 + }, + { + "epoch": 0.3987968173879294, + "grad_norm": 1.4610732414303427, + "learning_rate": 4.349406191186969e-05, + "loss": 0.7123, + "step": 4110 + }, + { + "epoch": 0.39976712594605085, + "grad_norm": 1.504769761142886, + "learning_rate": 4.347783762736064e-05, + "loss": 0.6071, + "step": 4120 + }, + { + "epoch": 0.4007374345041723, + "grad_norm": 1.6912171907525997, + "learning_rate": 4.346161334285158e-05, + "loss": 0.5923, + "step": 4130 + }, + { + "epoch": 0.4017077430622938, + "grad_norm": 1.5783940240743402, + "learning_rate": 4.344538905834253e-05, + "loss": 0.5902, + "step": 4140 + }, + { + "epoch": 0.4026780516204153, + "grad_norm": 1.300258602942144, + "learning_rate": 4.3429164773833474e-05, + "loss": 0.668, + "step": 4150 + }, + { + "epoch": 0.4036483601785368, + "grad_norm": 1.3879301457682354, + "learning_rate": 4.3412940489324423e-05, + "loss": 0.6578, + "step": 4160 + }, + { + "epoch": 0.40461866873665825, + "grad_norm": 1.0418247331643762, + "learning_rate": 4.3396716204815366e-05, + "loss": 0.6539, + "step": 4170 + }, + { + "epoch": 0.4055889772947797, + "grad_norm": 1.817053798243336, + "learning_rate": 4.3380491920306316e-05, + "loss": 0.6296, + "step": 4180 + }, + { + "epoch": 0.40655928585290124, + "grad_norm": 1.8784845030864437, + "learning_rate": 4.336426763579726e-05, + "loss": 0.6435, + "step": 4190 + }, + { + "epoch": 0.4075295944110227, + "grad_norm": 1.3173177735824406, + "learning_rate": 4.334804335128821e-05, + "loss": 0.6467, + "step": 4200 + }, + { + "epoch": 0.4084999029691442, + "grad_norm": 1.6609920860725593, + "learning_rate": 4.333181906677916e-05, + "loss": 0.6506, + "step": 4210 + }, + { + "epoch": 0.40947021152726565, + "grad_norm": 1.3422418231507067, + "learning_rate": 4.33155947822701e-05, + "loss": 0.6794, + "step": 4220 + }, + { + "epoch": 0.4104405200853872, + "grad_norm": 1.6530218294736676, + "learning_rate": 4.329937049776105e-05, + "loss": 0.6458, + "step": 4230 + }, + { + "epoch": 0.41141082864350864, + "grad_norm": 1.7371351925998049, + "learning_rate": 4.3283146213251995e-05, + "loss": 0.6085, + "step": 4240 + }, + { + "epoch": 0.4123811372016301, + "grad_norm": 1.1986220099049816, + "learning_rate": 4.3266921928742944e-05, + "loss": 0.6763, + "step": 4250 + }, + { + "epoch": 0.4133514457597516, + "grad_norm": 1.4910668696454408, + "learning_rate": 4.325069764423389e-05, + "loss": 0.6943, + "step": 4260 + }, + { + "epoch": 0.4143217543178731, + "grad_norm": 1.605222234480949, + "learning_rate": 4.323447335972484e-05, + "loss": 0.5683, + "step": 4270 + }, + { + "epoch": 0.41529206287599457, + "grad_norm": 1.6517210814862113, + "learning_rate": 4.321824907521578e-05, + "loss": 0.6121, + "step": 4280 + }, + { + "epoch": 0.41626237143411604, + "grad_norm": 1.5780093620130797, + "learning_rate": 4.320202479070673e-05, + "loss": 0.634, + "step": 4290 + }, + { + "epoch": 0.4172326799922375, + "grad_norm": 1.4948523984666717, + "learning_rate": 4.318580050619768e-05, + "loss": 0.6736, + "step": 4300 + }, + { + "epoch": 0.41820298855035903, + "grad_norm": 1.412186726690487, + "learning_rate": 4.316957622168862e-05, + "loss": 0.6645, + "step": 4310 + }, + { + "epoch": 0.4191732971084805, + "grad_norm": 1.3014470286002153, + "learning_rate": 4.315335193717957e-05, + "loss": 0.611, + "step": 4320 + }, + { + "epoch": 0.42014360566660197, + "grad_norm": 1.5892208566989257, + "learning_rate": 4.3137127652670515e-05, + "loss": 0.6545, + "step": 4330 + }, + { + "epoch": 0.42111391422472344, + "grad_norm": 1.912773548887141, + "learning_rate": 4.3120903368161465e-05, + "loss": 0.6454, + "step": 4340 + }, + { + "epoch": 0.42208422278284496, + "grad_norm": 1.7286922382345113, + "learning_rate": 4.310467908365241e-05, + "loss": 0.6591, + "step": 4350 + }, + { + "epoch": 0.42305453134096643, + "grad_norm": 1.8839412216123284, + "learning_rate": 4.308845479914336e-05, + "loss": 0.5875, + "step": 4360 + }, + { + "epoch": 0.4240248398990879, + "grad_norm": 1.433166987175659, + "learning_rate": 4.30722305146343e-05, + "loss": 0.6405, + "step": 4370 + }, + { + "epoch": 0.42499514845720937, + "grad_norm": 1.493313820047122, + "learning_rate": 4.305600623012525e-05, + "loss": 0.6421, + "step": 4380 + }, + { + "epoch": 0.4259654570153309, + "grad_norm": 1.470156638554737, + "learning_rate": 4.30397819456162e-05, + "loss": 0.6212, + "step": 4390 + }, + { + "epoch": 0.42693576557345236, + "grad_norm": 1.801577389486602, + "learning_rate": 4.3023557661107143e-05, + "loss": 0.589, + "step": 4400 + }, + { + "epoch": 0.42790607413157383, + "grad_norm": 1.6174671178388904, + "learning_rate": 4.30073333765981e-05, + "loss": 0.6612, + "step": 4410 + }, + { + "epoch": 0.4288763826896953, + "grad_norm": 1.5169174455759806, + "learning_rate": 4.299110909208904e-05, + "loss": 0.6418, + "step": 4420 + }, + { + "epoch": 0.4298466912478168, + "grad_norm": 1.7159303308076814, + "learning_rate": 4.297488480757999e-05, + "loss": 0.6363, + "step": 4430 + }, + { + "epoch": 0.4308169998059383, + "grad_norm": 1.9158386115366701, + "learning_rate": 4.2958660523070936e-05, + "loss": 0.6639, + "step": 4440 + }, + { + "epoch": 0.43178730836405976, + "grad_norm": 1.6614034382324376, + "learning_rate": 4.2942436238561885e-05, + "loss": 0.6546, + "step": 4450 + }, + { + "epoch": 0.43275761692218123, + "grad_norm": 1.528595189219001, + "learning_rate": 4.292621195405283e-05, + "loss": 0.6151, + "step": 4460 + }, + { + "epoch": 0.43372792548030276, + "grad_norm": 1.297393909600355, + "learning_rate": 4.290998766954378e-05, + "loss": 0.6142, + "step": 4470 + }, + { + "epoch": 0.4346982340384242, + "grad_norm": 1.6025277242190177, + "learning_rate": 4.289376338503472e-05, + "loss": 0.6415, + "step": 4480 + }, + { + "epoch": 0.4356685425965457, + "grad_norm": 1.550877285078, + "learning_rate": 4.287753910052567e-05, + "loss": 0.6896, + "step": 4490 + }, + { + "epoch": 0.43663885115466716, + "grad_norm": 1.8134887382719538, + "learning_rate": 4.286131481601662e-05, + "loss": 0.674, + "step": 4500 + }, + { + "epoch": 0.43663885115466716, + "eval_loss": 0.70569908618927, + "eval_runtime": 2470.7726, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.363, + "step": 4500 + }, + { + "epoch": 0.4376091597127887, + "grad_norm": 1.5715320462855416, + "learning_rate": 4.2845090531507564e-05, + "loss": 0.6107, + "step": 4510 + }, + { + "epoch": 0.43857946827091016, + "grad_norm": 1.659219569116577, + "learning_rate": 4.2828866246998514e-05, + "loss": 0.554, + "step": 4520 + }, + { + "epoch": 0.4395497768290316, + "grad_norm": 1.5014148593583845, + "learning_rate": 4.2812641962489457e-05, + "loss": 0.6541, + "step": 4530 + }, + { + "epoch": 0.4405200853871531, + "grad_norm": 1.2080187388338057, + "learning_rate": 4.2796417677980406e-05, + "loss": 0.6704, + "step": 4540 + }, + { + "epoch": 0.4414903939452746, + "grad_norm": 1.4344070923299042, + "learning_rate": 4.278019339347135e-05, + "loss": 0.637, + "step": 4550 + }, + { + "epoch": 0.4424607025033961, + "grad_norm": 1.9816113300875973, + "learning_rate": 4.27639691089623e-05, + "loss": 0.6036, + "step": 4560 + }, + { + "epoch": 0.44343101106151755, + "grad_norm": 1.270379951275499, + "learning_rate": 4.274774482445324e-05, + "loss": 0.6481, + "step": 4570 + }, + { + "epoch": 0.444401319619639, + "grad_norm": 1.559280795643455, + "learning_rate": 4.273152053994419e-05, + "loss": 0.62, + "step": 4580 + }, + { + "epoch": 0.44537162817776055, + "grad_norm": 1.8377481053949813, + "learning_rate": 4.271529625543514e-05, + "loss": 0.6662, + "step": 4590 + }, + { + "epoch": 0.446341936735882, + "grad_norm": 1.3952819132659193, + "learning_rate": 4.2699071970926085e-05, + "loss": 0.6969, + "step": 4600 + }, + { + "epoch": 0.4473122452940035, + "grad_norm": 1.5967451654603113, + "learning_rate": 4.2682847686417034e-05, + "loss": 0.6604, + "step": 4610 + }, + { + "epoch": 0.44828255385212495, + "grad_norm": 1.5849329398639342, + "learning_rate": 4.266662340190798e-05, + "loss": 0.6589, + "step": 4620 + }, + { + "epoch": 0.4492528624102465, + "grad_norm": 1.7238645892594557, + "learning_rate": 4.265039911739893e-05, + "loss": 0.6658, + "step": 4630 + }, + { + "epoch": 0.45022317096836795, + "grad_norm": 1.7849098949361206, + "learning_rate": 4.263417483288987e-05, + "loss": 0.6507, + "step": 4640 + }, + { + "epoch": 0.4511934795264894, + "grad_norm": 1.3539873989398281, + "learning_rate": 4.261795054838082e-05, + "loss": 0.631, + "step": 4650 + }, + { + "epoch": 0.4521637880846109, + "grad_norm": 1.6177606598072791, + "learning_rate": 4.260172626387176e-05, + "loss": 0.7667, + "step": 4660 + }, + { + "epoch": 0.4531340966427324, + "grad_norm": 1.7683798129102917, + "learning_rate": 4.258550197936271e-05, + "loss": 0.6182, + "step": 4670 + }, + { + "epoch": 0.4541044052008539, + "grad_norm": 1.5536790285936453, + "learning_rate": 4.256927769485366e-05, + "loss": 0.6515, + "step": 4680 + }, + { + "epoch": 0.45507471375897535, + "grad_norm": 1.4626963492189242, + "learning_rate": 4.2553053410344605e-05, + "loss": 0.5595, + "step": 4690 + }, + { + "epoch": 0.4560450223170968, + "grad_norm": 1.8455989589758681, + "learning_rate": 4.2536829125835555e-05, + "loss": 0.6382, + "step": 4700 + }, + { + "epoch": 0.45701533087521834, + "grad_norm": 1.8260482347716946, + "learning_rate": 4.25206048413265e-05, + "loss": 0.6708, + "step": 4710 + }, + { + "epoch": 0.4579856394333398, + "grad_norm": 1.506588655076192, + "learning_rate": 4.250438055681745e-05, + "loss": 0.6138, + "step": 4720 + }, + { + "epoch": 0.4589559479914613, + "grad_norm": 1.7415606928937182, + "learning_rate": 4.248815627230839e-05, + "loss": 0.6688, + "step": 4730 + }, + { + "epoch": 0.45992625654958275, + "grad_norm": 1.2860177177533143, + "learning_rate": 4.247193198779934e-05, + "loss": 0.6496, + "step": 4740 + }, + { + "epoch": 0.46089656510770427, + "grad_norm": 1.419953340190783, + "learning_rate": 4.245570770329029e-05, + "loss": 0.6399, + "step": 4750 + }, + { + "epoch": 0.46186687366582574, + "grad_norm": 1.4197804283366926, + "learning_rate": 4.2439483418781234e-05, + "loss": 0.572, + "step": 4760 + }, + { + "epoch": 0.4628371822239472, + "grad_norm": 1.1761100117238914, + "learning_rate": 4.242325913427218e-05, + "loss": 0.6099, + "step": 4770 + }, + { + "epoch": 0.4638074907820687, + "grad_norm": 1.5767612487613212, + "learning_rate": 4.2407034849763126e-05, + "loss": 0.6223, + "step": 4780 + }, + { + "epoch": 0.4647777993401902, + "grad_norm": 1.8050008733247063, + "learning_rate": 4.2390810565254076e-05, + "loss": 0.5695, + "step": 4790 + }, + { + "epoch": 0.46574810789831167, + "grad_norm": 1.3088703288484584, + "learning_rate": 4.237458628074502e-05, + "loss": 0.5723, + "step": 4800 + }, + { + "epoch": 0.46671841645643314, + "grad_norm": 1.7711140961973422, + "learning_rate": 4.235836199623597e-05, + "loss": 0.6173, + "step": 4810 + }, + { + "epoch": 0.4676887250145546, + "grad_norm": 1.6431929005817145, + "learning_rate": 4.234213771172691e-05, + "loss": 0.5982, + "step": 4820 + }, + { + "epoch": 0.46865903357267613, + "grad_norm": 1.557431489902951, + "learning_rate": 4.232591342721786e-05, + "loss": 0.6098, + "step": 4830 + }, + { + "epoch": 0.4696293421307976, + "grad_norm": 1.4479656995240782, + "learning_rate": 4.230968914270881e-05, + "loss": 0.5699, + "step": 4840 + }, + { + "epoch": 0.47059965068891907, + "grad_norm": 1.6046344258439647, + "learning_rate": 4.2293464858199754e-05, + "loss": 0.6243, + "step": 4850 + }, + { + "epoch": 0.47156995924704054, + "grad_norm": 1.252218532607539, + "learning_rate": 4.2277240573690704e-05, + "loss": 0.5787, + "step": 4860 + }, + { + "epoch": 0.47254026780516206, + "grad_norm": 1.4372595855894126, + "learning_rate": 4.226101628918165e-05, + "loss": 0.6483, + "step": 4870 + }, + { + "epoch": 0.47351057636328353, + "grad_norm": 1.6112811676393963, + "learning_rate": 4.22447920046726e-05, + "loss": 0.6042, + "step": 4880 + }, + { + "epoch": 0.474480884921405, + "grad_norm": 1.6387669351547591, + "learning_rate": 4.222856772016354e-05, + "loss": 0.6176, + "step": 4890 + }, + { + "epoch": 0.47545119347952647, + "grad_norm": 1.4893296879883764, + "learning_rate": 4.221234343565449e-05, + "loss": 0.5976, + "step": 4900 + }, + { + "epoch": 0.476421502037648, + "grad_norm": 1.663980815080282, + "learning_rate": 4.219611915114543e-05, + "loss": 0.6655, + "step": 4910 + }, + { + "epoch": 0.47739181059576946, + "grad_norm": 1.3387695933594599, + "learning_rate": 4.217989486663638e-05, + "loss": 0.6166, + "step": 4920 + }, + { + "epoch": 0.47836211915389093, + "grad_norm": 1.7319521884893254, + "learning_rate": 4.216367058212733e-05, + "loss": 0.6225, + "step": 4930 + }, + { + "epoch": 0.4793324277120124, + "grad_norm": 1.4563922038982844, + "learning_rate": 4.2147446297618275e-05, + "loss": 0.6351, + "step": 4940 + }, + { + "epoch": 0.4803027362701339, + "grad_norm": 1.7473358339447822, + "learning_rate": 4.2131222013109225e-05, + "loss": 0.6205, + "step": 4950 + }, + { + "epoch": 0.4812730448282554, + "grad_norm": 1.4006136660964967, + "learning_rate": 4.211499772860017e-05, + "loss": 0.6992, + "step": 4960 + }, + { + "epoch": 0.48224335338637686, + "grad_norm": 1.7446969484471915, + "learning_rate": 4.209877344409112e-05, + "loss": 0.6251, + "step": 4970 + }, + { + "epoch": 0.48321366194449833, + "grad_norm": 1.3789648220512414, + "learning_rate": 4.208254915958206e-05, + "loss": 0.5856, + "step": 4980 + }, + { + "epoch": 0.48418397050261985, + "grad_norm": 1.77919007957107, + "learning_rate": 4.206632487507301e-05, + "loss": 0.621, + "step": 4990 + }, + { + "epoch": 0.4851542790607413, + "grad_norm": 1.5926084629766706, + "learning_rate": 4.2050100590563953e-05, + "loss": 0.6542, + "step": 5000 + }, + { + "epoch": 0.4851542790607413, + "eval_loss": 0.6975318789482117, + "eval_runtime": 2469.0533, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 5000 + }, + { + "epoch": 0.4861245876188628, + "grad_norm": 1.7733999980186248, + "learning_rate": 4.20338763060549e-05, + "loss": 0.6943, + "step": 5010 + }, + { + "epoch": 0.48709489617698426, + "grad_norm": 1.7221032223753934, + "learning_rate": 4.201765202154585e-05, + "loss": 0.6338, + "step": 5020 + }, + { + "epoch": 0.4880652047351058, + "grad_norm": 1.7180735514930228, + "learning_rate": 4.2001427737036796e-05, + "loss": 0.6698, + "step": 5030 + }, + { + "epoch": 0.48903551329322725, + "grad_norm": 1.698242629280347, + "learning_rate": 4.1985203452527746e-05, + "loss": 0.6972, + "step": 5040 + }, + { + "epoch": 0.4900058218513487, + "grad_norm": 1.8269714939747912, + "learning_rate": 4.196897916801869e-05, + "loss": 0.6212, + "step": 5050 + }, + { + "epoch": 0.4909761304094702, + "grad_norm": 1.4693827904505679, + "learning_rate": 4.195275488350964e-05, + "loss": 0.5933, + "step": 5060 + }, + { + "epoch": 0.4919464389675917, + "grad_norm": 1.49361517085645, + "learning_rate": 4.193653059900058e-05, + "loss": 0.5489, + "step": 5070 + }, + { + "epoch": 0.4929167475257132, + "grad_norm": 1.5630723710374232, + "learning_rate": 4.192030631449153e-05, + "loss": 0.6578, + "step": 5080 + }, + { + "epoch": 0.49388705608383465, + "grad_norm": 1.6495843346442778, + "learning_rate": 4.1904082029982474e-05, + "loss": 0.6906, + "step": 5090 + }, + { + "epoch": 0.4948573646419561, + "grad_norm": 1.7983678208711196, + "learning_rate": 4.1887857745473424e-05, + "loss": 0.5486, + "step": 5100 + }, + { + "epoch": 0.49582767320007765, + "grad_norm": 2.00722752034129, + "learning_rate": 4.1871633460964374e-05, + "loss": 0.648, + "step": 5110 + }, + { + "epoch": 0.4967979817581991, + "grad_norm": 1.7918897206429327, + "learning_rate": 4.185540917645532e-05, + "loss": 0.6693, + "step": 5120 + }, + { + "epoch": 0.4977682903163206, + "grad_norm": 1.3410097713324785, + "learning_rate": 4.1839184891946267e-05, + "loss": 0.6065, + "step": 5130 + }, + { + "epoch": 0.49873859887444205, + "grad_norm": 1.8485275360772722, + "learning_rate": 4.182296060743721e-05, + "loss": 0.6802, + "step": 5140 + }, + { + "epoch": 0.4997089074325636, + "grad_norm": 1.923253820990962, + "learning_rate": 4.180673632292816e-05, + "loss": 0.6014, + "step": 5150 + }, + { + "epoch": 0.500679215990685, + "grad_norm": 1.325223633752785, + "learning_rate": 4.17905120384191e-05, + "loss": 0.5787, + "step": 5160 + }, + { + "epoch": 0.5016495245488065, + "grad_norm": 1.7937826702440574, + "learning_rate": 4.177428775391005e-05, + "loss": 0.6197, + "step": 5170 + }, + { + "epoch": 0.502619833106928, + "grad_norm": 1.4050667202069218, + "learning_rate": 4.1758063469401e-05, + "loss": 0.6, + "step": 5180 + }, + { + "epoch": 0.5035901416650495, + "grad_norm": 1.5656335055581363, + "learning_rate": 4.174183918489195e-05, + "loss": 0.6924, + "step": 5190 + }, + { + "epoch": 0.504560450223171, + "grad_norm": 1.3325372640233277, + "learning_rate": 4.1725614900382895e-05, + "loss": 0.6477, + "step": 5200 + }, + { + "epoch": 0.5055307587812925, + "grad_norm": 1.5612035852842387, + "learning_rate": 4.1709390615873844e-05, + "loss": 0.6372, + "step": 5210 + }, + { + "epoch": 0.5065010673394139, + "grad_norm": 1.6164770277307943, + "learning_rate": 4.1693166331364794e-05, + "loss": 0.5822, + "step": 5220 + }, + { + "epoch": 0.5074713758975354, + "grad_norm": 1.8957196740567803, + "learning_rate": 4.167694204685574e-05, + "loss": 0.5695, + "step": 5230 + }, + { + "epoch": 0.5084416844556569, + "grad_norm": 1.505057761308431, + "learning_rate": 4.166071776234669e-05, + "loss": 0.5893, + "step": 5240 + }, + { + "epoch": 0.5094119930137784, + "grad_norm": 1.2392435413477463, + "learning_rate": 4.164449347783763e-05, + "loss": 0.6641, + "step": 5250 + }, + { + "epoch": 0.5103823015718999, + "grad_norm": 1.8002682332596205, + "learning_rate": 4.162826919332858e-05, + "loss": 0.5892, + "step": 5260 + }, + { + "epoch": 0.5113526101300213, + "grad_norm": 1.5161558066332184, + "learning_rate": 4.161204490881952e-05, + "loss": 0.5985, + "step": 5270 + }, + { + "epoch": 0.5123229186881428, + "grad_norm": 1.3773663290971068, + "learning_rate": 4.159582062431047e-05, + "loss": 0.6244, + "step": 5280 + }, + { + "epoch": 0.5132932272462644, + "grad_norm": 1.8649060352228473, + "learning_rate": 4.1579596339801415e-05, + "loss": 0.6144, + "step": 5290 + }, + { + "epoch": 0.5142635358043858, + "grad_norm": 1.711468575333758, + "learning_rate": 4.1563372055292365e-05, + "loss": 0.5889, + "step": 5300 + }, + { + "epoch": 0.5152338443625073, + "grad_norm": 1.4531536302097425, + "learning_rate": 4.1547147770783315e-05, + "loss": 0.6063, + "step": 5310 + }, + { + "epoch": 0.5162041529206287, + "grad_norm": 1.6628035885794548, + "learning_rate": 4.153092348627426e-05, + "loss": 0.5872, + "step": 5320 + }, + { + "epoch": 0.5171744614787502, + "grad_norm": 1.43386966167638, + "learning_rate": 4.151469920176521e-05, + "loss": 0.5836, + "step": 5330 + }, + { + "epoch": 0.5181447700368718, + "grad_norm": 1.6338584720484304, + "learning_rate": 4.149847491725615e-05, + "loss": 0.583, + "step": 5340 + }, + { + "epoch": 0.5191150785949932, + "grad_norm": 1.602663089927568, + "learning_rate": 4.14822506327471e-05, + "loss": 0.6466, + "step": 5350 + }, + { + "epoch": 0.5200853871531147, + "grad_norm": 0.993656188560276, + "learning_rate": 4.1466026348238044e-05, + "loss": 0.5972, + "step": 5360 + }, + { + "epoch": 0.5210556957112362, + "grad_norm": 1.5353879720655148, + "learning_rate": 4.144980206372899e-05, + "loss": 0.6159, + "step": 5370 + }, + { + "epoch": 0.5220260042693576, + "grad_norm": 1.4710027502404226, + "learning_rate": 4.143357777921994e-05, + "loss": 0.6231, + "step": 5380 + }, + { + "epoch": 0.5229963128274792, + "grad_norm": 1.3154974116618938, + "learning_rate": 4.1417353494710886e-05, + "loss": 0.6414, + "step": 5390 + }, + { + "epoch": 0.5239666213856006, + "grad_norm": 1.5917256061619933, + "learning_rate": 4.1401129210201836e-05, + "loss": 0.5574, + "step": 5400 + }, + { + "epoch": 0.5249369299437221, + "grad_norm": 1.999701613939348, + "learning_rate": 4.138490492569278e-05, + "loss": 0.6659, + "step": 5410 + }, + { + "epoch": 0.5259072385018436, + "grad_norm": 1.158059375940914, + "learning_rate": 4.136868064118373e-05, + "loss": 0.5702, + "step": 5420 + }, + { + "epoch": 0.526877547059965, + "grad_norm": 1.6503529559993917, + "learning_rate": 4.135245635667467e-05, + "loss": 0.5866, + "step": 5430 + }, + { + "epoch": 0.5278478556180866, + "grad_norm": 1.778595452682844, + "learning_rate": 4.133623207216562e-05, + "loss": 0.6727, + "step": 5440 + }, + { + "epoch": 0.5288181641762081, + "grad_norm": 2.153378806689067, + "learning_rate": 4.1320007787656564e-05, + "loss": 0.5849, + "step": 5450 + }, + { + "epoch": 0.5297884727343295, + "grad_norm": 1.5480145123110607, + "learning_rate": 4.1303783503147514e-05, + "loss": 0.5985, + "step": 5460 + }, + { + "epoch": 0.530758781292451, + "grad_norm": 1.3074515838089584, + "learning_rate": 4.1287559218638464e-05, + "loss": 0.6567, + "step": 5470 + }, + { + "epoch": 0.5317290898505724, + "grad_norm": 1.2634236320868193, + "learning_rate": 4.127133493412941e-05, + "loss": 0.6744, + "step": 5480 + }, + { + "epoch": 0.532699398408694, + "grad_norm": 1.4158920202942755, + "learning_rate": 4.125511064962036e-05, + "loss": 0.6089, + "step": 5490 + }, + { + "epoch": 0.5336697069668155, + "grad_norm": 1.530835103672291, + "learning_rate": 4.12388863651113e-05, + "loss": 0.6637, + "step": 5500 + }, + { + "epoch": 0.5336697069668155, + "eval_loss": 0.6922717094421387, + "eval_runtime": 2471.913, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.362, + "step": 5500 + }, + { + "epoch": 0.5346400155249369, + "grad_norm": 1.0979844633621818, + "learning_rate": 4.122266208060225e-05, + "loss": 0.6711, + "step": 5510 + }, + { + "epoch": 0.5356103240830584, + "grad_norm": 1.9531703151807043, + "learning_rate": 4.120643779609319e-05, + "loss": 0.634, + "step": 5520 + }, + { + "epoch": 0.5365806326411799, + "grad_norm": 1.4345990864009392, + "learning_rate": 4.119021351158414e-05, + "loss": 0.6289, + "step": 5530 + }, + { + "epoch": 0.5375509411993014, + "grad_norm": 1.5002874747132873, + "learning_rate": 4.1173989227075085e-05, + "loss": 0.6655, + "step": 5540 + }, + { + "epoch": 0.5385212497574229, + "grad_norm": 1.4114133378596565, + "learning_rate": 4.1157764942566035e-05, + "loss": 0.6314, + "step": 5550 + }, + { + "epoch": 0.5394915583155443, + "grad_norm": 1.2792208401822267, + "learning_rate": 4.1141540658056985e-05, + "loss": 0.5622, + "step": 5560 + }, + { + "epoch": 0.5404618668736658, + "grad_norm": 1.4436849148969537, + "learning_rate": 4.112531637354793e-05, + "loss": 0.6758, + "step": 5570 + }, + { + "epoch": 0.5414321754317873, + "grad_norm": 1.495469226735889, + "learning_rate": 4.110909208903888e-05, + "loss": 0.6457, + "step": 5580 + }, + { + "epoch": 0.5424024839899088, + "grad_norm": 1.5696729181281173, + "learning_rate": 4.109286780452982e-05, + "loss": 0.5434, + "step": 5590 + }, + { + "epoch": 0.5433727925480303, + "grad_norm": 1.4884099597371836, + "learning_rate": 4.107664352002077e-05, + "loss": 0.6602, + "step": 5600 + }, + { + "epoch": 0.5443431011061518, + "grad_norm": 1.364422329620206, + "learning_rate": 4.106041923551171e-05, + "loss": 0.6093, + "step": 5610 + }, + { + "epoch": 0.5453134096642732, + "grad_norm": 1.710525492450016, + "learning_rate": 4.104419495100266e-05, + "loss": 0.5733, + "step": 5620 + }, + { + "epoch": 0.5462837182223947, + "grad_norm": 1.8645615729328109, + "learning_rate": 4.1027970666493606e-05, + "loss": 0.6301, + "step": 5630 + }, + { + "epoch": 0.5472540267805162, + "grad_norm": 2.1212649730065403, + "learning_rate": 4.1011746381984556e-05, + "loss": 0.6489, + "step": 5640 + }, + { + "epoch": 0.5482243353386377, + "grad_norm": 1.3978835293983003, + "learning_rate": 4.0995522097475506e-05, + "loss": 0.6651, + "step": 5650 + }, + { + "epoch": 0.5491946438967592, + "grad_norm": 1.968855731350402, + "learning_rate": 4.097929781296645e-05, + "loss": 0.5849, + "step": 5660 + }, + { + "epoch": 0.5501649524548806, + "grad_norm": 1.3003479487793523, + "learning_rate": 4.09630735284574e-05, + "loss": 0.5981, + "step": 5670 + }, + { + "epoch": 0.5511352610130021, + "grad_norm": 1.3093675797291962, + "learning_rate": 4.094684924394834e-05, + "loss": 0.6001, + "step": 5680 + }, + { + "epoch": 0.5521055695711237, + "grad_norm": 1.386619668616355, + "learning_rate": 4.093062495943929e-05, + "loss": 0.587, + "step": 5690 + }, + { + "epoch": 0.5530758781292451, + "grad_norm": 2.1925053494451108, + "learning_rate": 4.0914400674930234e-05, + "loss": 0.6748, + "step": 5700 + }, + { + "epoch": 0.5540461866873666, + "grad_norm": 1.401533269192193, + "learning_rate": 4.0898176390421184e-05, + "loss": 0.5987, + "step": 5710 + }, + { + "epoch": 0.555016495245488, + "grad_norm": 1.8417454170154262, + "learning_rate": 4.088195210591213e-05, + "loss": 0.6169, + "step": 5720 + }, + { + "epoch": 0.5559868038036095, + "grad_norm": 1.70211748961807, + "learning_rate": 4.086572782140308e-05, + "loss": 0.6231, + "step": 5730 + }, + { + "epoch": 0.5569571123617311, + "grad_norm": 1.449836249162655, + "learning_rate": 4.0849503536894026e-05, + "loss": 0.6636, + "step": 5740 + }, + { + "epoch": 0.5579274209198525, + "grad_norm": 1.411704552119965, + "learning_rate": 4.083327925238497e-05, + "loss": 0.6334, + "step": 5750 + }, + { + "epoch": 0.558897729477974, + "grad_norm": 1.3373247464479125, + "learning_rate": 4.081705496787592e-05, + "loss": 0.6127, + "step": 5760 + }, + { + "epoch": 0.5598680380360955, + "grad_norm": 1.6669305203635734, + "learning_rate": 4.080083068336686e-05, + "loss": 0.6419, + "step": 5770 + }, + { + "epoch": 0.5608383465942169, + "grad_norm": 1.3599639094085172, + "learning_rate": 4.078460639885781e-05, + "loss": 0.6838, + "step": 5780 + }, + { + "epoch": 0.5618086551523385, + "grad_norm": 1.5754322431175416, + "learning_rate": 4.0768382114348755e-05, + "loss": 0.5997, + "step": 5790 + }, + { + "epoch": 0.5627789637104599, + "grad_norm": 1.667207521590911, + "learning_rate": 4.0752157829839705e-05, + "loss": 0.6199, + "step": 5800 + }, + { + "epoch": 0.5637492722685814, + "grad_norm": 1.5869372089733027, + "learning_rate": 4.0735933545330654e-05, + "loss": 0.5779, + "step": 5810 + }, + { + "epoch": 0.5647195808267029, + "grad_norm": 1.7150097071735784, + "learning_rate": 4.07197092608216e-05, + "loss": 0.5978, + "step": 5820 + }, + { + "epoch": 0.5656898893848243, + "grad_norm": 1.1835353092728575, + "learning_rate": 4.070348497631255e-05, + "loss": 0.6781, + "step": 5830 + }, + { + "epoch": 0.5666601979429459, + "grad_norm": 1.7307613010314937, + "learning_rate": 4.068726069180349e-05, + "loss": 0.634, + "step": 5840 + }, + { + "epoch": 0.5676305065010674, + "grad_norm": 1.6364191249486493, + "learning_rate": 4.067103640729444e-05, + "loss": 0.6377, + "step": 5850 + }, + { + "epoch": 0.5686008150591888, + "grad_norm": 1.4754833764988036, + "learning_rate": 4.065481212278538e-05, + "loss": 0.6148, + "step": 5860 + }, + { + "epoch": 0.5695711236173103, + "grad_norm": 1.8389583610281375, + "learning_rate": 4.063858783827633e-05, + "loss": 0.6739, + "step": 5870 + }, + { + "epoch": 0.5705414321754317, + "grad_norm": 1.4669099461048227, + "learning_rate": 4.0622363553767276e-05, + "loss": 0.598, + "step": 5880 + }, + { + "epoch": 0.5715117407335533, + "grad_norm": 1.609973267529918, + "learning_rate": 4.0606139269258226e-05, + "loss": 0.5476, + "step": 5890 + }, + { + "epoch": 0.5724820492916748, + "grad_norm": 1.6951117185016165, + "learning_rate": 4.0589914984749175e-05, + "loss": 0.595, + "step": 5900 + }, + { + "epoch": 0.5734523578497962, + "grad_norm": 1.494158253886906, + "learning_rate": 4.057369070024012e-05, + "loss": 0.6886, + "step": 5910 + }, + { + "epoch": 0.5744226664079177, + "grad_norm": 1.3036559330975726, + "learning_rate": 4.055746641573107e-05, + "loss": 0.5986, + "step": 5920 + }, + { + "epoch": 0.5753929749660393, + "grad_norm": 1.4737709906961922, + "learning_rate": 4.054124213122201e-05, + "loss": 0.615, + "step": 5930 + }, + { + "epoch": 0.5763632835241607, + "grad_norm": 1.4022175413897375, + "learning_rate": 4.052501784671296e-05, + "loss": 0.6398, + "step": 5940 + }, + { + "epoch": 0.5773335920822822, + "grad_norm": 1.433959705605443, + "learning_rate": 4.0508793562203904e-05, + "loss": 0.6415, + "step": 5950 + }, + { + "epoch": 0.5783039006404036, + "grad_norm": 1.7123129741732308, + "learning_rate": 4.0492569277694854e-05, + "loss": 0.6216, + "step": 5960 + }, + { + "epoch": 0.5792742091985251, + "grad_norm": 2.131052124619936, + "learning_rate": 4.04763449931858e-05, + "loss": 0.5596, + "step": 5970 + }, + { + "epoch": 0.5802445177566466, + "grad_norm": 1.2233223645362516, + "learning_rate": 4.046012070867675e-05, + "loss": 0.6023, + "step": 5980 + }, + { + "epoch": 0.5812148263147681, + "grad_norm": 1.734811390420522, + "learning_rate": 4.0443896424167696e-05, + "loss": 0.6347, + "step": 5990 + }, + { + "epoch": 0.5821851348728896, + "grad_norm": 1.626024609025115, + "learning_rate": 4.0427672139658646e-05, + "loss": 0.5688, + "step": 6000 + }, + { + "epoch": 0.5821851348728896, + "eval_loss": 0.6903010606765747, + "eval_runtime": 2473.5711, + "eval_samples_per_second": 0.724, + "eval_steps_per_second": 0.362, + "step": 6000 + }, + { + "epoch": 0.5831554434310111, + "grad_norm": 1.3892850877811105, + "learning_rate": 4.0411447855149596e-05, + "loss": 0.6131, + "step": 6010 + }, + { + "epoch": 0.5841257519891325, + "grad_norm": 1.6290044308163973, + "learning_rate": 4.039522357064054e-05, + "loss": 0.6319, + "step": 6020 + }, + { + "epoch": 0.585096060547254, + "grad_norm": 1.7770012777694764, + "learning_rate": 4.037899928613149e-05, + "loss": 0.5756, + "step": 6030 + }, + { + "epoch": 0.5860663691053755, + "grad_norm": 0.9954028008191703, + "learning_rate": 4.036277500162243e-05, + "loss": 0.6161, + "step": 6040 + }, + { + "epoch": 0.587036677663497, + "grad_norm": 1.7893541919562175, + "learning_rate": 4.034655071711338e-05, + "loss": 0.62, + "step": 6050 + }, + { + "epoch": 0.5880069862216185, + "grad_norm": 1.6500512739247042, + "learning_rate": 4.0330326432604324e-05, + "loss": 0.5928, + "step": 6060 + }, + { + "epoch": 0.5889772947797399, + "grad_norm": 1.6568317215206447, + "learning_rate": 4.0314102148095274e-05, + "loss": 0.6043, + "step": 6070 + }, + { + "epoch": 0.5899476033378614, + "grad_norm": 2.006336465212855, + "learning_rate": 4.029787786358622e-05, + "loss": 0.6283, + "step": 6080 + }, + { + "epoch": 0.590917911895983, + "grad_norm": 1.481658511307882, + "learning_rate": 4.028165357907717e-05, + "loss": 0.6229, + "step": 6090 + }, + { + "epoch": 0.5918882204541044, + "grad_norm": 1.7467752898199094, + "learning_rate": 4.0265429294568117e-05, + "loss": 0.5997, + "step": 6100 + }, + { + "epoch": 0.5928585290122259, + "grad_norm": 1.747997366501937, + "learning_rate": 4.024920501005906e-05, + "loss": 0.6441, + "step": 6110 + }, + { + "epoch": 0.5938288375703473, + "grad_norm": 1.6963969244893895, + "learning_rate": 4.023298072555001e-05, + "loss": 0.6059, + "step": 6120 + }, + { + "epoch": 0.5947991461284688, + "grad_norm": 1.335184560881826, + "learning_rate": 4.021675644104095e-05, + "loss": 0.5627, + "step": 6130 + }, + { + "epoch": 0.5957694546865904, + "grad_norm": 1.5256295408049876, + "learning_rate": 4.02005321565319e-05, + "loss": 0.578, + "step": 6140 + }, + { + "epoch": 0.5967397632447118, + "grad_norm": 1.7615333752357474, + "learning_rate": 4.0184307872022845e-05, + "loss": 0.6046, + "step": 6150 + }, + { + "epoch": 0.5977100718028333, + "grad_norm": 1.9321393707932597, + "learning_rate": 4.0168083587513795e-05, + "loss": 0.615, + "step": 6160 + }, + { + "epoch": 0.5986803803609548, + "grad_norm": 1.2267052175289888, + "learning_rate": 4.015185930300474e-05, + "loss": 0.6238, + "step": 6170 + }, + { + "epoch": 0.5996506889190762, + "grad_norm": 1.8092624768121868, + "learning_rate": 4.013563501849569e-05, + "loss": 0.6514, + "step": 6180 + }, + { + "epoch": 0.6006209974771978, + "grad_norm": 1.546108807816704, + "learning_rate": 4.011941073398664e-05, + "loss": 0.6071, + "step": 6190 + }, + { + "epoch": 0.6015913060353192, + "grad_norm": 1.8186789877657976, + "learning_rate": 4.010318644947758e-05, + "loss": 0.6624, + "step": 6200 + }, + { + "epoch": 0.6025616145934407, + "grad_norm": 1.6349446430924426, + "learning_rate": 4.008696216496853e-05, + "loss": 0.6208, + "step": 6210 + }, + { + "epoch": 0.6035319231515622, + "grad_norm": 1.6574782074117176, + "learning_rate": 4.007073788045947e-05, + "loss": 0.6067, + "step": 6220 + }, + { + "epoch": 0.6045022317096836, + "grad_norm": 1.372441926371392, + "learning_rate": 4.005451359595042e-05, + "loss": 0.5805, + "step": 6230 + }, + { + "epoch": 0.6054725402678052, + "grad_norm": 2.1366961584133164, + "learning_rate": 4.0038289311441366e-05, + "loss": 0.6797, + "step": 6240 + }, + { + "epoch": 0.6064428488259267, + "grad_norm": 1.8553936516223448, + "learning_rate": 4.0022065026932316e-05, + "loss": 0.582, + "step": 6250 + }, + { + "epoch": 0.6074131573840481, + "grad_norm": 1.95303083914795, + "learning_rate": 4.000584074242326e-05, + "loss": 0.6474, + "step": 6260 + }, + { + "epoch": 0.6083834659421696, + "grad_norm": 1.6884162400220937, + "learning_rate": 3.998961645791421e-05, + "loss": 0.6339, + "step": 6270 + }, + { + "epoch": 0.609353774500291, + "grad_norm": 1.7825690304753616, + "learning_rate": 3.997339217340516e-05, + "loss": 0.5568, + "step": 6280 + }, + { + "epoch": 0.6103240830584126, + "grad_norm": 1.5564493386693086, + "learning_rate": 3.99571678888961e-05, + "loss": 0.602, + "step": 6290 + }, + { + "epoch": 0.6112943916165341, + "grad_norm": 1.667415447715766, + "learning_rate": 3.994094360438705e-05, + "loss": 0.5877, + "step": 6300 + }, + { + "epoch": 0.6122647001746555, + "grad_norm": 1.6232084554425263, + "learning_rate": 3.9924719319877994e-05, + "loss": 0.6247, + "step": 6310 + }, + { + "epoch": 0.613235008732777, + "grad_norm": 1.7806635141428606, + "learning_rate": 3.9908495035368944e-05, + "loss": 0.5651, + "step": 6320 + }, + { + "epoch": 0.6142053172908986, + "grad_norm": 1.2898375089967073, + "learning_rate": 3.989227075085989e-05, + "loss": 0.6206, + "step": 6330 + }, + { + "epoch": 0.61517562584902, + "grad_norm": 1.762789516395008, + "learning_rate": 3.9876046466350836e-05, + "loss": 0.5538, + "step": 6340 + }, + { + "epoch": 0.6161459344071415, + "grad_norm": 1.8092369800491652, + "learning_rate": 3.985982218184178e-05, + "loss": 0.6236, + "step": 6350 + }, + { + "epoch": 0.6171162429652629, + "grad_norm": 1.8060820825881239, + "learning_rate": 3.984359789733273e-05, + "loss": 0.5875, + "step": 6360 + }, + { + "epoch": 0.6180865515233844, + "grad_norm": 1.7317977897542403, + "learning_rate": 3.982737361282368e-05, + "loss": 0.594, + "step": 6370 + }, + { + "epoch": 0.619056860081506, + "grad_norm": 1.5504516693013288, + "learning_rate": 3.981114932831462e-05, + "loss": 0.5833, + "step": 6380 + }, + { + "epoch": 0.6200271686396274, + "grad_norm": 1.3714460442712506, + "learning_rate": 3.979492504380557e-05, + "loss": 0.6048, + "step": 6390 + }, + { + "epoch": 0.6209974771977489, + "grad_norm": 1.741408754147969, + "learning_rate": 3.9778700759296515e-05, + "loss": 0.6729, + "step": 6400 + }, + { + "epoch": 0.6219677857558704, + "grad_norm": 1.0600787702033703, + "learning_rate": 3.9762476474787465e-05, + "loss": 0.6934, + "step": 6410 + }, + { + "epoch": 0.6229380943139918, + "grad_norm": 1.5722449067246123, + "learning_rate": 3.974625219027841e-05, + "loss": 0.6043, + "step": 6420 + }, + { + "epoch": 0.6239084028721134, + "grad_norm": 1.3176596782485965, + "learning_rate": 3.973002790576936e-05, + "loss": 0.5898, + "step": 6430 + }, + { + "epoch": 0.6248787114302348, + "grad_norm": 1.4177190750303141, + "learning_rate": 3.971380362126031e-05, + "loss": 0.6146, + "step": 6440 + }, + { + "epoch": 0.6258490199883563, + "grad_norm": 1.3993450966845369, + "learning_rate": 3.969757933675125e-05, + "loss": 0.608, + "step": 6450 + }, + { + "epoch": 0.6268193285464778, + "grad_norm": 1.6552784809097676, + "learning_rate": 3.96813550522422e-05, + "loss": 0.6078, + "step": 6460 + }, + { + "epoch": 0.6277896371045992, + "grad_norm": 1.315768136368434, + "learning_rate": 3.966513076773314e-05, + "loss": 0.6021, + "step": 6470 + }, + { + "epoch": 0.6287599456627208, + "grad_norm": 1.431816586824017, + "learning_rate": 3.964890648322409e-05, + "loss": 0.5571, + "step": 6480 + }, + { + "epoch": 0.6297302542208423, + "grad_norm": 1.4950226300857892, + "learning_rate": 3.9632682198715036e-05, + "loss": 0.6297, + "step": 6490 + }, + { + "epoch": 0.6307005627789637, + "grad_norm": 2.0781333722978284, + "learning_rate": 3.9616457914205985e-05, + "loss": 0.596, + "step": 6500 + }, + { + "epoch": 0.6307005627789637, + "eval_loss": 0.6844401955604553, + "eval_runtime": 2468.3658, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 6500 + }, + { + "epoch": 0.6316708713370852, + "grad_norm": 1.5337482284796442, + "learning_rate": 3.960023362969693e-05, + "loss": 0.6634, + "step": 6510 + }, + { + "epoch": 0.6326411798952066, + "grad_norm": 1.7040234243162709, + "learning_rate": 3.958400934518788e-05, + "loss": 0.5936, + "step": 6520 + }, + { + "epoch": 0.6336114884533282, + "grad_norm": 1.7943789476429368, + "learning_rate": 3.956778506067883e-05, + "loss": 0.6361, + "step": 6530 + }, + { + "epoch": 0.6345817970114497, + "grad_norm": 1.5737933588874193, + "learning_rate": 3.955156077616977e-05, + "loss": 0.6192, + "step": 6540 + }, + { + "epoch": 0.6355521055695711, + "grad_norm": 1.532774889369556, + "learning_rate": 3.953533649166072e-05, + "loss": 0.5694, + "step": 6550 + }, + { + "epoch": 0.6365224141276926, + "grad_norm": 1.6854751291833254, + "learning_rate": 3.9519112207151664e-05, + "loss": 0.6365, + "step": 6560 + }, + { + "epoch": 0.6374927226858141, + "grad_norm": 1.4850693595634958, + "learning_rate": 3.9502887922642613e-05, + "loss": 0.5966, + "step": 6570 + }, + { + "epoch": 0.6384630312439356, + "grad_norm": 1.5032167913978984, + "learning_rate": 3.9486663638133556e-05, + "loss": 0.6413, + "step": 6580 + }, + { + "epoch": 0.6394333398020571, + "grad_norm": 1.4058296063426399, + "learning_rate": 3.9470439353624506e-05, + "loss": 0.5845, + "step": 6590 + }, + { + "epoch": 0.6404036483601785, + "grad_norm": 1.563521067932564, + "learning_rate": 3.945421506911545e-05, + "loss": 0.6179, + "step": 6600 + }, + { + "epoch": 0.6413739569183, + "grad_norm": 1.4036808350972751, + "learning_rate": 3.94379907846064e-05, + "loss": 0.5942, + "step": 6610 + }, + { + "epoch": 0.6423442654764215, + "grad_norm": 1.4964922216668848, + "learning_rate": 3.942176650009735e-05, + "loss": 0.6516, + "step": 6620 + }, + { + "epoch": 0.643314574034543, + "grad_norm": 1.6801427039047954, + "learning_rate": 3.940554221558829e-05, + "loss": 0.6192, + "step": 6630 + }, + { + "epoch": 0.6442848825926645, + "grad_norm": 1.582294622357827, + "learning_rate": 3.938931793107924e-05, + "loss": 0.6051, + "step": 6640 + }, + { + "epoch": 0.645255191150786, + "grad_norm": 1.531859995817891, + "learning_rate": 3.9373093646570185e-05, + "loss": 0.6407, + "step": 6650 + }, + { + "epoch": 0.6462254997089074, + "grad_norm": 1.9088041020547684, + "learning_rate": 3.9356869362061134e-05, + "loss": 0.6078, + "step": 6660 + }, + { + "epoch": 0.6471958082670289, + "grad_norm": 1.3066981837793257, + "learning_rate": 3.934064507755208e-05, + "loss": 0.5665, + "step": 6670 + }, + { + "epoch": 0.6481661168251504, + "grad_norm": 1.313352722546981, + "learning_rate": 3.932442079304303e-05, + "loss": 0.5201, + "step": 6680 + }, + { + "epoch": 0.6491364253832719, + "grad_norm": 1.8145770193834194, + "learning_rate": 3.930819650853397e-05, + "loss": 0.636, + "step": 6690 + }, + { + "epoch": 0.6501067339413934, + "grad_norm": 1.6198734288858008, + "learning_rate": 3.929197222402492e-05, + "loss": 0.6529, + "step": 6700 + }, + { + "epoch": 0.6510770424995148, + "grad_norm": 1.6029042771301745, + "learning_rate": 3.927574793951587e-05, + "loss": 0.5777, + "step": 6710 + }, + { + "epoch": 0.6520473510576363, + "grad_norm": 1.7982640944263595, + "learning_rate": 3.925952365500681e-05, + "loss": 0.5833, + "step": 6720 + }, + { + "epoch": 0.6530176596157579, + "grad_norm": 1.9034497019204402, + "learning_rate": 3.924329937049776e-05, + "loss": 0.5981, + "step": 6730 + }, + { + "epoch": 0.6539879681738793, + "grad_norm": 1.3239435404528657, + "learning_rate": 3.9227075085988705e-05, + "loss": 0.5861, + "step": 6740 + }, + { + "epoch": 0.6549582767320008, + "grad_norm": 1.7333381509447607, + "learning_rate": 3.921085080147966e-05, + "loss": 0.5696, + "step": 6750 + }, + { + "epoch": 0.6559285852901222, + "grad_norm": 1.7408114949745195, + "learning_rate": 3.9194626516970605e-05, + "loss": 0.5868, + "step": 6760 + }, + { + "epoch": 0.6568988938482437, + "grad_norm": 1.4140478484561587, + "learning_rate": 3.9178402232461555e-05, + "loss": 0.6126, + "step": 6770 + }, + { + "epoch": 0.6578692024063653, + "grad_norm": 1.2359753742563322, + "learning_rate": 3.91621779479525e-05, + "loss": 0.595, + "step": 6780 + }, + { + "epoch": 0.6588395109644867, + "grad_norm": 1.4986954153208918, + "learning_rate": 3.914595366344345e-05, + "loss": 0.6399, + "step": 6790 + }, + { + "epoch": 0.6598098195226082, + "grad_norm": 1.60396773259511, + "learning_rate": 3.912972937893439e-05, + "loss": 0.573, + "step": 6800 + }, + { + "epoch": 0.6607801280807297, + "grad_norm": 1.6446241731745532, + "learning_rate": 3.911350509442534e-05, + "loss": 0.5758, + "step": 6810 + }, + { + "epoch": 0.6617504366388511, + "grad_norm": 1.6940327126152295, + "learning_rate": 3.909728080991629e-05, + "loss": 0.5646, + "step": 6820 + }, + { + "epoch": 0.6627207451969727, + "grad_norm": 1.6725911752868794, + "learning_rate": 3.908105652540723e-05, + "loss": 0.6282, + "step": 6830 + }, + { + "epoch": 0.6636910537550941, + "grad_norm": 1.724226678968073, + "learning_rate": 3.906483224089818e-05, + "loss": 0.6391, + "step": 6840 + }, + { + "epoch": 0.6646613623132156, + "grad_norm": 1.5959383703623191, + "learning_rate": 3.9048607956389126e-05, + "loss": 0.6003, + "step": 6850 + }, + { + "epoch": 0.6656316708713371, + "grad_norm": 1.7487148965006274, + "learning_rate": 3.9032383671880075e-05, + "loss": 0.6019, + "step": 6860 + }, + { + "epoch": 0.6666019794294585, + "grad_norm": 2.0088682339975312, + "learning_rate": 3.901615938737102e-05, + "loss": 0.5846, + "step": 6870 + }, + { + "epoch": 0.6675722879875801, + "grad_norm": 1.5646052877933723, + "learning_rate": 3.899993510286197e-05, + "loss": 0.6222, + "step": 6880 + }, + { + "epoch": 0.6685425965457016, + "grad_norm": 1.5750290096746722, + "learning_rate": 3.898371081835291e-05, + "loss": 0.6018, + "step": 6890 + }, + { + "epoch": 0.669512905103823, + "grad_norm": 1.2258053581934112, + "learning_rate": 3.896748653384386e-05, + "loss": 0.5697, + "step": 6900 + }, + { + "epoch": 0.6704832136619445, + "grad_norm": 1.9376975148901512, + "learning_rate": 3.895126224933481e-05, + "loss": 0.6168, + "step": 6910 + }, + { + "epoch": 0.6714535222200659, + "grad_norm": 1.4670771869188424, + "learning_rate": 3.8935037964825754e-05, + "loss": 0.5964, + "step": 6920 + }, + { + "epoch": 0.6724238307781875, + "grad_norm": 1.324084539118919, + "learning_rate": 3.8918813680316704e-05, + "loss": 0.5346, + "step": 6930 + }, + { + "epoch": 0.673394139336309, + "grad_norm": 1.717496207353836, + "learning_rate": 3.8902589395807647e-05, + "loss": 0.6112, + "step": 6940 + }, + { + "epoch": 0.6743644478944304, + "grad_norm": 2.019875964877309, + "learning_rate": 3.8886365111298596e-05, + "loss": 0.6522, + "step": 6950 + }, + { + "epoch": 0.6753347564525519, + "grad_norm": 1.3354957180640497, + "learning_rate": 3.887014082678954e-05, + "loss": 0.6256, + "step": 6960 + }, + { + "epoch": 0.6763050650106734, + "grad_norm": 1.6937952057212555, + "learning_rate": 3.885391654228049e-05, + "loss": 0.6176, + "step": 6970 + }, + { + "epoch": 0.6772753735687949, + "grad_norm": 1.5422166727550382, + "learning_rate": 3.883769225777143e-05, + "loss": 0.5546, + "step": 6980 + }, + { + "epoch": 0.6782456821269164, + "grad_norm": 1.7417491098472007, + "learning_rate": 3.882146797326238e-05, + "loss": 0.6267, + "step": 6990 + }, + { + "epoch": 0.6792159906850378, + "grad_norm": 1.5366921907221318, + "learning_rate": 3.880524368875333e-05, + "loss": 0.6218, + "step": 7000 + }, + { + "epoch": 0.6792159906850378, + "eval_loss": 0.678726851940155, + "eval_runtime": 2469.9254, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 7000 + }, + { + "epoch": 0.6801862992431593, + "grad_norm": 1.5097589222422163, + "learning_rate": 3.8789019404244275e-05, + "loss": 0.5925, + "step": 7010 + }, + { + "epoch": 0.6811566078012808, + "grad_norm": 2.1865443717654256, + "learning_rate": 3.8772795119735224e-05, + "loss": 0.6125, + "step": 7020 + }, + { + "epoch": 0.6821269163594023, + "grad_norm": 1.3913795509553841, + "learning_rate": 3.875657083522617e-05, + "loss": 0.6366, + "step": 7030 + }, + { + "epoch": 0.6830972249175238, + "grad_norm": 1.314507220249943, + "learning_rate": 3.874034655071712e-05, + "loss": 0.5609, + "step": 7040 + }, + { + "epoch": 0.6840675334756453, + "grad_norm": 1.5985439411272098, + "learning_rate": 3.872412226620806e-05, + "loss": 0.5946, + "step": 7050 + }, + { + "epoch": 0.6850378420337667, + "grad_norm": 1.8453526914168807, + "learning_rate": 3.870789798169901e-05, + "loss": 0.5389, + "step": 7060 + }, + { + "epoch": 0.6860081505918882, + "grad_norm": 1.746744853463753, + "learning_rate": 3.869167369718995e-05, + "loss": 0.5821, + "step": 7070 + }, + { + "epoch": 0.6869784591500097, + "grad_norm": 1.351619112366914, + "learning_rate": 3.86754494126809e-05, + "loss": 0.6333, + "step": 7080 + }, + { + "epoch": 0.6879487677081312, + "grad_norm": 1.6766693580749343, + "learning_rate": 3.865922512817185e-05, + "loss": 0.5905, + "step": 7090 + }, + { + "epoch": 0.6889190762662527, + "grad_norm": 1.6907020127934513, + "learning_rate": 3.8643000843662795e-05, + "loss": 0.583, + "step": 7100 + }, + { + "epoch": 0.6898893848243741, + "grad_norm": 1.8609743061215689, + "learning_rate": 3.8626776559153745e-05, + "loss": 0.6131, + "step": 7110 + }, + { + "epoch": 0.6908596933824956, + "grad_norm": 1.3507375736553595, + "learning_rate": 3.861055227464469e-05, + "loss": 0.6399, + "step": 7120 + }, + { + "epoch": 0.6918300019406172, + "grad_norm": 1.7786145489846785, + "learning_rate": 3.859432799013564e-05, + "loss": 0.5148, + "step": 7130 + }, + { + "epoch": 0.6928003104987386, + "grad_norm": 1.7336491727001198, + "learning_rate": 3.857810370562658e-05, + "loss": 0.6154, + "step": 7140 + }, + { + "epoch": 0.6937706190568601, + "grad_norm": 1.4164103938896966, + "learning_rate": 3.856187942111753e-05, + "loss": 0.5768, + "step": 7150 + }, + { + "epoch": 0.6947409276149815, + "grad_norm": 1.5705135987803418, + "learning_rate": 3.854565513660848e-05, + "loss": 0.6102, + "step": 7160 + }, + { + "epoch": 0.695711236173103, + "grad_norm": 1.4121262983361098, + "learning_rate": 3.8529430852099424e-05, + "loss": 0.61, + "step": 7170 + }, + { + "epoch": 0.6966815447312246, + "grad_norm": 1.8472104212242206, + "learning_rate": 3.851320656759037e-05, + "loss": 0.5911, + "step": 7180 + }, + { + "epoch": 0.697651853289346, + "grad_norm": 1.5344754452736826, + "learning_rate": 3.8496982283081316e-05, + "loss": 0.606, + "step": 7190 + }, + { + "epoch": 0.6986221618474675, + "grad_norm": 1.6606427535357149, + "learning_rate": 3.8480757998572266e-05, + "loss": 0.6062, + "step": 7200 + }, + { + "epoch": 0.699592470405589, + "grad_norm": 1.633183446097436, + "learning_rate": 3.846453371406321e-05, + "loss": 0.5828, + "step": 7210 + }, + { + "epoch": 0.7005627789637104, + "grad_norm": 1.646557901144212, + "learning_rate": 3.844830942955416e-05, + "loss": 0.5354, + "step": 7220 + }, + { + "epoch": 0.701533087521832, + "grad_norm": 1.64918421348089, + "learning_rate": 3.84320851450451e-05, + "loss": 0.551, + "step": 7230 + }, + { + "epoch": 0.7025033960799534, + "grad_norm": 1.6382049443147468, + "learning_rate": 3.841586086053605e-05, + "loss": 0.5723, + "step": 7240 + }, + { + "epoch": 0.7034737046380749, + "grad_norm": 2.1034428309614523, + "learning_rate": 3.8399636576027e-05, + "loss": 0.6282, + "step": 7250 + }, + { + "epoch": 0.7044440131961964, + "grad_norm": 1.4993896668880777, + "learning_rate": 3.8383412291517944e-05, + "loss": 0.5221, + "step": 7260 + }, + { + "epoch": 0.7054143217543178, + "grad_norm": 1.5357479402580956, + "learning_rate": 3.8367188007008894e-05, + "loss": 0.6176, + "step": 7270 + }, + { + "epoch": 0.7063846303124394, + "grad_norm": 1.5289777074279678, + "learning_rate": 3.835096372249984e-05, + "loss": 0.6045, + "step": 7280 + }, + { + "epoch": 0.7073549388705609, + "grad_norm": 1.6907343435757922, + "learning_rate": 3.833473943799079e-05, + "loss": 0.5778, + "step": 7290 + }, + { + "epoch": 0.7083252474286823, + "grad_norm": 1.546188967929013, + "learning_rate": 3.831851515348173e-05, + "loss": 0.5966, + "step": 7300 + }, + { + "epoch": 0.7092955559868038, + "grad_norm": 1.6668813308937025, + "learning_rate": 3.830229086897268e-05, + "loss": 0.5443, + "step": 7310 + }, + { + "epoch": 0.7102658645449252, + "grad_norm": 2.0411822746490444, + "learning_rate": 3.828606658446362e-05, + "loss": 0.5663, + "step": 7320 + }, + { + "epoch": 0.7112361731030468, + "grad_norm": 1.5825597761459882, + "learning_rate": 3.826984229995457e-05, + "loss": 0.6143, + "step": 7330 + }, + { + "epoch": 0.7122064816611683, + "grad_norm": 1.7867885913406227, + "learning_rate": 3.825361801544552e-05, + "loss": 0.6094, + "step": 7340 + }, + { + "epoch": 0.7131767902192897, + "grad_norm": 1.4415937345324663, + "learning_rate": 3.8237393730936465e-05, + "loss": 0.552, + "step": 7350 + }, + { + "epoch": 0.7141470987774112, + "grad_norm": 1.8623916924146842, + "learning_rate": 3.8221169446427415e-05, + "loss": 0.6035, + "step": 7360 + }, + { + "epoch": 0.7151174073355328, + "grad_norm": 1.6112921767840875, + "learning_rate": 3.820494516191836e-05, + "loss": 0.5957, + "step": 7370 + }, + { + "epoch": 0.7160877158936542, + "grad_norm": 1.6295510822010462, + "learning_rate": 3.818872087740931e-05, + "loss": 0.5906, + "step": 7380 + }, + { + "epoch": 0.7170580244517757, + "grad_norm": 1.3787962703695553, + "learning_rate": 3.817249659290025e-05, + "loss": 0.5884, + "step": 7390 + }, + { + "epoch": 0.7180283330098971, + "grad_norm": 1.5293927693421874, + "learning_rate": 3.81562723083912e-05, + "loss": 0.6277, + "step": 7400 + }, + { + "epoch": 0.7189986415680186, + "grad_norm": 1.4147252260121417, + "learning_rate": 3.8140048023882143e-05, + "loss": 0.5725, + "step": 7410 + }, + { + "epoch": 0.7199689501261402, + "grad_norm": 1.7736403425897478, + "learning_rate": 3.812382373937309e-05, + "loss": 0.533, + "step": 7420 + }, + { + "epoch": 0.7209392586842616, + "grad_norm": 1.642532067779172, + "learning_rate": 3.810759945486404e-05, + "loss": 0.6405, + "step": 7430 + }, + { + "epoch": 0.7219095672423831, + "grad_norm": 1.8702737255141961, + "learning_rate": 3.8091375170354986e-05, + "loss": 0.523, + "step": 7440 + }, + { + "epoch": 0.7228798758005046, + "grad_norm": 1.82343375200645, + "learning_rate": 3.8075150885845936e-05, + "loss": 0.54, + "step": 7450 + }, + { + "epoch": 0.723850184358626, + "grad_norm": 1.6212132399265973, + "learning_rate": 3.805892660133688e-05, + "loss": 0.6016, + "step": 7460 + }, + { + "epoch": 0.7248204929167475, + "grad_norm": 1.7707895342391216, + "learning_rate": 3.804270231682783e-05, + "loss": 0.5724, + "step": 7470 + }, + { + "epoch": 0.725790801474869, + "grad_norm": 1.4832754109529855, + "learning_rate": 3.802647803231877e-05, + "loss": 0.5554, + "step": 7480 + }, + { + "epoch": 0.7267611100329905, + "grad_norm": 1.4890517160303556, + "learning_rate": 3.801025374780972e-05, + "loss": 0.6065, + "step": 7490 + }, + { + "epoch": 0.727731418591112, + "grad_norm": 2.047151127095432, + "learning_rate": 3.799402946330067e-05, + "loss": 0.6299, + "step": 7500 + }, + { + "epoch": 0.727731418591112, + "eval_loss": 0.6740881204605103, + "eval_runtime": 2470.6314, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.363, + "step": 7500 + }, + { + "epoch": 0.7287017271492334, + "grad_norm": 1.6234544208509818, + "learning_rate": 3.7977805178791614e-05, + "loss": 0.6013, + "step": 7510 + }, + { + "epoch": 0.729672035707355, + "grad_norm": 1.4452320512475898, + "learning_rate": 3.7961580894282564e-05, + "loss": 0.5479, + "step": 7520 + }, + { + "epoch": 0.7306423442654765, + "grad_norm": 1.2290296761164277, + "learning_rate": 3.794535660977351e-05, + "loss": 0.5471, + "step": 7530 + }, + { + "epoch": 0.7316126528235979, + "grad_norm": 1.567003648013224, + "learning_rate": 3.792913232526446e-05, + "loss": 0.6092, + "step": 7540 + }, + { + "epoch": 0.7325829613817194, + "grad_norm": 1.6011455152210408, + "learning_rate": 3.7912908040755406e-05, + "loss": 0.629, + "step": 7550 + }, + { + "epoch": 0.7335532699398408, + "grad_norm": 2.1218593111989192, + "learning_rate": 3.7896683756246356e-05, + "loss": 0.5773, + "step": 7560 + }, + { + "epoch": 0.7345235784979623, + "grad_norm": 1.8323104555880425, + "learning_rate": 3.78804594717373e-05, + "loss": 0.618, + "step": 7570 + }, + { + "epoch": 0.7354938870560839, + "grad_norm": 1.4152516707860787, + "learning_rate": 3.786423518722825e-05, + "loss": 0.6062, + "step": 7580 + }, + { + "epoch": 0.7364641956142053, + "grad_norm": 1.6284692342846199, + "learning_rate": 3.784801090271919e-05, + "loss": 0.5671, + "step": 7590 + }, + { + "epoch": 0.7374345041723268, + "grad_norm": 1.426043120608999, + "learning_rate": 3.783178661821014e-05, + "loss": 0.5972, + "step": 7600 + }, + { + "epoch": 0.7384048127304483, + "grad_norm": 1.893183882199419, + "learning_rate": 3.7815562333701085e-05, + "loss": 0.6193, + "step": 7610 + }, + { + "epoch": 0.7393751212885697, + "grad_norm": 1.9982181406611617, + "learning_rate": 3.7799338049192034e-05, + "loss": 0.5532, + "step": 7620 + }, + { + "epoch": 0.7403454298466913, + "grad_norm": 1.8557864601006913, + "learning_rate": 3.7783113764682984e-05, + "loss": 0.5777, + "step": 7630 + }, + { + "epoch": 0.7413157384048127, + "grad_norm": 1.8775040642513798, + "learning_rate": 3.776688948017393e-05, + "loss": 0.6268, + "step": 7640 + }, + { + "epoch": 0.7422860469629342, + "grad_norm": 1.758574155734976, + "learning_rate": 3.775066519566488e-05, + "loss": 0.5662, + "step": 7650 + }, + { + "epoch": 0.7432563555210557, + "grad_norm": 2.194684585403217, + "learning_rate": 3.773444091115582e-05, + "loss": 0.6628, + "step": 7660 + }, + { + "epoch": 0.7442266640791771, + "grad_norm": 1.3673983887882022, + "learning_rate": 3.771821662664677e-05, + "loss": 0.6061, + "step": 7670 + }, + { + "epoch": 0.7451969726372987, + "grad_norm": 1.2724099570692131, + "learning_rate": 3.770199234213771e-05, + "loss": 0.6337, + "step": 7680 + }, + { + "epoch": 0.7461672811954202, + "grad_norm": 1.6823171665935568, + "learning_rate": 3.768576805762866e-05, + "loss": 0.5528, + "step": 7690 + }, + { + "epoch": 0.7471375897535416, + "grad_norm": 1.9633396330523931, + "learning_rate": 3.7669543773119605e-05, + "loss": 0.5808, + "step": 7700 + }, + { + "epoch": 0.7481078983116631, + "grad_norm": 1.7782111547784767, + "learning_rate": 3.7653319488610555e-05, + "loss": 0.5767, + "step": 7710 + }, + { + "epoch": 0.7490782068697845, + "grad_norm": 2.3110549543336205, + "learning_rate": 3.7637095204101505e-05, + "loss": 0.6062, + "step": 7720 + }, + { + "epoch": 0.7500485154279061, + "grad_norm": 1.6939270574842946, + "learning_rate": 3.762087091959245e-05, + "loss": 0.6225, + "step": 7730 + }, + { + "epoch": 0.7510188239860276, + "grad_norm": 1.4820399391564874, + "learning_rate": 3.76046466350834e-05, + "loss": 0.6047, + "step": 7740 + }, + { + "epoch": 0.751989132544149, + "grad_norm": 1.5672189262381615, + "learning_rate": 3.758842235057434e-05, + "loss": 0.5879, + "step": 7750 + }, + { + "epoch": 0.7529594411022705, + "grad_norm": 1.6472834744978406, + "learning_rate": 3.757219806606529e-05, + "loss": 0.5895, + "step": 7760 + }, + { + "epoch": 0.7539297496603921, + "grad_norm": 1.1201299864920753, + "learning_rate": 3.7555973781556234e-05, + "loss": 0.5801, + "step": 7770 + }, + { + "epoch": 0.7549000582185135, + "grad_norm": 2.1519831187317107, + "learning_rate": 3.753974949704718e-05, + "loss": 0.5806, + "step": 7780 + }, + { + "epoch": 0.755870366776635, + "grad_norm": 1.6558911877127844, + "learning_rate": 3.752352521253813e-05, + "loss": 0.6243, + "step": 7790 + }, + { + "epoch": 0.7568406753347564, + "grad_norm": 1.56259604588081, + "learning_rate": 3.7507300928029076e-05, + "loss": 0.5611, + "step": 7800 + }, + { + "epoch": 0.7578109838928779, + "grad_norm": 1.7228227668603993, + "learning_rate": 3.7491076643520026e-05, + "loss": 0.5517, + "step": 7810 + }, + { + "epoch": 0.7587812924509995, + "grad_norm": 1.4902669117767953, + "learning_rate": 3.747485235901097e-05, + "loss": 0.538, + "step": 7820 + }, + { + "epoch": 0.7597516010091209, + "grad_norm": 1.782224663749718, + "learning_rate": 3.745862807450192e-05, + "loss": 0.5753, + "step": 7830 + }, + { + "epoch": 0.7607219095672424, + "grad_norm": 1.9997818457050736, + "learning_rate": 3.744240378999286e-05, + "loss": 0.6007, + "step": 7840 + }, + { + "epoch": 0.7616922181253639, + "grad_norm": 1.4094657223176301, + "learning_rate": 3.742617950548381e-05, + "loss": 0.6338, + "step": 7850 + }, + { + "epoch": 0.7626625266834853, + "grad_norm": 1.9288686398933608, + "learning_rate": 3.7409955220974754e-05, + "loss": 0.5906, + "step": 7860 + }, + { + "epoch": 0.7636328352416069, + "grad_norm": 1.7390584642787001, + "learning_rate": 3.7393730936465704e-05, + "loss": 0.5738, + "step": 7870 + }, + { + "epoch": 0.7646031437997283, + "grad_norm": 1.9438974814009289, + "learning_rate": 3.7377506651956654e-05, + "loss": 0.5121, + "step": 7880 + }, + { + "epoch": 0.7655734523578498, + "grad_norm": 1.740068812702714, + "learning_rate": 3.73612823674476e-05, + "loss": 0.557, + "step": 7890 + }, + { + "epoch": 0.7665437609159713, + "grad_norm": 1.7983844321630307, + "learning_rate": 3.734505808293855e-05, + "loss": 0.5748, + "step": 7900 + }, + { + "epoch": 0.7675140694740927, + "grad_norm": 1.5464273991421298, + "learning_rate": 3.732883379842949e-05, + "loss": 0.547, + "step": 7910 + }, + { + "epoch": 0.7684843780322143, + "grad_norm": 1.5357637162840818, + "learning_rate": 3.731260951392044e-05, + "loss": 0.5628, + "step": 7920 + }, + { + "epoch": 0.7694546865903358, + "grad_norm": 1.5933643507985389, + "learning_rate": 3.729638522941138e-05, + "loss": 0.6377, + "step": 7930 + }, + { + "epoch": 0.7704249951484572, + "grad_norm": 1.885102854313203, + "learning_rate": 3.728016094490233e-05, + "loss": 0.5542, + "step": 7940 + }, + { + "epoch": 0.7713953037065787, + "grad_norm": 1.2646336337440816, + "learning_rate": 3.7263936660393275e-05, + "loss": 0.5909, + "step": 7950 + }, + { + "epoch": 0.7723656122647001, + "grad_norm": 1.5489929838962764, + "learning_rate": 3.7247712375884225e-05, + "loss": 0.6417, + "step": 7960 + }, + { + "epoch": 0.7733359208228217, + "grad_norm": 1.5642843810424312, + "learning_rate": 3.7231488091375175e-05, + "loss": 0.5739, + "step": 7970 + }, + { + "epoch": 0.7743062293809432, + "grad_norm": 1.2994344306176584, + "learning_rate": 3.721526380686612e-05, + "loss": 0.6521, + "step": 7980 + }, + { + "epoch": 0.7752765379390646, + "grad_norm": 1.7547352757346097, + "learning_rate": 3.719903952235707e-05, + "loss": 0.6055, + "step": 7990 + }, + { + "epoch": 0.7762468464971861, + "grad_norm": 1.8680201530781706, + "learning_rate": 3.718281523784801e-05, + "loss": 0.5468, + "step": 8000 + }, + { + "epoch": 0.7762468464971861, + "eval_loss": 0.6728888750076294, + "eval_runtime": 2470.4642, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.363, + "step": 8000 + }, + { + "epoch": 0.7772171550553076, + "grad_norm": 1.8353606131921394, + "learning_rate": 3.716659095333896e-05, + "loss": 0.6019, + "step": 8010 + }, + { + "epoch": 0.778187463613429, + "grad_norm": 2.1165186332460624, + "learning_rate": 3.71503666688299e-05, + "loss": 0.6144, + "step": 8020 + }, + { + "epoch": 0.7791577721715506, + "grad_norm": 1.2983357938097837, + "learning_rate": 3.713414238432085e-05, + "loss": 0.5765, + "step": 8030 + }, + { + "epoch": 0.780128080729672, + "grad_norm": 1.6998652219779764, + "learning_rate": 3.7117918099811796e-05, + "loss": 0.5673, + "step": 8040 + }, + { + "epoch": 0.7810983892877935, + "grad_norm": 1.6651384645995448, + "learning_rate": 3.7101693815302746e-05, + "loss": 0.5813, + "step": 8050 + }, + { + "epoch": 0.782068697845915, + "grad_norm": 1.711156688730374, + "learning_rate": 3.7085469530793696e-05, + "loss": 0.5804, + "step": 8060 + }, + { + "epoch": 0.7830390064040365, + "grad_norm": 1.8429084688690953, + "learning_rate": 3.706924524628464e-05, + "loss": 0.6076, + "step": 8070 + }, + { + "epoch": 0.784009314962158, + "grad_norm": 2.029890621987729, + "learning_rate": 3.705302096177559e-05, + "loss": 0.6248, + "step": 8080 + }, + { + "epoch": 0.7849796235202795, + "grad_norm": 2.075042801349425, + "learning_rate": 3.703679667726653e-05, + "loss": 0.5546, + "step": 8090 + }, + { + "epoch": 0.7859499320784009, + "grad_norm": 1.9442644766970887, + "learning_rate": 3.702057239275748e-05, + "loss": 0.5754, + "step": 8100 + }, + { + "epoch": 0.7869202406365224, + "grad_norm": 2.2898169839511167, + "learning_rate": 3.7004348108248424e-05, + "loss": 0.5479, + "step": 8110 + }, + { + "epoch": 0.7878905491946439, + "grad_norm": 1.1971567816899984, + "learning_rate": 3.6988123823739374e-05, + "loss": 0.5841, + "step": 8120 + }, + { + "epoch": 0.7888608577527654, + "grad_norm": 1.6483060799493618, + "learning_rate": 3.697189953923032e-05, + "loss": 0.5993, + "step": 8130 + }, + { + "epoch": 0.7898311663108869, + "grad_norm": 2.1844480030037094, + "learning_rate": 3.695567525472127e-05, + "loss": 0.5639, + "step": 8140 + }, + { + "epoch": 0.7908014748690083, + "grad_norm": 1.9208684488820418, + "learning_rate": 3.6939450970212216e-05, + "loss": 0.5548, + "step": 8150 + }, + { + "epoch": 0.7917717834271298, + "grad_norm": 1.7298769058966599, + "learning_rate": 3.692322668570316e-05, + "loss": 0.6189, + "step": 8160 + }, + { + "epoch": 0.7927420919852514, + "grad_norm": 2.043188779971351, + "learning_rate": 3.690700240119411e-05, + "loss": 0.6341, + "step": 8170 + }, + { + "epoch": 0.7937124005433728, + "grad_norm": 1.4155988967270856, + "learning_rate": 3.689077811668505e-05, + "loss": 0.5803, + "step": 8180 + }, + { + "epoch": 0.7946827091014943, + "grad_norm": 1.876027618508082, + "learning_rate": 3.6874553832176e-05, + "loss": 0.5469, + "step": 8190 + }, + { + "epoch": 0.7956530176596157, + "grad_norm": 1.8526890916404788, + "learning_rate": 3.6858329547666945e-05, + "loss": 0.5692, + "step": 8200 + }, + { + "epoch": 0.7966233262177372, + "grad_norm": 1.454859578114591, + "learning_rate": 3.6842105263157895e-05, + "loss": 0.5861, + "step": 8210 + }, + { + "epoch": 0.7975936347758588, + "grad_norm": 1.6396970355230962, + "learning_rate": 3.6825880978648844e-05, + "loss": 0.5864, + "step": 8220 + }, + { + "epoch": 0.7985639433339802, + "grad_norm": 2.035219184211101, + "learning_rate": 3.680965669413979e-05, + "loss": 0.5761, + "step": 8230 + }, + { + "epoch": 0.7995342518921017, + "grad_norm": 1.4771264704118183, + "learning_rate": 3.679343240963074e-05, + "loss": 0.5901, + "step": 8240 + }, + { + "epoch": 0.8005045604502232, + "grad_norm": 1.366640822196709, + "learning_rate": 3.677720812512168e-05, + "loss": 0.5976, + "step": 8250 + }, + { + "epoch": 0.8014748690083446, + "grad_norm": 1.6158534552665804, + "learning_rate": 3.676098384061263e-05, + "loss": 0.5676, + "step": 8260 + }, + { + "epoch": 0.8024451775664662, + "grad_norm": 1.906704908176893, + "learning_rate": 3.674475955610357e-05, + "loss": 0.6014, + "step": 8270 + }, + { + "epoch": 0.8034154861245876, + "grad_norm": 1.72316238215741, + "learning_rate": 3.672853527159452e-05, + "loss": 0.5341, + "step": 8280 + }, + { + "epoch": 0.8043857946827091, + "grad_norm": 1.6228489307580705, + "learning_rate": 3.6712310987085466e-05, + "loss": 0.6174, + "step": 8290 + }, + { + "epoch": 0.8053561032408306, + "grad_norm": 1.6343652614101287, + "learning_rate": 3.6696086702576416e-05, + "loss": 0.5628, + "step": 8300 + }, + { + "epoch": 0.806326411798952, + "grad_norm": 1.4875484854044787, + "learning_rate": 3.6679862418067365e-05, + "loss": 0.5559, + "step": 8310 + }, + { + "epoch": 0.8072967203570736, + "grad_norm": 1.7360867259906014, + "learning_rate": 3.6663638133558315e-05, + "loss": 0.6122, + "step": 8320 + }, + { + "epoch": 0.8082670289151951, + "grad_norm": 1.3760924603009739, + "learning_rate": 3.664741384904926e-05, + "loss": 0.5369, + "step": 8330 + }, + { + "epoch": 0.8092373374733165, + "grad_norm": 2.1326347379272033, + "learning_rate": 3.663118956454021e-05, + "loss": 0.6026, + "step": 8340 + }, + { + "epoch": 0.810207646031438, + "grad_norm": 1.8445530020541556, + "learning_rate": 3.661496528003116e-05, + "loss": 0.5924, + "step": 8350 + }, + { + "epoch": 0.8111779545895594, + "grad_norm": 2.1620839328051153, + "learning_rate": 3.65987409955221e-05, + "loss": 0.5556, + "step": 8360 + }, + { + "epoch": 0.812148263147681, + "grad_norm": 1.91334025126679, + "learning_rate": 3.658251671101305e-05, + "loss": 0.5787, + "step": 8370 + }, + { + "epoch": 0.8131185717058025, + "grad_norm": 1.6197394739350461, + "learning_rate": 3.656629242650399e-05, + "loss": 0.5364, + "step": 8380 + }, + { + "epoch": 0.8140888802639239, + "grad_norm": 1.7496855520727714, + "learning_rate": 3.655006814199494e-05, + "loss": 0.604, + "step": 8390 + }, + { + "epoch": 0.8150591888220454, + "grad_norm": 1.9072833025355298, + "learning_rate": 3.6533843857485886e-05, + "loss": 0.6127, + "step": 8400 + }, + { + "epoch": 0.816029497380167, + "grad_norm": 1.693494475477864, + "learning_rate": 3.6517619572976836e-05, + "loss": 0.5708, + "step": 8410 + }, + { + "epoch": 0.8169998059382884, + "grad_norm": 1.694549481689151, + "learning_rate": 3.6501395288467786e-05, + "loss": 0.544, + "step": 8420 + }, + { + "epoch": 0.8179701144964099, + "grad_norm": 1.77015793724743, + "learning_rate": 3.648517100395873e-05, + "loss": 0.5973, + "step": 8430 + }, + { + "epoch": 0.8189404230545313, + "grad_norm": 1.738049084314835, + "learning_rate": 3.646894671944968e-05, + "loss": 0.5912, + "step": 8440 + }, + { + "epoch": 0.8199107316126528, + "grad_norm": 1.647267019109384, + "learning_rate": 3.645272243494062e-05, + "loss": 0.6346, + "step": 8450 + }, + { + "epoch": 0.8208810401707743, + "grad_norm": 1.5240403667548303, + "learning_rate": 3.643649815043157e-05, + "loss": 0.6057, + "step": 8460 + }, + { + "epoch": 0.8218513487288958, + "grad_norm": 1.9527577485511822, + "learning_rate": 3.6420273865922514e-05, + "loss": 0.5615, + "step": 8470 + }, + { + "epoch": 0.8228216572870173, + "grad_norm": 1.937531929878238, + "learning_rate": 3.6404049581413464e-05, + "loss": 0.5625, + "step": 8480 + }, + { + "epoch": 0.8237919658451388, + "grad_norm": 1.7329982779170732, + "learning_rate": 3.638782529690441e-05, + "loss": 0.613, + "step": 8490 + }, + { + "epoch": 0.8247622744032602, + "grad_norm": 1.880843027338221, + "learning_rate": 3.637160101239536e-05, + "loss": 0.5869, + "step": 8500 + }, + { + "epoch": 0.8247622744032602, + "eval_loss": 0.6701070070266724, + "eval_runtime": 2471.9751, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.362, + "step": 8500 + }, + { + "epoch": 0.8257325829613817, + "grad_norm": 1.2566468431839666, + "learning_rate": 3.6355376727886307e-05, + "loss": 0.5433, + "step": 8510 + }, + { + "epoch": 0.8267028915195032, + "grad_norm": 1.8546070309693423, + "learning_rate": 3.633915244337725e-05, + "loss": 0.5669, + "step": 8520 + }, + { + "epoch": 0.8276732000776247, + "grad_norm": 1.268249173975751, + "learning_rate": 3.63229281588682e-05, + "loss": 0.5895, + "step": 8530 + }, + { + "epoch": 0.8286435086357462, + "grad_norm": 1.7652424871651577, + "learning_rate": 3.630670387435914e-05, + "loss": 0.5597, + "step": 8540 + }, + { + "epoch": 0.8296138171938676, + "grad_norm": 1.5480894985375517, + "learning_rate": 3.629047958985009e-05, + "loss": 0.5486, + "step": 8550 + }, + { + "epoch": 0.8305841257519891, + "grad_norm": 1.750414852898824, + "learning_rate": 3.6274255305341035e-05, + "loss": 0.584, + "step": 8560 + }, + { + "epoch": 0.8315544343101107, + "grad_norm": 1.7102419104322222, + "learning_rate": 3.6258031020831985e-05, + "loss": 0.5668, + "step": 8570 + }, + { + "epoch": 0.8325247428682321, + "grad_norm": 1.5528462400057457, + "learning_rate": 3.624180673632293e-05, + "loss": 0.5896, + "step": 8580 + }, + { + "epoch": 0.8334950514263536, + "grad_norm": 1.4801001203063542, + "learning_rate": 3.622558245181388e-05, + "loss": 0.5997, + "step": 8590 + }, + { + "epoch": 0.834465359984475, + "grad_norm": 1.391432340520497, + "learning_rate": 3.620935816730483e-05, + "loss": 0.5795, + "step": 8600 + }, + { + "epoch": 0.8354356685425965, + "grad_norm": 1.4373875976848383, + "learning_rate": 3.619313388279577e-05, + "loss": 0.5769, + "step": 8610 + }, + { + "epoch": 0.8364059771007181, + "grad_norm": 1.8411413590399401, + "learning_rate": 3.617690959828672e-05, + "loss": 0.5626, + "step": 8620 + }, + { + "epoch": 0.8373762856588395, + "grad_norm": 1.6990663233303336, + "learning_rate": 3.616068531377766e-05, + "loss": 0.6057, + "step": 8630 + }, + { + "epoch": 0.838346594216961, + "grad_norm": 1.9737814861957261, + "learning_rate": 3.614446102926861e-05, + "loss": 0.564, + "step": 8640 + }, + { + "epoch": 0.8393169027750825, + "grad_norm": 1.6163853470752478, + "learning_rate": 3.6128236744759556e-05, + "loss": 0.5429, + "step": 8650 + }, + { + "epoch": 0.8402872113332039, + "grad_norm": 1.5911941575436375, + "learning_rate": 3.6112012460250506e-05, + "loss": 0.6203, + "step": 8660 + }, + { + "epoch": 0.8412575198913255, + "grad_norm": 2.101242238346334, + "learning_rate": 3.609578817574145e-05, + "loss": 0.5263, + "step": 8670 + }, + { + "epoch": 0.8422278284494469, + "grad_norm": 1.671876183792032, + "learning_rate": 3.60795638912324e-05, + "loss": 0.5699, + "step": 8680 + }, + { + "epoch": 0.8431981370075684, + "grad_norm": 1.7729527985320428, + "learning_rate": 3.606333960672335e-05, + "loss": 0.5525, + "step": 8690 + }, + { + "epoch": 0.8441684455656899, + "grad_norm": 1.7198189021299524, + "learning_rate": 3.604711532221429e-05, + "loss": 0.5789, + "step": 8700 + }, + { + "epoch": 0.8451387541238113, + "grad_norm": 1.754160746828024, + "learning_rate": 3.603089103770524e-05, + "loss": 0.5126, + "step": 8710 + }, + { + "epoch": 0.8461090626819329, + "grad_norm": 1.8621321453517432, + "learning_rate": 3.6014666753196184e-05, + "loss": 0.593, + "step": 8720 + }, + { + "epoch": 0.8470793712400544, + "grad_norm": 1.4506925576933114, + "learning_rate": 3.5998442468687134e-05, + "loss": 0.5489, + "step": 8730 + }, + { + "epoch": 0.8480496797981758, + "grad_norm": 1.9426709583278723, + "learning_rate": 3.598221818417808e-05, + "loss": 0.498, + "step": 8740 + }, + { + "epoch": 0.8490199883562973, + "grad_norm": 1.7346244341640757, + "learning_rate": 3.5965993899669026e-05, + "loss": 0.5583, + "step": 8750 + }, + { + "epoch": 0.8499902969144187, + "grad_norm": 1.926381813212122, + "learning_rate": 3.594976961515997e-05, + "loss": 0.5931, + "step": 8760 + }, + { + "epoch": 0.8509606054725403, + "grad_norm": 1.9797943448639521, + "learning_rate": 3.593354533065092e-05, + "loss": 0.5773, + "step": 8770 + }, + { + "epoch": 0.8519309140306618, + "grad_norm": 2.243092907709638, + "learning_rate": 3.591732104614187e-05, + "loss": 0.5862, + "step": 8780 + }, + { + "epoch": 0.8529012225887832, + "grad_norm": 1.6686507496640315, + "learning_rate": 3.590109676163281e-05, + "loss": 0.5861, + "step": 8790 + }, + { + "epoch": 0.8538715311469047, + "grad_norm": 1.767743857045935, + "learning_rate": 3.588487247712376e-05, + "loss": 0.6008, + "step": 8800 + }, + { + "epoch": 0.8548418397050263, + "grad_norm": 1.789840488197533, + "learning_rate": 3.5868648192614705e-05, + "loss": 0.5578, + "step": 8810 + }, + { + "epoch": 0.8558121482631477, + "grad_norm": 1.826043320699609, + "learning_rate": 3.5852423908105655e-05, + "loss": 0.5568, + "step": 8820 + }, + { + "epoch": 0.8567824568212692, + "grad_norm": 1.4847524529624125, + "learning_rate": 3.58361996235966e-05, + "loss": 0.5984, + "step": 8830 + }, + { + "epoch": 0.8577527653793906, + "grad_norm": 1.5302240137475795, + "learning_rate": 3.581997533908755e-05, + "loss": 0.5619, + "step": 8840 + }, + { + "epoch": 0.8587230739375121, + "grad_norm": 1.6025870419513641, + "learning_rate": 3.58037510545785e-05, + "loss": 0.601, + "step": 8850 + }, + { + "epoch": 0.8596933824956337, + "grad_norm": 1.8930896480563768, + "learning_rate": 3.578752677006944e-05, + "loss": 0.5935, + "step": 8860 + }, + { + "epoch": 0.8606636910537551, + "grad_norm": 1.6413896209556986, + "learning_rate": 3.577130248556039e-05, + "loss": 0.6019, + "step": 8870 + }, + { + "epoch": 0.8616339996118766, + "grad_norm": 1.6030981794189743, + "learning_rate": 3.575507820105133e-05, + "loss": 0.5761, + "step": 8880 + }, + { + "epoch": 0.8626043081699981, + "grad_norm": 1.4798866694040977, + "learning_rate": 3.573885391654228e-05, + "loss": 0.5561, + "step": 8890 + }, + { + "epoch": 0.8635746167281195, + "grad_norm": 1.5790323051766768, + "learning_rate": 3.5722629632033226e-05, + "loss": 0.5986, + "step": 8900 + }, + { + "epoch": 0.864544925286241, + "grad_norm": 1.8409121182605548, + "learning_rate": 3.5706405347524175e-05, + "loss": 0.5878, + "step": 8910 + }, + { + "epoch": 0.8655152338443625, + "grad_norm": 1.4956048954650922, + "learning_rate": 3.569018106301512e-05, + "loss": 0.5548, + "step": 8920 + }, + { + "epoch": 0.866485542402484, + "grad_norm": 1.6969221289264929, + "learning_rate": 3.567395677850607e-05, + "loss": 0.5937, + "step": 8930 + }, + { + "epoch": 0.8674558509606055, + "grad_norm": 1.8986285895813184, + "learning_rate": 3.565773249399702e-05, + "loss": 0.5742, + "step": 8940 + }, + { + "epoch": 0.8684261595187269, + "grad_norm": 1.5434045122281053, + "learning_rate": 3.564150820948796e-05, + "loss": 0.561, + "step": 8950 + }, + { + "epoch": 0.8693964680768484, + "grad_norm": 1.69523497453855, + "learning_rate": 3.562528392497891e-05, + "loss": 0.5386, + "step": 8960 + }, + { + "epoch": 0.87036677663497, + "grad_norm": 1.382817285428295, + "learning_rate": 3.5609059640469854e-05, + "loss": 0.555, + "step": 8970 + }, + { + "epoch": 0.8713370851930914, + "grad_norm": 1.7354010988203836, + "learning_rate": 3.5592835355960803e-05, + "loss": 0.5963, + "step": 8980 + }, + { + "epoch": 0.8723073937512129, + "grad_norm": 1.9761497522561469, + "learning_rate": 3.5576611071451746e-05, + "loss": 0.5859, + "step": 8990 + }, + { + "epoch": 0.8732777023093343, + "grad_norm": 1.6264062899083487, + "learning_rate": 3.5560386786942696e-05, + "loss": 0.5596, + "step": 9000 + }, + { + "epoch": 0.8732777023093343, + "eval_loss": 0.664995551109314, + "eval_runtime": 2472.6792, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.362, + "step": 9000 + }, + { + "epoch": 0.8742480108674558, + "grad_norm": 1.7886226308798359, + "learning_rate": 3.554416250243364e-05, + "loss": 0.5484, + "step": 9010 + }, + { + "epoch": 0.8752183194255774, + "grad_norm": 1.5768009291424698, + "learning_rate": 3.552793821792459e-05, + "loss": 0.5511, + "step": 9020 + }, + { + "epoch": 0.8761886279836988, + "grad_norm": 1.5632964481761753, + "learning_rate": 3.551171393341554e-05, + "loss": 0.5173, + "step": 9030 + }, + { + "epoch": 0.8771589365418203, + "grad_norm": 2.0563484372782264, + "learning_rate": 3.549548964890648e-05, + "loss": 0.5586, + "step": 9040 + }, + { + "epoch": 0.8781292450999418, + "grad_norm": 1.9786114879020535, + "learning_rate": 3.547926536439743e-05, + "loss": 0.5767, + "step": 9050 + }, + { + "epoch": 0.8790995536580632, + "grad_norm": 1.2143656544779613, + "learning_rate": 3.5463041079888375e-05, + "loss": 0.5469, + "step": 9060 + }, + { + "epoch": 0.8800698622161848, + "grad_norm": 1.696976506773579, + "learning_rate": 3.5446816795379324e-05, + "loss": 0.5819, + "step": 9070 + }, + { + "epoch": 0.8810401707743062, + "grad_norm": 1.8002454640393335, + "learning_rate": 3.543059251087027e-05, + "loss": 0.5947, + "step": 9080 + }, + { + "epoch": 0.8820104793324277, + "grad_norm": 1.826272101277873, + "learning_rate": 3.541436822636122e-05, + "loss": 0.632, + "step": 9090 + }, + { + "epoch": 0.8829807878905492, + "grad_norm": 1.8609691197855038, + "learning_rate": 3.539814394185217e-05, + "loss": 0.649, + "step": 9100 + }, + { + "epoch": 0.8839510964486706, + "grad_norm": 1.7165708438826977, + "learning_rate": 3.5381919657343117e-05, + "loss": 0.5746, + "step": 9110 + }, + { + "epoch": 0.8849214050067922, + "grad_norm": 2.0390486982174454, + "learning_rate": 3.536569537283406e-05, + "loss": 0.5834, + "step": 9120 + }, + { + "epoch": 0.8858917135649137, + "grad_norm": 1.3107579501174733, + "learning_rate": 3.534947108832501e-05, + "loss": 0.5605, + "step": 9130 + }, + { + "epoch": 0.8868620221230351, + "grad_norm": 1.7750714401302556, + "learning_rate": 3.533324680381596e-05, + "loss": 0.5778, + "step": 9140 + }, + { + "epoch": 0.8878323306811566, + "grad_norm": 1.9002696712889475, + "learning_rate": 3.53170225193069e-05, + "loss": 0.5928, + "step": 9150 + }, + { + "epoch": 0.888802639239278, + "grad_norm": 1.602034501274119, + "learning_rate": 3.530079823479785e-05, + "loss": 0.5941, + "step": 9160 + }, + { + "epoch": 0.8897729477973996, + "grad_norm": 1.5338224625276715, + "learning_rate": 3.5284573950288795e-05, + "loss": 0.5678, + "step": 9170 + }, + { + "epoch": 0.8907432563555211, + "grad_norm": 1.6656771631689804, + "learning_rate": 3.5268349665779745e-05, + "loss": 0.5621, + "step": 9180 + }, + { + "epoch": 0.8917135649136425, + "grad_norm": 1.6734086655118368, + "learning_rate": 3.525212538127069e-05, + "loss": 0.5836, + "step": 9190 + }, + { + "epoch": 0.892683873471764, + "grad_norm": 1.7321566684016627, + "learning_rate": 3.523590109676164e-05, + "loss": 0.6049, + "step": 9200 + }, + { + "epoch": 0.8936541820298856, + "grad_norm": 1.8468990646584322, + "learning_rate": 3.521967681225258e-05, + "loss": 0.5265, + "step": 9210 + }, + { + "epoch": 0.894624490588007, + "grad_norm": 1.8260102068484456, + "learning_rate": 3.520345252774353e-05, + "loss": 0.6003, + "step": 9220 + }, + { + "epoch": 0.8955947991461285, + "grad_norm": 1.5049364704966368, + "learning_rate": 3.518722824323448e-05, + "loss": 0.6362, + "step": 9230 + }, + { + "epoch": 0.8965651077042499, + "grad_norm": 1.8567290478944525, + "learning_rate": 3.517100395872542e-05, + "loss": 0.5666, + "step": 9240 + }, + { + "epoch": 0.8975354162623714, + "grad_norm": 1.83554173576104, + "learning_rate": 3.515477967421637e-05, + "loss": 0.525, + "step": 9250 + }, + { + "epoch": 0.898505724820493, + "grad_norm": 1.756580666977339, + "learning_rate": 3.5138555389707316e-05, + "loss": 0.5297, + "step": 9260 + }, + { + "epoch": 0.8994760333786144, + "grad_norm": 1.3709870685247603, + "learning_rate": 3.5122331105198265e-05, + "loss": 0.5529, + "step": 9270 + }, + { + "epoch": 0.9004463419367359, + "grad_norm": 1.910129116074302, + "learning_rate": 3.510610682068921e-05, + "loss": 0.5575, + "step": 9280 + }, + { + "epoch": 0.9014166504948574, + "grad_norm": 1.457636418218358, + "learning_rate": 3.508988253618016e-05, + "loss": 0.6219, + "step": 9290 + }, + { + "epoch": 0.9023869590529788, + "grad_norm": 1.7400018831329018, + "learning_rate": 3.50736582516711e-05, + "loss": 0.597, + "step": 9300 + }, + { + "epoch": 0.9033572676111004, + "grad_norm": 1.7711972448898297, + "learning_rate": 3.505743396716205e-05, + "loss": 0.5953, + "step": 9310 + }, + { + "epoch": 0.9043275761692218, + "grad_norm": 1.3127391028092956, + "learning_rate": 3.5041209682653e-05, + "loss": 0.5771, + "step": 9320 + }, + { + "epoch": 0.9052978847273433, + "grad_norm": 1.707083390377331, + "learning_rate": 3.5024985398143944e-05, + "loss": 0.5941, + "step": 9330 + }, + { + "epoch": 0.9062681932854648, + "grad_norm": 2.3395444029475425, + "learning_rate": 3.5008761113634894e-05, + "loss": 0.5249, + "step": 9340 + }, + { + "epoch": 0.9072385018435862, + "grad_norm": 2.15851320522422, + "learning_rate": 3.4992536829125837e-05, + "loss": 0.5754, + "step": 9350 + }, + { + "epoch": 0.9082088104017078, + "grad_norm": 1.566455099295683, + "learning_rate": 3.4976312544616786e-05, + "loss": 0.5982, + "step": 9360 + }, + { + "epoch": 0.9091791189598293, + "grad_norm": 1.6447262177778976, + "learning_rate": 3.496008826010773e-05, + "loss": 0.5819, + "step": 9370 + }, + { + "epoch": 0.9101494275179507, + "grad_norm": 1.9281752331049982, + "learning_rate": 3.494386397559868e-05, + "loss": 0.5789, + "step": 9380 + }, + { + "epoch": 0.9111197360760722, + "grad_norm": 1.9237214019216216, + "learning_rate": 3.492763969108962e-05, + "loss": 0.5773, + "step": 9390 + }, + { + "epoch": 0.9120900446341936, + "grad_norm": 1.5099620370471458, + "learning_rate": 3.491141540658057e-05, + "loss": 0.5954, + "step": 9400 + }, + { + "epoch": 0.9130603531923152, + "grad_norm": 1.861214942110368, + "learning_rate": 3.489519112207152e-05, + "loss": 0.5956, + "step": 9410 + }, + { + "epoch": 0.9140306617504367, + "grad_norm": 1.780788157697634, + "learning_rate": 3.4878966837562465e-05, + "loss": 0.5492, + "step": 9420 + }, + { + "epoch": 0.9150009703085581, + "grad_norm": 1.7391155816623414, + "learning_rate": 3.4862742553053414e-05, + "loss": 0.5984, + "step": 9430 + }, + { + "epoch": 0.9159712788666796, + "grad_norm": 2.1229665459752125, + "learning_rate": 3.484651826854436e-05, + "loss": 0.5924, + "step": 9440 + }, + { + "epoch": 0.9169415874248011, + "grad_norm": 2.1306613515387625, + "learning_rate": 3.483029398403531e-05, + "loss": 0.5148, + "step": 9450 + }, + { + "epoch": 0.9179118959829226, + "grad_norm": 1.312566076533743, + "learning_rate": 3.481406969952625e-05, + "loss": 0.5919, + "step": 9460 + }, + { + "epoch": 0.9188822045410441, + "grad_norm": 1.4886826509371758, + "learning_rate": 3.47978454150172e-05, + "loss": 0.5853, + "step": 9470 + }, + { + "epoch": 0.9198525130991655, + "grad_norm": 2.0143652338550098, + "learning_rate": 3.478162113050815e-05, + "loss": 0.5689, + "step": 9480 + }, + { + "epoch": 0.920822821657287, + "grad_norm": 1.1368414203141723, + "learning_rate": 3.476539684599909e-05, + "loss": 0.5546, + "step": 9490 + }, + { + "epoch": 0.9217931302154085, + "grad_norm": 1.6341362563091877, + "learning_rate": 3.474917256149004e-05, + "loss": 0.5481, + "step": 9500 + }, + { + "epoch": 0.9217931302154085, + "eval_loss": 0.6622401475906372, + "eval_runtime": 2474.1087, + "eval_samples_per_second": 0.724, + "eval_steps_per_second": 0.362, + "step": 9500 + }, + { + "epoch": 0.92276343877353, + "grad_norm": 1.8501035490125046, + "learning_rate": 3.4732948276980985e-05, + "loss": 0.519, + "step": 9510 + }, + { + "epoch": 0.9237337473316515, + "grad_norm": 1.788403250607821, + "learning_rate": 3.4716723992471935e-05, + "loss": 0.5934, + "step": 9520 + }, + { + "epoch": 0.924704055889773, + "grad_norm": 2.0089841746536212, + "learning_rate": 3.470049970796288e-05, + "loss": 0.5463, + "step": 9530 + }, + { + "epoch": 0.9256743644478944, + "grad_norm": 1.4797021959834935, + "learning_rate": 3.468427542345383e-05, + "loss": 0.538, + "step": 9540 + }, + { + "epoch": 0.9266446730060159, + "grad_norm": 1.5018994978955122, + "learning_rate": 3.466805113894477e-05, + "loss": 0.5859, + "step": 9550 + }, + { + "epoch": 0.9276149815641374, + "grad_norm": 1.6445575492561615, + "learning_rate": 3.465182685443572e-05, + "loss": 0.5335, + "step": 9560 + }, + { + "epoch": 0.9285852901222589, + "grad_norm": 1.910218513281919, + "learning_rate": 3.463560256992667e-05, + "loss": 0.5284, + "step": 9570 + }, + { + "epoch": 0.9295555986803804, + "grad_norm": 1.5061058308088753, + "learning_rate": 3.4619378285417614e-05, + "loss": 0.5332, + "step": 9580 + }, + { + "epoch": 0.9305259072385018, + "grad_norm": 1.5869569364806828, + "learning_rate": 3.460315400090856e-05, + "loss": 0.5591, + "step": 9590 + }, + { + "epoch": 0.9314962157966233, + "grad_norm": 1.709001551959916, + "learning_rate": 3.4586929716399506e-05, + "loss": 0.5499, + "step": 9600 + }, + { + "epoch": 0.9324665243547449, + "grad_norm": 1.5648665735772118, + "learning_rate": 3.4570705431890456e-05, + "loss": 0.5677, + "step": 9610 + }, + { + "epoch": 0.9334368329128663, + "grad_norm": 2.297106138114182, + "learning_rate": 3.45544811473814e-05, + "loss": 0.5687, + "step": 9620 + }, + { + "epoch": 0.9344071414709878, + "grad_norm": 1.940344115414216, + "learning_rate": 3.453825686287235e-05, + "loss": 0.5614, + "step": 9630 + }, + { + "epoch": 0.9353774500291092, + "grad_norm": 2.2931746226047336, + "learning_rate": 3.452203257836329e-05, + "loss": 0.5888, + "step": 9640 + }, + { + "epoch": 0.9363477585872307, + "grad_norm": 1.6726396768074983, + "learning_rate": 3.450580829385424e-05, + "loss": 0.555, + "step": 9650 + }, + { + "epoch": 0.9373180671453523, + "grad_norm": 1.5245738077800575, + "learning_rate": 3.448958400934519e-05, + "loss": 0.5464, + "step": 9660 + }, + { + "epoch": 0.9382883757034737, + "grad_norm": 1.3863585051832457, + "learning_rate": 3.4473359724836134e-05, + "loss": 0.5033, + "step": 9670 + }, + { + "epoch": 0.9392586842615952, + "grad_norm": 1.2261387961871664, + "learning_rate": 3.4457135440327084e-05, + "loss": 0.5229, + "step": 9680 + }, + { + "epoch": 0.9402289928197167, + "grad_norm": 1.8933274253957586, + "learning_rate": 3.444091115581803e-05, + "loss": 0.552, + "step": 9690 + }, + { + "epoch": 0.9411993013778381, + "grad_norm": 1.7235797326078635, + "learning_rate": 3.442468687130898e-05, + "loss": 0.5899, + "step": 9700 + }, + { + "epoch": 0.9421696099359597, + "grad_norm": 1.428965938918239, + "learning_rate": 3.440846258679992e-05, + "loss": 0.5733, + "step": 9710 + }, + { + "epoch": 0.9431399184940811, + "grad_norm": 1.4190853376920558, + "learning_rate": 3.439223830229087e-05, + "loss": 0.5762, + "step": 9720 + }, + { + "epoch": 0.9441102270522026, + "grad_norm": 1.569528489090731, + "learning_rate": 3.437601401778181e-05, + "loss": 0.5535, + "step": 9730 + }, + { + "epoch": 0.9450805356103241, + "grad_norm": 1.869896863596011, + "learning_rate": 3.435978973327276e-05, + "loss": 0.5651, + "step": 9740 + }, + { + "epoch": 0.9460508441684455, + "grad_norm": 2.3801435462427785, + "learning_rate": 3.434356544876371e-05, + "loss": 0.5366, + "step": 9750 + }, + { + "epoch": 0.9470211527265671, + "grad_norm": 1.7543924621581104, + "learning_rate": 3.4327341164254655e-05, + "loss": 0.5407, + "step": 9760 + }, + { + "epoch": 0.9479914612846886, + "grad_norm": 1.6645160945117223, + "learning_rate": 3.4311116879745605e-05, + "loss": 0.5776, + "step": 9770 + }, + { + "epoch": 0.94896176984281, + "grad_norm": 1.5226969850196896, + "learning_rate": 3.429489259523655e-05, + "loss": 0.5189, + "step": 9780 + }, + { + "epoch": 0.9499320784009315, + "grad_norm": 1.7957341969322531, + "learning_rate": 3.42786683107275e-05, + "loss": 0.5771, + "step": 9790 + }, + { + "epoch": 0.9509023869590529, + "grad_norm": 1.4922624955935235, + "learning_rate": 3.426244402621844e-05, + "loss": 0.5339, + "step": 9800 + }, + { + "epoch": 0.9518726955171745, + "grad_norm": 1.9179842292383775, + "learning_rate": 3.424621974170939e-05, + "loss": 0.5408, + "step": 9810 + }, + { + "epoch": 0.952843004075296, + "grad_norm": 1.8930907076270356, + "learning_rate": 3.4229995457200333e-05, + "loss": 0.5622, + "step": 9820 + }, + { + "epoch": 0.9538133126334174, + "grad_norm": 1.781854001943581, + "learning_rate": 3.421377117269128e-05, + "loss": 0.5654, + "step": 9830 + }, + { + "epoch": 0.9547836211915389, + "grad_norm": 1.9058249916201926, + "learning_rate": 3.419754688818223e-05, + "loss": 0.6089, + "step": 9840 + }, + { + "epoch": 0.9557539297496604, + "grad_norm": 1.5813398061915347, + "learning_rate": 3.4181322603673176e-05, + "loss": 0.5444, + "step": 9850 + }, + { + "epoch": 0.9567242383077819, + "grad_norm": 1.6480219585268394, + "learning_rate": 3.4165098319164126e-05, + "loss": 0.5215, + "step": 9860 + }, + { + "epoch": 0.9576945468659034, + "grad_norm": 1.6302754198626406, + "learning_rate": 3.414887403465507e-05, + "loss": 0.5225, + "step": 9870 + }, + { + "epoch": 0.9586648554240248, + "grad_norm": 1.9428020773502297, + "learning_rate": 3.4132649750146025e-05, + "loss": 0.5227, + "step": 9880 + }, + { + "epoch": 0.9596351639821463, + "grad_norm": 2.1394044994306376, + "learning_rate": 3.411642546563697e-05, + "loss": 0.5227, + "step": 9890 + }, + { + "epoch": 0.9606054725402678, + "grad_norm": 1.4958041091695313, + "learning_rate": 3.410020118112792e-05, + "loss": 0.5536, + "step": 9900 + }, + { + "epoch": 0.9615757810983893, + "grad_norm": 1.65783511931855, + "learning_rate": 3.408397689661886e-05, + "loss": 0.6116, + "step": 9910 + }, + { + "epoch": 0.9625460896565108, + "grad_norm": 1.2687727005359897, + "learning_rate": 3.406775261210981e-05, + "loss": 0.582, + "step": 9920 + }, + { + "epoch": 0.9635163982146323, + "grad_norm": 1.642309856430725, + "learning_rate": 3.4051528327600754e-05, + "loss": 0.5156, + "step": 9930 + }, + { + "epoch": 0.9644867067727537, + "grad_norm": 1.5540151182331825, + "learning_rate": 3.4035304043091704e-05, + "loss": 0.514, + "step": 9940 + }, + { + "epoch": 0.9654570153308752, + "grad_norm": 1.6334411063744383, + "learning_rate": 3.401907975858265e-05, + "loss": 0.585, + "step": 9950 + }, + { + "epoch": 0.9664273238889967, + "grad_norm": 1.5262322683208274, + "learning_rate": 3.4002855474073596e-05, + "loss": 0.5493, + "step": 9960 + }, + { + "epoch": 0.9673976324471182, + "grad_norm": 2.041216469634578, + "learning_rate": 3.3986631189564546e-05, + "loss": 0.5701, + "step": 9970 + }, + { + "epoch": 0.9683679410052397, + "grad_norm": 1.6826626866998198, + "learning_rate": 3.397040690505549e-05, + "loss": 0.5522, + "step": 9980 + }, + { + "epoch": 0.9693382495633611, + "grad_norm": 1.3784779820091337, + "learning_rate": 3.395418262054644e-05, + "loss": 0.5401, + "step": 9990 + }, + { + "epoch": 0.9703085581214826, + "grad_norm": 2.164538127382688, + "learning_rate": 3.393795833603738e-05, + "loss": 0.5493, + "step": 10000 + }, + { + "epoch": 0.9703085581214826, + "eval_loss": 0.6604536771774292, + "eval_runtime": 2471.8323, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.362, + "step": 10000 + }, + { + "epoch": 0.9712788666796042, + "grad_norm": 1.641886541364288, + "learning_rate": 3.392173405152833e-05, + "loss": 0.5592, + "step": 10010 + }, + { + "epoch": 0.9722491752377256, + "grad_norm": 1.676600577801569, + "learning_rate": 3.3905509767019275e-05, + "loss": 0.5675, + "step": 10020 + }, + { + "epoch": 0.9732194837958471, + "grad_norm": 1.575522175535914, + "learning_rate": 3.3889285482510224e-05, + "loss": 0.5564, + "step": 10030 + }, + { + "epoch": 0.9741897923539685, + "grad_norm": 1.4256611852011571, + "learning_rate": 3.3873061198001174e-05, + "loss": 0.5654, + "step": 10040 + }, + { + "epoch": 0.97516010091209, + "grad_norm": 1.9766555201121603, + "learning_rate": 3.385683691349212e-05, + "loss": 0.5853, + "step": 10050 + }, + { + "epoch": 0.9761304094702116, + "grad_norm": 1.4257493175991105, + "learning_rate": 3.384061262898307e-05, + "loss": 0.602, + "step": 10060 + }, + { + "epoch": 0.977100718028333, + "grad_norm": 1.7942732122646, + "learning_rate": 3.382438834447401e-05, + "loss": 0.5083, + "step": 10070 + }, + { + "epoch": 0.9780710265864545, + "grad_norm": 1.970290799031593, + "learning_rate": 3.380816405996496e-05, + "loss": 0.5353, + "step": 10080 + }, + { + "epoch": 0.979041335144576, + "grad_norm": 2.177029103778447, + "learning_rate": 3.37919397754559e-05, + "loss": 0.5364, + "step": 10090 + }, + { + "epoch": 0.9800116437026974, + "grad_norm": 1.6319237609838204, + "learning_rate": 3.377571549094685e-05, + "loss": 0.5498, + "step": 10100 + }, + { + "epoch": 0.980981952260819, + "grad_norm": 1.9418003047270103, + "learning_rate": 3.3759491206437795e-05, + "loss": 0.6225, + "step": 10110 + }, + { + "epoch": 0.9819522608189404, + "grad_norm": 1.5233382574211756, + "learning_rate": 3.3743266921928745e-05, + "loss": 0.5488, + "step": 10120 + }, + { + "epoch": 0.9829225693770619, + "grad_norm": 1.7034150218696569, + "learning_rate": 3.3727042637419695e-05, + "loss": 0.5912, + "step": 10130 + }, + { + "epoch": 0.9838928779351834, + "grad_norm": 1.9808892841805323, + "learning_rate": 3.371081835291064e-05, + "loss": 0.5657, + "step": 10140 + }, + { + "epoch": 0.9848631864933048, + "grad_norm": 1.9429830219605533, + "learning_rate": 3.369459406840159e-05, + "loss": 0.563, + "step": 10150 + }, + { + "epoch": 0.9858334950514264, + "grad_norm": 1.7823608925927663, + "learning_rate": 3.367836978389253e-05, + "loss": 0.5315, + "step": 10160 + }, + { + "epoch": 0.9868038036095479, + "grad_norm": 1.6317868773777158, + "learning_rate": 3.366214549938348e-05, + "loss": 0.5663, + "step": 10170 + }, + { + "epoch": 0.9877741121676693, + "grad_norm": 1.9950884655573202, + "learning_rate": 3.3645921214874424e-05, + "loss": 0.5704, + "step": 10180 + }, + { + "epoch": 0.9887444207257908, + "grad_norm": 1.6320762630806733, + "learning_rate": 3.362969693036537e-05, + "loss": 0.5746, + "step": 10190 + }, + { + "epoch": 0.9897147292839122, + "grad_norm": 1.685041060441873, + "learning_rate": 3.361347264585632e-05, + "loss": 0.5917, + "step": 10200 + }, + { + "epoch": 0.9906850378420338, + "grad_norm": 1.7530117501023248, + "learning_rate": 3.3597248361347266e-05, + "loss": 0.5365, + "step": 10210 + }, + { + "epoch": 0.9916553464001553, + "grad_norm": 1.9196791146989973, + "learning_rate": 3.3581024076838216e-05, + "loss": 0.5795, + "step": 10220 + }, + { + "epoch": 0.9926256549582767, + "grad_norm": 1.9870737501998446, + "learning_rate": 3.356479979232916e-05, + "loss": 0.6544, + "step": 10230 + }, + { + "epoch": 0.9935959635163982, + "grad_norm": 1.491102870770748, + "learning_rate": 3.354857550782011e-05, + "loss": 0.5083, + "step": 10240 + }, + { + "epoch": 0.9945662720745198, + "grad_norm": 1.5900809359608934, + "learning_rate": 3.353235122331105e-05, + "loss": 0.5693, + "step": 10250 + }, + { + "epoch": 0.9955365806326412, + "grad_norm": 1.6635095304395011, + "learning_rate": 3.3516126938802e-05, + "loss": 0.5744, + "step": 10260 + }, + { + "epoch": 0.9965068891907627, + "grad_norm": 2.0049230325912957, + "learning_rate": 3.3499902654292944e-05, + "loss": 0.5461, + "step": 10270 + }, + { + "epoch": 0.9974771977488841, + "grad_norm": 1.50147581851131, + "learning_rate": 3.3483678369783894e-05, + "loss": 0.5803, + "step": 10280 + }, + { + "epoch": 0.9984475063070056, + "grad_norm": 1.8064338359868768, + "learning_rate": 3.3467454085274844e-05, + "loss": 0.506, + "step": 10290 + }, + { + "epoch": 0.9994178148651272, + "grad_norm": 2.052231872791701, + "learning_rate": 3.345122980076579e-05, + "loss": 0.5752, + "step": 10300 + }, + { + "epoch": 1.0003881234232486, + "grad_norm": 1.7418958102493116, + "learning_rate": 3.343500551625674e-05, + "loss": 0.6082, + "step": 10310 + }, + { + "epoch": 1.00135843198137, + "grad_norm": 1.8975860607542987, + "learning_rate": 3.341878123174768e-05, + "loss": 0.5524, + "step": 10320 + }, + { + "epoch": 1.0023287405394916, + "grad_norm": 1.7076807320811012, + "learning_rate": 3.340255694723863e-05, + "loss": 0.6075, + "step": 10330 + }, + { + "epoch": 1.003299049097613, + "grad_norm": 1.4300451205657956, + "learning_rate": 3.338633266272957e-05, + "loss": 0.5326, + "step": 10340 + }, + { + "epoch": 1.0042693576557344, + "grad_norm": 2.0682797223020777, + "learning_rate": 3.337010837822052e-05, + "loss": 0.5484, + "step": 10350 + }, + { + "epoch": 1.005239666213856, + "grad_norm": 1.536692096590345, + "learning_rate": 3.3353884093711465e-05, + "loss": 0.546, + "step": 10360 + }, + { + "epoch": 1.0062099747719775, + "grad_norm": 1.7848861749442593, + "learning_rate": 3.3337659809202415e-05, + "loss": 0.5763, + "step": 10370 + }, + { + "epoch": 1.007180283330099, + "grad_norm": 1.4925347515246201, + "learning_rate": 3.3321435524693365e-05, + "loss": 0.5005, + "step": 10380 + }, + { + "epoch": 1.0081505918882205, + "grad_norm": 1.4245048085109102, + "learning_rate": 3.330521124018431e-05, + "loss": 0.5352, + "step": 10390 + }, + { + "epoch": 1.009120900446342, + "grad_norm": 1.6528542775713155, + "learning_rate": 3.328898695567526e-05, + "loss": 0.5794, + "step": 10400 + }, + { + "epoch": 1.0100912090044634, + "grad_norm": 1.7391134864648952, + "learning_rate": 3.32727626711662e-05, + "loss": 0.6133, + "step": 10410 + }, + { + "epoch": 1.011061517562585, + "grad_norm": 1.8040782879083466, + "learning_rate": 3.325653838665715e-05, + "loss": 0.5124, + "step": 10420 + }, + { + "epoch": 1.0120318261207064, + "grad_norm": 1.730132568689756, + "learning_rate": 3.324031410214809e-05, + "loss": 0.4803, + "step": 10430 + }, + { + "epoch": 1.0130021346788278, + "grad_norm": 1.8540575748734034, + "learning_rate": 3.322408981763904e-05, + "loss": 0.5188, + "step": 10440 + }, + { + "epoch": 1.0139724432369492, + "grad_norm": 1.7683659307789739, + "learning_rate": 3.3207865533129986e-05, + "loss": 0.5341, + "step": 10450 + }, + { + "epoch": 1.0149427517950709, + "grad_norm": 1.7745239019667731, + "learning_rate": 3.3191641248620936e-05, + "loss": 0.5858, + "step": 10460 + }, + { + "epoch": 1.0159130603531923, + "grad_norm": 1.5664467690196044, + "learning_rate": 3.3175416964111886e-05, + "loss": 0.6587, + "step": 10470 + }, + { + "epoch": 1.0168833689113137, + "grad_norm": 1.92833985335436, + "learning_rate": 3.315919267960283e-05, + "loss": 0.5532, + "step": 10480 + }, + { + "epoch": 1.0178536774694353, + "grad_norm": 1.646924763771934, + "learning_rate": 3.314296839509378e-05, + "loss": 0.5638, + "step": 10490 + }, + { + "epoch": 1.0188239860275567, + "grad_norm": 1.477844051399051, + "learning_rate": 3.312674411058472e-05, + "loss": 0.5254, + "step": 10500 + }, + { + "epoch": 1.0188239860275567, + "eval_loss": 0.6569487452507019, + "eval_runtime": 2472.3859, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.362, + "step": 10500 + }, + { + "epoch": 1.0197942945856782, + "grad_norm": 1.7732534556121307, + "learning_rate": 3.311051982607567e-05, + "loss": 0.5098, + "step": 10510 + }, + { + "epoch": 1.0207646031437998, + "grad_norm": 1.8592910812190746, + "learning_rate": 3.3094295541566614e-05, + "loss": 0.5836, + "step": 10520 + }, + { + "epoch": 1.0217349117019212, + "grad_norm": 1.7186074858012894, + "learning_rate": 3.3078071257057564e-05, + "loss": 0.5445, + "step": 10530 + }, + { + "epoch": 1.0227052202600426, + "grad_norm": 2.043653022917622, + "learning_rate": 3.306184697254851e-05, + "loss": 0.5627, + "step": 10540 + }, + { + "epoch": 1.0236755288181643, + "grad_norm": 1.7054473164163695, + "learning_rate": 3.304562268803946e-05, + "loss": 0.6426, + "step": 10550 + }, + { + "epoch": 1.0246458373762857, + "grad_norm": 1.5920397197878904, + "learning_rate": 3.3029398403530406e-05, + "loss": 0.5542, + "step": 10560 + }, + { + "epoch": 1.025616145934407, + "grad_norm": 1.6885876369635333, + "learning_rate": 3.301317411902135e-05, + "loss": 0.5399, + "step": 10570 + }, + { + "epoch": 1.0265864544925287, + "grad_norm": 1.712761359550596, + "learning_rate": 3.29969498345123e-05, + "loss": 0.5609, + "step": 10580 + }, + { + "epoch": 1.0275567630506501, + "grad_norm": 1.849118205679511, + "learning_rate": 3.298072555000324e-05, + "loss": 0.5817, + "step": 10590 + }, + { + "epoch": 1.0285270716087715, + "grad_norm": 1.7919320749353798, + "learning_rate": 3.296450126549419e-05, + "loss": 0.562, + "step": 10600 + }, + { + "epoch": 1.0294973801668932, + "grad_norm": 1.9867833035377922, + "learning_rate": 3.2948276980985135e-05, + "loss": 0.5266, + "step": 10610 + }, + { + "epoch": 1.0304676887250146, + "grad_norm": 1.4844422926334193, + "learning_rate": 3.2932052696476085e-05, + "loss": 0.5989, + "step": 10620 + }, + { + "epoch": 1.031437997283136, + "grad_norm": 1.902397349373601, + "learning_rate": 3.2915828411967034e-05, + "loss": 0.4869, + "step": 10630 + }, + { + "epoch": 1.0324083058412574, + "grad_norm": 1.595731562090853, + "learning_rate": 3.289960412745798e-05, + "loss": 0.5014, + "step": 10640 + }, + { + "epoch": 1.033378614399379, + "grad_norm": 1.641533553754366, + "learning_rate": 3.288337984294893e-05, + "loss": 0.5847, + "step": 10650 + }, + { + "epoch": 1.0343489229575005, + "grad_norm": 1.6793350674365874, + "learning_rate": 3.286715555843988e-05, + "loss": 0.5433, + "step": 10660 + }, + { + "epoch": 1.0353192315156219, + "grad_norm": 1.7886777094595252, + "learning_rate": 3.285093127393083e-05, + "loss": 0.5936, + "step": 10670 + }, + { + "epoch": 1.0362895400737435, + "grad_norm": 1.7628097025479255, + "learning_rate": 3.283470698942177e-05, + "loss": 0.5176, + "step": 10680 + }, + { + "epoch": 1.037259848631865, + "grad_norm": 1.7117894064748884, + "learning_rate": 3.281848270491272e-05, + "loss": 0.5166, + "step": 10690 + }, + { + "epoch": 1.0382301571899863, + "grad_norm": 1.8595449574787446, + "learning_rate": 3.280225842040366e-05, + "loss": 0.5511, + "step": 10700 + }, + { + "epoch": 1.039200465748108, + "grad_norm": 1.5833347089731324, + "learning_rate": 3.278603413589461e-05, + "loss": 0.5808, + "step": 10710 + }, + { + "epoch": 1.0401707743062294, + "grad_norm": 1.5168916043350162, + "learning_rate": 3.2769809851385555e-05, + "loss": 0.5295, + "step": 10720 + }, + { + "epoch": 1.0411410828643508, + "grad_norm": 1.9583788202686643, + "learning_rate": 3.2753585566876505e-05, + "loss": 0.5859, + "step": 10730 + }, + { + "epoch": 1.0421113914224724, + "grad_norm": 1.6490551146953607, + "learning_rate": 3.273736128236745e-05, + "loss": 0.5988, + "step": 10740 + }, + { + "epoch": 1.0430816999805939, + "grad_norm": 2.2945646534561734, + "learning_rate": 3.27211369978584e-05, + "loss": 0.5481, + "step": 10750 + }, + { + "epoch": 1.0440520085387153, + "grad_norm": 1.6823894633457166, + "learning_rate": 3.270491271334935e-05, + "loss": 0.566, + "step": 10760 + }, + { + "epoch": 1.0450223170968367, + "grad_norm": 1.9698263256075523, + "learning_rate": 3.268868842884029e-05, + "loss": 0.582, + "step": 10770 + }, + { + "epoch": 1.0459926256549583, + "grad_norm": 1.6140673075775909, + "learning_rate": 3.267246414433124e-05, + "loss": 0.4717, + "step": 10780 + }, + { + "epoch": 1.0469629342130797, + "grad_norm": 1.916378749987929, + "learning_rate": 3.265623985982218e-05, + "loss": 0.5244, + "step": 10790 + }, + { + "epoch": 1.0479332427712011, + "grad_norm": 1.9482061690949193, + "learning_rate": 3.264001557531313e-05, + "loss": 0.5973, + "step": 10800 + }, + { + "epoch": 1.0489035513293228, + "grad_norm": 2.142491022649256, + "learning_rate": 3.2623791290804076e-05, + "loss": 0.5317, + "step": 10810 + }, + { + "epoch": 1.0498738598874442, + "grad_norm": 1.706373160041174, + "learning_rate": 3.2607567006295026e-05, + "loss": 0.5136, + "step": 10820 + }, + { + "epoch": 1.0508441684455656, + "grad_norm": 1.8959475848770957, + "learning_rate": 3.2591342721785976e-05, + "loss": 0.5655, + "step": 10830 + }, + { + "epoch": 1.0518144770036872, + "grad_norm": 1.8921424824926663, + "learning_rate": 3.257511843727692e-05, + "loss": 0.5164, + "step": 10840 + }, + { + "epoch": 1.0527847855618087, + "grad_norm": 2.0234555341020664, + "learning_rate": 3.255889415276787e-05, + "loss": 0.5335, + "step": 10850 + }, + { + "epoch": 1.05375509411993, + "grad_norm": 2.0614821240519947, + "learning_rate": 3.254266986825881e-05, + "loss": 0.5022, + "step": 10860 + }, + { + "epoch": 1.0547254026780517, + "grad_norm": 1.7142183353166638, + "learning_rate": 3.252644558374976e-05, + "loss": 0.5452, + "step": 10870 + }, + { + "epoch": 1.0556957112361731, + "grad_norm": 1.8265079239939517, + "learning_rate": 3.2510221299240704e-05, + "loss": 0.5302, + "step": 10880 + }, + { + "epoch": 1.0566660197942945, + "grad_norm": 1.7953764101996608, + "learning_rate": 3.2493997014731654e-05, + "loss": 0.5418, + "step": 10890 + }, + { + "epoch": 1.0576363283524162, + "grad_norm": 1.8762589557600082, + "learning_rate": 3.24777727302226e-05, + "loss": 0.5743, + "step": 10900 + }, + { + "epoch": 1.0586066369105376, + "grad_norm": 1.4926319666858994, + "learning_rate": 3.246154844571355e-05, + "loss": 0.5291, + "step": 10910 + }, + { + "epoch": 1.059576945468659, + "grad_norm": 1.9278698099129787, + "learning_rate": 3.2445324161204497e-05, + "loss": 0.5409, + "step": 10920 + }, + { + "epoch": 1.0605472540267806, + "grad_norm": 1.8973044877439684, + "learning_rate": 3.242909987669544e-05, + "loss": 0.5466, + "step": 10930 + }, + { + "epoch": 1.061517562584902, + "grad_norm": 2.174563710599117, + "learning_rate": 3.241287559218639e-05, + "loss": 0.516, + "step": 10940 + }, + { + "epoch": 1.0624878711430235, + "grad_norm": 2.1559495316330786, + "learning_rate": 3.239665130767733e-05, + "loss": 0.5951, + "step": 10950 + }, + { + "epoch": 1.0634581797011449, + "grad_norm": 2.224958308598168, + "learning_rate": 3.238042702316828e-05, + "loss": 0.5216, + "step": 10960 + }, + { + "epoch": 1.0644284882592665, + "grad_norm": 1.5388616251824314, + "learning_rate": 3.2364202738659225e-05, + "loss": 0.551, + "step": 10970 + }, + { + "epoch": 1.065398796817388, + "grad_norm": 1.8563910144302744, + "learning_rate": 3.2347978454150175e-05, + "loss": 0.5325, + "step": 10980 + }, + { + "epoch": 1.0663691053755093, + "grad_norm": 1.249386464606806, + "learning_rate": 3.233175416964112e-05, + "loss": 0.5223, + "step": 10990 + }, + { + "epoch": 1.067339413933631, + "grad_norm": 1.7788595735717652, + "learning_rate": 3.231552988513207e-05, + "loss": 0.5353, + "step": 11000 + }, + { + "epoch": 1.067339413933631, + "eval_loss": 0.6541542410850525, + "eval_runtime": 2467.7272, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 11000 + }, + { + "epoch": 1.0683097224917524, + "grad_norm": 2.093397486341294, + "learning_rate": 3.229930560062302e-05, + "loss": 0.5373, + "step": 11010 + }, + { + "epoch": 1.0692800310498738, + "grad_norm": 1.6292327190910305, + "learning_rate": 3.228308131611396e-05, + "loss": 0.5417, + "step": 11020 + }, + { + "epoch": 1.0702503396079954, + "grad_norm": 1.6545192933610855, + "learning_rate": 3.226685703160491e-05, + "loss": 0.5508, + "step": 11030 + }, + { + "epoch": 1.0712206481661168, + "grad_norm": 1.8906889663679678, + "learning_rate": 3.225063274709585e-05, + "loss": 0.5268, + "step": 11040 + }, + { + "epoch": 1.0721909567242383, + "grad_norm": 1.392471317144995, + "learning_rate": 3.22344084625868e-05, + "loss": 0.5226, + "step": 11050 + }, + { + "epoch": 1.0731612652823599, + "grad_norm": 1.9171337047147172, + "learning_rate": 3.2218184178077746e-05, + "loss": 0.5212, + "step": 11060 + }, + { + "epoch": 1.0741315738404813, + "grad_norm": 1.4857874915171814, + "learning_rate": 3.2201959893568696e-05, + "loss": 0.5335, + "step": 11070 + }, + { + "epoch": 1.0751018823986027, + "grad_norm": 1.7538116790409093, + "learning_rate": 3.218573560905964e-05, + "loss": 0.542, + "step": 11080 + }, + { + "epoch": 1.0760721909567241, + "grad_norm": 1.6310285074576256, + "learning_rate": 3.216951132455059e-05, + "loss": 0.5114, + "step": 11090 + }, + { + "epoch": 1.0770424995148458, + "grad_norm": 1.6921457584921478, + "learning_rate": 3.215328704004154e-05, + "loss": 0.4924, + "step": 11100 + }, + { + "epoch": 1.0780128080729672, + "grad_norm": 1.7745242342830565, + "learning_rate": 3.213706275553248e-05, + "loss": 0.5567, + "step": 11110 + }, + { + "epoch": 1.0789831166310886, + "grad_norm": 1.69496703066604, + "learning_rate": 3.212083847102343e-05, + "loss": 0.5237, + "step": 11120 + }, + { + "epoch": 1.0799534251892102, + "grad_norm": 1.7029551999976154, + "learning_rate": 3.2104614186514374e-05, + "loss": 0.5023, + "step": 11130 + }, + { + "epoch": 1.0809237337473316, + "grad_norm": 1.71464384875071, + "learning_rate": 3.2088389902005324e-05, + "loss": 0.5111, + "step": 11140 + }, + { + "epoch": 1.081894042305453, + "grad_norm": 1.745797205638, + "learning_rate": 3.207216561749627e-05, + "loss": 0.5259, + "step": 11150 + }, + { + "epoch": 1.0828643508635747, + "grad_norm": 1.8756703389755518, + "learning_rate": 3.2055941332987216e-05, + "loss": 0.5135, + "step": 11160 + }, + { + "epoch": 1.083834659421696, + "grad_norm": 1.7773801380695633, + "learning_rate": 3.203971704847816e-05, + "loss": 0.606, + "step": 11170 + }, + { + "epoch": 1.0848049679798175, + "grad_norm": 1.9149914262933831, + "learning_rate": 3.202349276396911e-05, + "loss": 0.504, + "step": 11180 + }, + { + "epoch": 1.0857752765379391, + "grad_norm": 1.5792891192161234, + "learning_rate": 3.200726847946006e-05, + "loss": 0.4899, + "step": 11190 + }, + { + "epoch": 1.0867455850960606, + "grad_norm": 1.3849113367423667, + "learning_rate": 3.1991044194951e-05, + "loss": 0.5435, + "step": 11200 + }, + { + "epoch": 1.087715893654182, + "grad_norm": 1.836894182259094, + "learning_rate": 3.197481991044195e-05, + "loss": 0.5417, + "step": 11210 + }, + { + "epoch": 1.0886862022123036, + "grad_norm": 2.5716819476436723, + "learning_rate": 3.1958595625932895e-05, + "loss": 0.5487, + "step": 11220 + }, + { + "epoch": 1.089656510770425, + "grad_norm": 1.7054225586630418, + "learning_rate": 3.1942371341423845e-05, + "loss": 0.5366, + "step": 11230 + }, + { + "epoch": 1.0906268193285464, + "grad_norm": 1.9132021835433188, + "learning_rate": 3.192614705691479e-05, + "loss": 0.5212, + "step": 11240 + }, + { + "epoch": 1.091597127886668, + "grad_norm": 1.5687725973259348, + "learning_rate": 3.190992277240574e-05, + "loss": 0.5013, + "step": 11250 + }, + { + "epoch": 1.0925674364447895, + "grad_norm": 1.7910741705827617, + "learning_rate": 3.189369848789669e-05, + "loss": 0.5359, + "step": 11260 + }, + { + "epoch": 1.093537745002911, + "grad_norm": 1.4238221915445326, + "learning_rate": 3.187747420338763e-05, + "loss": 0.5729, + "step": 11270 + }, + { + "epoch": 1.0945080535610323, + "grad_norm": 1.8958035882900321, + "learning_rate": 3.186124991887858e-05, + "loss": 0.5349, + "step": 11280 + }, + { + "epoch": 1.095478362119154, + "grad_norm": 1.5644842460614365, + "learning_rate": 3.184502563436952e-05, + "loss": 0.5275, + "step": 11290 + }, + { + "epoch": 1.0964486706772754, + "grad_norm": 1.6966656078568068, + "learning_rate": 3.182880134986047e-05, + "loss": 0.5714, + "step": 11300 + }, + { + "epoch": 1.0974189792353968, + "grad_norm": 1.5845176878742038, + "learning_rate": 3.1812577065351416e-05, + "loss": 0.547, + "step": 11310 + }, + { + "epoch": 1.0983892877935184, + "grad_norm": 1.9638352416110092, + "learning_rate": 3.1796352780842365e-05, + "loss": 0.5371, + "step": 11320 + }, + { + "epoch": 1.0993595963516398, + "grad_norm": 1.3333867282862815, + "learning_rate": 3.178012849633331e-05, + "loss": 0.5558, + "step": 11330 + }, + { + "epoch": 1.1003299049097612, + "grad_norm": 2.028988375070847, + "learning_rate": 3.176390421182426e-05, + "loss": 0.5425, + "step": 11340 + }, + { + "epoch": 1.1013002134678829, + "grad_norm": 2.0233727263417745, + "learning_rate": 3.174767992731521e-05, + "loss": 0.5128, + "step": 11350 + }, + { + "epoch": 1.1022705220260043, + "grad_norm": 1.6415117808780524, + "learning_rate": 3.173145564280615e-05, + "loss": 0.476, + "step": 11360 + }, + { + "epoch": 1.1032408305841257, + "grad_norm": 2.0488869521619972, + "learning_rate": 3.17152313582971e-05, + "loss": 0.5221, + "step": 11370 + }, + { + "epoch": 1.1042111391422473, + "grad_norm": 2.11079167436828, + "learning_rate": 3.1699007073788044e-05, + "loss": 0.5566, + "step": 11380 + }, + { + "epoch": 1.1051814477003687, + "grad_norm": 2.0421176306313398, + "learning_rate": 3.1682782789278993e-05, + "loss": 0.5061, + "step": 11390 + }, + { + "epoch": 1.1061517562584902, + "grad_norm": 1.9992500086060474, + "learning_rate": 3.1666558504769936e-05, + "loss": 0.542, + "step": 11400 + }, + { + "epoch": 1.1071220648166116, + "grad_norm": 1.872752246092324, + "learning_rate": 3.1650334220260886e-05, + "loss": 0.5602, + "step": 11410 + }, + { + "epoch": 1.1080923733747332, + "grad_norm": 2.0723616338374646, + "learning_rate": 3.163410993575183e-05, + "loss": 0.5052, + "step": 11420 + }, + { + "epoch": 1.1090626819328546, + "grad_norm": 1.7543258598505558, + "learning_rate": 3.161788565124278e-05, + "loss": 0.5951, + "step": 11430 + }, + { + "epoch": 1.110032990490976, + "grad_norm": 1.741111503275317, + "learning_rate": 3.160166136673373e-05, + "loss": 0.5481, + "step": 11440 + }, + { + "epoch": 1.1110032990490977, + "grad_norm": 1.3801479018015765, + "learning_rate": 3.158543708222468e-05, + "loss": 0.5104, + "step": 11450 + }, + { + "epoch": 1.111973607607219, + "grad_norm": 1.9442170634773426, + "learning_rate": 3.156921279771563e-05, + "loss": 0.5129, + "step": 11460 + }, + { + "epoch": 1.1129439161653405, + "grad_norm": 1.9331832081031561, + "learning_rate": 3.155298851320657e-05, + "loss": 0.5139, + "step": 11470 + }, + { + "epoch": 1.1139142247234621, + "grad_norm": 1.5810959938815903, + "learning_rate": 3.153676422869752e-05, + "loss": 0.603, + "step": 11480 + }, + { + "epoch": 1.1148845332815835, + "grad_norm": 1.7478463817804297, + "learning_rate": 3.1520539944188464e-05, + "loss": 0.5573, + "step": 11490 + }, + { + "epoch": 1.115854841839705, + "grad_norm": 1.512993291404137, + "learning_rate": 3.1504315659679414e-05, + "loss": 0.4983, + "step": 11500 + }, + { + "epoch": 1.115854841839705, + "eval_loss": 0.6542506814002991, + "eval_runtime": 2471.8207, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.362, + "step": 11500 + }, + { + "epoch": 1.1168251503978266, + "grad_norm": 1.9794498366078679, + "learning_rate": 3.148809137517036e-05, + "loss": 0.5397, + "step": 11510 + }, + { + "epoch": 1.117795458955948, + "grad_norm": 1.6989190060539383, + "learning_rate": 3.1471867090661307e-05, + "loss": 0.4416, + "step": 11520 + }, + { + "epoch": 1.1187657675140694, + "grad_norm": 1.8417193394056512, + "learning_rate": 3.145564280615225e-05, + "loss": 0.5761, + "step": 11530 + }, + { + "epoch": 1.119736076072191, + "grad_norm": 1.8784314737872851, + "learning_rate": 3.14394185216432e-05, + "loss": 0.5179, + "step": 11540 + }, + { + "epoch": 1.1207063846303125, + "grad_norm": 1.724566892104882, + "learning_rate": 3.142319423713415e-05, + "loss": 0.5145, + "step": 11550 + }, + { + "epoch": 1.1216766931884339, + "grad_norm": 1.6271566863595974, + "learning_rate": 3.140696995262509e-05, + "loss": 0.5557, + "step": 11560 + }, + { + "epoch": 1.1226470017465555, + "grad_norm": 1.632474409823301, + "learning_rate": 3.139074566811604e-05, + "loss": 0.5351, + "step": 11570 + }, + { + "epoch": 1.123617310304677, + "grad_norm": 1.8885164133175332, + "learning_rate": 3.1374521383606985e-05, + "loss": 0.5066, + "step": 11580 + }, + { + "epoch": 1.1245876188627983, + "grad_norm": 1.86325603833634, + "learning_rate": 3.1358297099097935e-05, + "loss": 0.5561, + "step": 11590 + }, + { + "epoch": 1.1255579274209198, + "grad_norm": 1.8325010805036752, + "learning_rate": 3.134207281458888e-05, + "loss": 0.5518, + "step": 11600 + }, + { + "epoch": 1.1265282359790414, + "grad_norm": 1.7732624170282982, + "learning_rate": 3.132584853007983e-05, + "loss": 0.5264, + "step": 11610 + }, + { + "epoch": 1.1274985445371628, + "grad_norm": 1.6849511508626207, + "learning_rate": 3.130962424557077e-05, + "loss": 0.4938, + "step": 11620 + }, + { + "epoch": 1.1284688530952842, + "grad_norm": 1.7088287322634965, + "learning_rate": 3.129339996106172e-05, + "loss": 0.5432, + "step": 11630 + }, + { + "epoch": 1.1294391616534059, + "grad_norm": 2.4521787808904345, + "learning_rate": 3.127717567655267e-05, + "loss": 0.5222, + "step": 11640 + }, + { + "epoch": 1.1304094702115273, + "grad_norm": 1.8568933343609308, + "learning_rate": 3.126095139204361e-05, + "loss": 0.5155, + "step": 11650 + }, + { + "epoch": 1.1313797787696487, + "grad_norm": 1.6581222418410275, + "learning_rate": 3.124472710753456e-05, + "loss": 0.5039, + "step": 11660 + }, + { + "epoch": 1.1323500873277703, + "grad_norm": 1.8783913354540864, + "learning_rate": 3.1228502823025506e-05, + "loss": 0.5083, + "step": 11670 + }, + { + "epoch": 1.1333203958858917, + "grad_norm": 1.7156898303987873, + "learning_rate": 3.1212278538516455e-05, + "loss": 0.5184, + "step": 11680 + }, + { + "epoch": 1.1342907044440131, + "grad_norm": 2.1185799396739093, + "learning_rate": 3.11960542540074e-05, + "loss": 0.5252, + "step": 11690 + }, + { + "epoch": 1.1352610130021348, + "grad_norm": 1.2978327151961555, + "learning_rate": 3.117982996949835e-05, + "loss": 0.5548, + "step": 11700 + }, + { + "epoch": 1.1362313215602562, + "grad_norm": 1.6276998380365333, + "learning_rate": 3.116360568498929e-05, + "loss": 0.5412, + "step": 11710 + }, + { + "epoch": 1.1372016301183776, + "grad_norm": 1.5330498087853446, + "learning_rate": 3.114738140048024e-05, + "loss": 0.4811, + "step": 11720 + }, + { + "epoch": 1.138171938676499, + "grad_norm": 1.545547307525802, + "learning_rate": 3.113115711597119e-05, + "loss": 0.4841, + "step": 11730 + }, + { + "epoch": 1.1391422472346207, + "grad_norm": 1.8963148998956563, + "learning_rate": 3.1114932831462134e-05, + "loss": 0.5048, + "step": 11740 + }, + { + "epoch": 1.140112555792742, + "grad_norm": 1.927144240645965, + "learning_rate": 3.1098708546953084e-05, + "loss": 0.5437, + "step": 11750 + }, + { + "epoch": 1.1410828643508635, + "grad_norm": 1.322758693894623, + "learning_rate": 3.1082484262444027e-05, + "loss": 0.5824, + "step": 11760 + }, + { + "epoch": 1.1420531729089851, + "grad_norm": 1.9210557222184033, + "learning_rate": 3.1066259977934976e-05, + "loss": 0.5208, + "step": 11770 + }, + { + "epoch": 1.1430234814671065, + "grad_norm": 1.8904898905530507, + "learning_rate": 3.105003569342592e-05, + "loss": 0.5646, + "step": 11780 + }, + { + "epoch": 1.143993790025228, + "grad_norm": 2.2135650729717478, + "learning_rate": 3.103381140891687e-05, + "loss": 0.5823, + "step": 11790 + }, + { + "epoch": 1.1449640985833496, + "grad_norm": 1.8257267748894057, + "learning_rate": 3.101758712440781e-05, + "loss": 0.4957, + "step": 11800 + }, + { + "epoch": 1.145934407141471, + "grad_norm": 1.8399910987928954, + "learning_rate": 3.100136283989876e-05, + "loss": 0.5589, + "step": 11810 + }, + { + "epoch": 1.1469047156995924, + "grad_norm": 1.751016659591738, + "learning_rate": 3.098513855538971e-05, + "loss": 0.4636, + "step": 11820 + }, + { + "epoch": 1.147875024257714, + "grad_norm": 2.3784962245655743, + "learning_rate": 3.0968914270880655e-05, + "loss": 0.558, + "step": 11830 + }, + { + "epoch": 1.1488453328158355, + "grad_norm": 1.7453204083727232, + "learning_rate": 3.0952689986371604e-05, + "loss": 0.5152, + "step": 11840 + }, + { + "epoch": 1.1498156413739569, + "grad_norm": 1.6493432346857455, + "learning_rate": 3.093646570186255e-05, + "loss": 0.5396, + "step": 11850 + }, + { + "epoch": 1.1507859499320783, + "grad_norm": 1.4016634963337722, + "learning_rate": 3.09202414173535e-05, + "loss": 0.5392, + "step": 11860 + }, + { + "epoch": 1.1517562584902, + "grad_norm": 1.5144392427573337, + "learning_rate": 3.090401713284444e-05, + "loss": 0.5625, + "step": 11870 + }, + { + "epoch": 1.1527265670483213, + "grad_norm": 1.9911432006869803, + "learning_rate": 3.088779284833539e-05, + "loss": 0.5187, + "step": 11880 + }, + { + "epoch": 1.153696875606443, + "grad_norm": 1.989669700307941, + "learning_rate": 3.087156856382634e-05, + "loss": 0.516, + "step": 11890 + }, + { + "epoch": 1.1546671841645644, + "grad_norm": 2.0321346469591717, + "learning_rate": 3.085534427931728e-05, + "loss": 0.5028, + "step": 11900 + }, + { + "epoch": 1.1556374927226858, + "grad_norm": 1.6419270337849394, + "learning_rate": 3.083911999480823e-05, + "loss": 0.5114, + "step": 11910 + }, + { + "epoch": 1.1566078012808072, + "grad_norm": 1.8199861134594042, + "learning_rate": 3.0822895710299175e-05, + "loss": 0.4862, + "step": 11920 + }, + { + "epoch": 1.1575781098389288, + "grad_norm": 2.092599945357918, + "learning_rate": 3.0806671425790125e-05, + "loss": 0.4862, + "step": 11930 + }, + { + "epoch": 1.1585484183970503, + "grad_norm": 1.6390208725289623, + "learning_rate": 3.079044714128107e-05, + "loss": 0.5518, + "step": 11940 + }, + { + "epoch": 1.1595187269551717, + "grad_norm": 1.4035901760825538, + "learning_rate": 3.077422285677202e-05, + "loss": 0.5195, + "step": 11950 + }, + { + "epoch": 1.1604890355132933, + "grad_norm": 1.965204556535071, + "learning_rate": 3.075799857226296e-05, + "loss": 0.5198, + "step": 11960 + }, + { + "epoch": 1.1614593440714147, + "grad_norm": 1.797317897425162, + "learning_rate": 3.074177428775391e-05, + "loss": 0.5223, + "step": 11970 + }, + { + "epoch": 1.1624296526295361, + "grad_norm": 2.0562783101788713, + "learning_rate": 3.072555000324486e-05, + "loss": 0.5207, + "step": 11980 + }, + { + "epoch": 1.1633999611876578, + "grad_norm": 1.674272970925229, + "learning_rate": 3.0709325718735804e-05, + "loss": 0.5034, + "step": 11990 + }, + { + "epoch": 1.1643702697457792, + "grad_norm": 1.8613409955734859, + "learning_rate": 3.069310143422675e-05, + "loss": 0.4921, + "step": 12000 + }, + { + "epoch": 1.1643702697457792, + "eval_loss": 0.6506599187850952, + "eval_runtime": 2467.2044, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 12000 + }, + { + "epoch": 1.1653405783039006, + "grad_norm": 1.561512098916604, + "learning_rate": 3.0676877149717696e-05, + "loss": 0.5497, + "step": 12010 + }, + { + "epoch": 1.1663108868620222, + "grad_norm": 1.865230398957055, + "learning_rate": 3.0660652865208646e-05, + "loss": 0.4925, + "step": 12020 + }, + { + "epoch": 1.1672811954201436, + "grad_norm": 2.2512724716981434, + "learning_rate": 3.064442858069959e-05, + "loss": 0.5295, + "step": 12030 + }, + { + "epoch": 1.168251503978265, + "grad_norm": 1.8002247664216207, + "learning_rate": 3.062820429619054e-05, + "loss": 0.4996, + "step": 12040 + }, + { + "epoch": 1.1692218125363865, + "grad_norm": 2.1116351677440943, + "learning_rate": 3.061198001168148e-05, + "loss": 0.5191, + "step": 12050 + }, + { + "epoch": 1.170192121094508, + "grad_norm": 1.6725748372150215, + "learning_rate": 3.059575572717243e-05, + "loss": 0.5336, + "step": 12060 + }, + { + "epoch": 1.1711624296526295, + "grad_norm": 1.9025453675497637, + "learning_rate": 3.057953144266338e-05, + "loss": 0.5188, + "step": 12070 + }, + { + "epoch": 1.172132738210751, + "grad_norm": 1.8069530215074574, + "learning_rate": 3.0563307158154324e-05, + "loss": 0.5495, + "step": 12080 + }, + { + "epoch": 1.1731030467688726, + "grad_norm": 1.4756589453476456, + "learning_rate": 3.0547082873645274e-05, + "loss": 0.5248, + "step": 12090 + }, + { + "epoch": 1.174073355326994, + "grad_norm": 1.894556403114157, + "learning_rate": 3.053085858913622e-05, + "loss": 0.5604, + "step": 12100 + }, + { + "epoch": 1.1750436638851154, + "grad_norm": 1.7674666934401635, + "learning_rate": 3.0514634304627167e-05, + "loss": 0.5355, + "step": 12110 + }, + { + "epoch": 1.176013972443237, + "grad_norm": 1.6165024582050291, + "learning_rate": 3.0498410020118113e-05, + "loss": 0.4861, + "step": 12120 + }, + { + "epoch": 1.1769842810013584, + "grad_norm": 1.7586790770235476, + "learning_rate": 3.048218573560906e-05, + "loss": 0.5009, + "step": 12130 + }, + { + "epoch": 1.1779545895594798, + "grad_norm": 1.424893034597034, + "learning_rate": 3.0465961451100006e-05, + "loss": 0.5441, + "step": 12140 + }, + { + "epoch": 1.1789248981176015, + "grad_norm": 1.79883718769592, + "learning_rate": 3.0449737166590952e-05, + "loss": 0.5313, + "step": 12150 + }, + { + "epoch": 1.179895206675723, + "grad_norm": 1.2166009796658896, + "learning_rate": 3.04335128820819e-05, + "loss": 0.5771, + "step": 12160 + }, + { + "epoch": 1.1808655152338443, + "grad_norm": 1.8319881864090852, + "learning_rate": 3.0417288597572845e-05, + "loss": 0.5341, + "step": 12170 + }, + { + "epoch": 1.1818358237919657, + "grad_norm": 2.000590117325487, + "learning_rate": 3.040106431306379e-05, + "loss": 0.5434, + "step": 12180 + }, + { + "epoch": 1.1828061323500874, + "grad_norm": 1.7597266341378504, + "learning_rate": 3.038484002855474e-05, + "loss": 0.4637, + "step": 12190 + }, + { + "epoch": 1.1837764409082088, + "grad_norm": 1.7547988130290746, + "learning_rate": 3.0368615744045688e-05, + "loss": 0.5238, + "step": 12200 + }, + { + "epoch": 1.1847467494663304, + "grad_norm": 2.072598486347679, + "learning_rate": 3.0352391459536634e-05, + "loss": 0.5198, + "step": 12210 + }, + { + "epoch": 1.1857170580244518, + "grad_norm": 1.801682666637873, + "learning_rate": 3.033616717502758e-05, + "loss": 0.5549, + "step": 12220 + }, + { + "epoch": 1.1866873665825732, + "grad_norm": 1.7660188156759438, + "learning_rate": 3.0319942890518534e-05, + "loss": 0.5415, + "step": 12230 + }, + { + "epoch": 1.1876576751406946, + "grad_norm": 1.531355321026242, + "learning_rate": 3.030371860600948e-05, + "loss": 0.5125, + "step": 12240 + }, + { + "epoch": 1.1886279836988163, + "grad_norm": 1.6289177265084471, + "learning_rate": 3.0287494321500426e-05, + "loss": 0.5131, + "step": 12250 + }, + { + "epoch": 1.1895982922569377, + "grad_norm": 2.3228651312748423, + "learning_rate": 3.0271270036991373e-05, + "loss": 0.4859, + "step": 12260 + }, + { + "epoch": 1.190568600815059, + "grad_norm": 2.076037584027454, + "learning_rate": 3.025504575248232e-05, + "loss": 0.5234, + "step": 12270 + }, + { + "epoch": 1.1915389093731807, + "grad_norm": 1.4111596404765325, + "learning_rate": 3.0238821467973266e-05, + "loss": 0.5226, + "step": 12280 + }, + { + "epoch": 1.1925092179313022, + "grad_norm": 1.976122066748625, + "learning_rate": 3.0222597183464212e-05, + "loss": 0.5753, + "step": 12290 + }, + { + "epoch": 1.1934795264894236, + "grad_norm": 1.6905440972221555, + "learning_rate": 3.0206372898955158e-05, + "loss": 0.5559, + "step": 12300 + }, + { + "epoch": 1.1944498350475452, + "grad_norm": 1.549005200001293, + "learning_rate": 3.0190148614446108e-05, + "loss": 0.5199, + "step": 12310 + }, + { + "epoch": 1.1954201436056666, + "grad_norm": 1.7886858738633538, + "learning_rate": 3.0173924329937054e-05, + "loss": 0.5524, + "step": 12320 + }, + { + "epoch": 1.196390452163788, + "grad_norm": 2.226393124620313, + "learning_rate": 3.0157700045428e-05, + "loss": 0.511, + "step": 12330 + }, + { + "epoch": 1.1973607607219097, + "grad_norm": 1.84134054205334, + "learning_rate": 3.0141475760918947e-05, + "loss": 0.5537, + "step": 12340 + }, + { + "epoch": 1.198331069280031, + "grad_norm": 1.938676760265039, + "learning_rate": 3.0125251476409894e-05, + "loss": 0.5782, + "step": 12350 + }, + { + "epoch": 1.1993013778381525, + "grad_norm": 1.6671922582631973, + "learning_rate": 3.010902719190084e-05, + "loss": 0.5146, + "step": 12360 + }, + { + "epoch": 1.200271686396274, + "grad_norm": 2.09828561481085, + "learning_rate": 3.0092802907391786e-05, + "loss": 0.5292, + "step": 12370 + }, + { + "epoch": 1.2012419949543955, + "grad_norm": 1.782738185716262, + "learning_rate": 3.0076578622882733e-05, + "loss": 0.5797, + "step": 12380 + }, + { + "epoch": 1.202212303512517, + "grad_norm": 1.7386393975624728, + "learning_rate": 3.006035433837368e-05, + "loss": 0.5459, + "step": 12390 + }, + { + "epoch": 1.2031826120706384, + "grad_norm": 1.834619533516341, + "learning_rate": 3.004413005386463e-05, + "loss": 0.5711, + "step": 12400 + }, + { + "epoch": 1.20415292062876, + "grad_norm": 1.6555949649402653, + "learning_rate": 3.0027905769355575e-05, + "loss": 0.5447, + "step": 12410 + }, + { + "epoch": 1.2051232291868814, + "grad_norm": 1.997714330931175, + "learning_rate": 3.001168148484652e-05, + "loss": 0.5001, + "step": 12420 + }, + { + "epoch": 1.2060935377450028, + "grad_norm": 1.943566532939107, + "learning_rate": 2.9995457200337468e-05, + "loss": 0.5612, + "step": 12430 + }, + { + "epoch": 1.2070638463031245, + "grad_norm": 2.295630424579193, + "learning_rate": 2.9979232915828414e-05, + "loss": 0.5443, + "step": 12440 + }, + { + "epoch": 1.2080341548612459, + "grad_norm": 1.6087261613477206, + "learning_rate": 2.996300863131936e-05, + "loss": 0.4929, + "step": 12450 + }, + { + "epoch": 1.2090044634193673, + "grad_norm": 1.7507917723489945, + "learning_rate": 2.9946784346810307e-05, + "loss": 0.4915, + "step": 12460 + }, + { + "epoch": 1.209974771977489, + "grad_norm": 1.712263283374138, + "learning_rate": 2.9930560062301254e-05, + "loss": 0.5184, + "step": 12470 + }, + { + "epoch": 1.2109450805356103, + "grad_norm": 2.1091363117437427, + "learning_rate": 2.9914335777792203e-05, + "loss": 0.5123, + "step": 12480 + }, + { + "epoch": 1.2119153890937318, + "grad_norm": 2.404178772933857, + "learning_rate": 2.989811149328315e-05, + "loss": 0.4568, + "step": 12490 + }, + { + "epoch": 1.2128856976518532, + "grad_norm": 1.6898907246159178, + "learning_rate": 2.9881887208774096e-05, + "loss": 0.5114, + "step": 12500 + }, + { + "epoch": 1.2128856976518532, + "eval_loss": 0.6506454348564148, + "eval_runtime": 2464.983, + "eval_samples_per_second": 0.727, + "eval_steps_per_second": 0.363, + "step": 12500 + }, + { + "epoch": 1.2138560062099748, + "grad_norm": 1.5632796322726878, + "learning_rate": 2.9865662924265042e-05, + "loss": 0.5381, + "step": 12510 + }, + { + "epoch": 1.2148263147680962, + "grad_norm": 2.2060623757238482, + "learning_rate": 2.984943863975599e-05, + "loss": 0.6286, + "step": 12520 + }, + { + "epoch": 1.2157966233262179, + "grad_norm": 1.6896138867780373, + "learning_rate": 2.9833214355246935e-05, + "loss": 0.4824, + "step": 12530 + }, + { + "epoch": 1.2167669318843393, + "grad_norm": 1.6264014630619223, + "learning_rate": 2.981699007073788e-05, + "loss": 0.5231, + "step": 12540 + }, + { + "epoch": 1.2177372404424607, + "grad_norm": 2.288555955501704, + "learning_rate": 2.9800765786228828e-05, + "loss": 0.5415, + "step": 12550 + }, + { + "epoch": 1.218707549000582, + "grad_norm": 1.6328806432164462, + "learning_rate": 2.9784541501719774e-05, + "loss": 0.5282, + "step": 12560 + }, + { + "epoch": 1.2196778575587037, + "grad_norm": 1.9940506922760688, + "learning_rate": 2.9768317217210724e-05, + "loss": 0.5283, + "step": 12570 + }, + { + "epoch": 1.2206481661168251, + "grad_norm": 1.9369189438911159, + "learning_rate": 2.975209293270167e-05, + "loss": 0.4722, + "step": 12580 + }, + { + "epoch": 1.2216184746749466, + "grad_norm": 1.150006209597975, + "learning_rate": 2.9735868648192617e-05, + "loss": 0.5184, + "step": 12590 + }, + { + "epoch": 1.2225887832330682, + "grad_norm": 1.7210167191672803, + "learning_rate": 2.9719644363683563e-05, + "loss": 0.5031, + "step": 12600 + }, + { + "epoch": 1.2235590917911896, + "grad_norm": 1.9388550988757736, + "learning_rate": 2.970342007917451e-05, + "loss": 0.5099, + "step": 12610 + }, + { + "epoch": 1.224529400349311, + "grad_norm": 1.6524197083323393, + "learning_rate": 2.9687195794665456e-05, + "loss": 0.4447, + "step": 12620 + }, + { + "epoch": 1.2254997089074326, + "grad_norm": 1.89736479001966, + "learning_rate": 2.9670971510156402e-05, + "loss": 0.4983, + "step": 12630 + }, + { + "epoch": 1.226470017465554, + "grad_norm": 1.895097527141105, + "learning_rate": 2.965474722564735e-05, + "loss": 0.5039, + "step": 12640 + }, + { + "epoch": 1.2274403260236755, + "grad_norm": 1.976305435076919, + "learning_rate": 2.96385229411383e-05, + "loss": 0.5337, + "step": 12650 + }, + { + "epoch": 1.2284106345817971, + "grad_norm": 1.6796293491451193, + "learning_rate": 2.9622298656629245e-05, + "loss": 0.506, + "step": 12660 + }, + { + "epoch": 1.2293809431399185, + "grad_norm": 1.6431597372306554, + "learning_rate": 2.960607437212019e-05, + "loss": 0.5797, + "step": 12670 + }, + { + "epoch": 1.23035125169804, + "grad_norm": 1.8020203988119472, + "learning_rate": 2.9589850087611138e-05, + "loss": 0.4957, + "step": 12680 + }, + { + "epoch": 1.2313215602561614, + "grad_norm": 2.0575623836232935, + "learning_rate": 2.9573625803102084e-05, + "loss": 0.5125, + "step": 12690 + }, + { + "epoch": 1.232291868814283, + "grad_norm": 2.0440836316269544, + "learning_rate": 2.955740151859303e-05, + "loss": 0.5186, + "step": 12700 + }, + { + "epoch": 1.2332621773724044, + "grad_norm": 1.6684531467277435, + "learning_rate": 2.9541177234083977e-05, + "loss": 0.5348, + "step": 12710 + }, + { + "epoch": 1.2342324859305258, + "grad_norm": 1.6949248831996988, + "learning_rate": 2.9524952949574923e-05, + "loss": 0.5484, + "step": 12720 + }, + { + "epoch": 1.2352027944886474, + "grad_norm": 1.773704436016878, + "learning_rate": 2.950872866506587e-05, + "loss": 0.5033, + "step": 12730 + }, + { + "epoch": 1.2361731030467689, + "grad_norm": 1.9246733654165475, + "learning_rate": 2.949250438055682e-05, + "loss": 0.5164, + "step": 12740 + }, + { + "epoch": 1.2371434116048903, + "grad_norm": 1.7869787657786207, + "learning_rate": 2.9476280096047766e-05, + "loss": 0.549, + "step": 12750 + }, + { + "epoch": 1.238113720163012, + "grad_norm": 1.5427226807712424, + "learning_rate": 2.9460055811538712e-05, + "loss": 0.5453, + "step": 12760 + }, + { + "epoch": 1.2390840287211333, + "grad_norm": 2.1243893484204706, + "learning_rate": 2.944383152702966e-05, + "loss": 0.5232, + "step": 12770 + }, + { + "epoch": 1.2400543372792547, + "grad_norm": 1.7624693076719502, + "learning_rate": 2.9427607242520605e-05, + "loss": 0.4767, + "step": 12780 + }, + { + "epoch": 1.2410246458373764, + "grad_norm": 1.8048923369800416, + "learning_rate": 2.941138295801155e-05, + "loss": 0.5786, + "step": 12790 + }, + { + "epoch": 1.2419949543954978, + "grad_norm": 1.557577350338282, + "learning_rate": 2.9395158673502498e-05, + "loss": 0.5358, + "step": 12800 + }, + { + "epoch": 1.2429652629536192, + "grad_norm": 1.7796545697030264, + "learning_rate": 2.9378934388993444e-05, + "loss": 0.5447, + "step": 12810 + }, + { + "epoch": 1.2439355715117406, + "grad_norm": 1.5233346835390529, + "learning_rate": 2.936271010448439e-05, + "loss": 0.5434, + "step": 12820 + }, + { + "epoch": 1.2449058800698622, + "grad_norm": 1.8281296123454516, + "learning_rate": 2.934648581997534e-05, + "loss": 0.5311, + "step": 12830 + }, + { + "epoch": 1.2458761886279837, + "grad_norm": 1.8761299295652716, + "learning_rate": 2.9330261535466287e-05, + "loss": 0.561, + "step": 12840 + }, + { + "epoch": 1.2468464971861053, + "grad_norm": 1.571060091128229, + "learning_rate": 2.9314037250957233e-05, + "loss": 0.5293, + "step": 12850 + }, + { + "epoch": 1.2478168057442267, + "grad_norm": 1.3554175730214915, + "learning_rate": 2.929781296644818e-05, + "loss": 0.5259, + "step": 12860 + }, + { + "epoch": 1.2487871143023481, + "grad_norm": 1.5048611859450334, + "learning_rate": 2.9281588681939126e-05, + "loss": 0.5526, + "step": 12870 + }, + { + "epoch": 1.2497574228604695, + "grad_norm": 1.944003477785508, + "learning_rate": 2.9265364397430072e-05, + "loss": 0.5319, + "step": 12880 + }, + { + "epoch": 1.2507277314185912, + "grad_norm": 1.6137183858737645, + "learning_rate": 2.924914011292102e-05, + "loss": 0.5219, + "step": 12890 + }, + { + "epoch": 1.2516980399767126, + "grad_norm": 1.8792239391042322, + "learning_rate": 2.9232915828411965e-05, + "loss": 0.4996, + "step": 12900 + }, + { + "epoch": 1.252668348534834, + "grad_norm": 1.6048948758745463, + "learning_rate": 2.9216691543902915e-05, + "loss": 0.5094, + "step": 12910 + }, + { + "epoch": 1.2536386570929556, + "grad_norm": 2.06874670039208, + "learning_rate": 2.920046725939386e-05, + "loss": 0.5412, + "step": 12920 + }, + { + "epoch": 1.254608965651077, + "grad_norm": 1.8299215479076065, + "learning_rate": 2.9184242974884808e-05, + "loss": 0.5215, + "step": 12930 + }, + { + "epoch": 1.2555792742091985, + "grad_norm": 1.768175531798484, + "learning_rate": 2.9168018690375754e-05, + "loss": 0.5466, + "step": 12940 + }, + { + "epoch": 1.2565495827673199, + "grad_norm": 2.19878862821484, + "learning_rate": 2.91517944058667e-05, + "loss": 0.5728, + "step": 12950 + }, + { + "epoch": 1.2575198913254415, + "grad_norm": 1.5410059750444967, + "learning_rate": 2.9135570121357647e-05, + "loss": 0.5248, + "step": 12960 + }, + { + "epoch": 1.258490199883563, + "grad_norm": 1.9319057652262193, + "learning_rate": 2.9119345836848593e-05, + "loss": 0.4906, + "step": 12970 + }, + { + "epoch": 1.2594605084416846, + "grad_norm": 1.4285564772766584, + "learning_rate": 2.910312155233954e-05, + "loss": 0.4976, + "step": 12980 + }, + { + "epoch": 1.260430816999806, + "grad_norm": 1.9098028563216323, + "learning_rate": 2.9086897267830486e-05, + "loss": 0.4841, + "step": 12990 + }, + { + "epoch": 1.2614011255579274, + "grad_norm": 2.155481185125366, + "learning_rate": 2.9070672983321436e-05, + "loss": 0.5486, + "step": 13000 + }, + { + "epoch": 1.2614011255579274, + "eval_loss": 0.6476565003395081, + "eval_runtime": 2469.4767, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 13000 + }, + { + "epoch": 1.2623714341160488, + "grad_norm": 1.7583302051951548, + "learning_rate": 2.9054448698812385e-05, + "loss": 0.5287, + "step": 13010 + }, + { + "epoch": 1.2633417426741704, + "grad_norm": 2.1364231232324777, + "learning_rate": 2.9038224414303332e-05, + "loss": 0.5338, + "step": 13020 + }, + { + "epoch": 1.2643120512322918, + "grad_norm": 1.858948662905276, + "learning_rate": 2.902200012979428e-05, + "loss": 0.5371, + "step": 13030 + }, + { + "epoch": 1.2652823597904135, + "grad_norm": 1.434526008901497, + "learning_rate": 2.9005775845285228e-05, + "loss": 0.5199, + "step": 13040 + }, + { + "epoch": 1.266252668348535, + "grad_norm": 1.8326954509581932, + "learning_rate": 2.8989551560776174e-05, + "loss": 0.4626, + "step": 13050 + }, + { + "epoch": 1.2672229769066563, + "grad_norm": 1.929734623290484, + "learning_rate": 2.897332727626712e-05, + "loss": 0.5095, + "step": 13060 + }, + { + "epoch": 1.2681932854647777, + "grad_norm": 1.7342388298669522, + "learning_rate": 2.8957102991758067e-05, + "loss": 0.5334, + "step": 13070 + }, + { + "epoch": 1.2691635940228994, + "grad_norm": 1.8419675684147427, + "learning_rate": 2.8940878707249013e-05, + "loss": 0.5215, + "step": 13080 + }, + { + "epoch": 1.2701339025810208, + "grad_norm": 2.0003719776284434, + "learning_rate": 2.892465442273996e-05, + "loss": 0.5291, + "step": 13090 + }, + { + "epoch": 1.2711042111391422, + "grad_norm": 1.5672070400575155, + "learning_rate": 2.8908430138230906e-05, + "loss": 0.5302, + "step": 13100 + }, + { + "epoch": 1.2720745196972638, + "grad_norm": 1.6329476161915306, + "learning_rate": 2.8892205853721856e-05, + "loss": 0.5004, + "step": 13110 + }, + { + "epoch": 1.2730448282553852, + "grad_norm": 1.3430643020518607, + "learning_rate": 2.8875981569212802e-05, + "loss": 0.5586, + "step": 13120 + }, + { + "epoch": 1.2740151368135066, + "grad_norm": 1.865116300464639, + "learning_rate": 2.885975728470375e-05, + "loss": 0.5176, + "step": 13130 + }, + { + "epoch": 1.274985445371628, + "grad_norm": 1.9537429368278099, + "learning_rate": 2.8843533000194695e-05, + "loss": 0.5167, + "step": 13140 + }, + { + "epoch": 1.2759557539297497, + "grad_norm": 2.059620522389882, + "learning_rate": 2.882730871568564e-05, + "loss": 0.4654, + "step": 13150 + }, + { + "epoch": 1.276926062487871, + "grad_norm": 2.1139159674960317, + "learning_rate": 2.8811084431176588e-05, + "loss": 0.5365, + "step": 13160 + }, + { + "epoch": 1.2778963710459927, + "grad_norm": 1.9675564766085183, + "learning_rate": 2.8794860146667534e-05, + "loss": 0.5567, + "step": 13170 + }, + { + "epoch": 1.2788666796041142, + "grad_norm": 1.4714857260607748, + "learning_rate": 2.877863586215848e-05, + "loss": 0.5543, + "step": 13180 + }, + { + "epoch": 1.2798369881622356, + "grad_norm": 1.997029571976202, + "learning_rate": 2.8762411577649427e-05, + "loss": 0.5349, + "step": 13190 + }, + { + "epoch": 1.280807296720357, + "grad_norm": 1.7739211918572755, + "learning_rate": 2.8746187293140377e-05, + "loss": 0.4893, + "step": 13200 + }, + { + "epoch": 1.2817776052784786, + "grad_norm": 1.7069871948143287, + "learning_rate": 2.8729963008631323e-05, + "loss": 0.4907, + "step": 13210 + }, + { + "epoch": 1.2827479138366, + "grad_norm": 1.9981954512910625, + "learning_rate": 2.871373872412227e-05, + "loss": 0.5672, + "step": 13220 + }, + { + "epoch": 1.2837182223947214, + "grad_norm": 1.6613620495892372, + "learning_rate": 2.8697514439613216e-05, + "loss": 0.541, + "step": 13230 + }, + { + "epoch": 1.284688530952843, + "grad_norm": 2.3272590952107675, + "learning_rate": 2.8681290155104162e-05, + "loss": 0.5411, + "step": 13240 + }, + { + "epoch": 1.2856588395109645, + "grad_norm": 2.12223916886103, + "learning_rate": 2.866506587059511e-05, + "loss": 0.4957, + "step": 13250 + }, + { + "epoch": 1.286629148069086, + "grad_norm": 1.4853001948712596, + "learning_rate": 2.8648841586086055e-05, + "loss": 0.5432, + "step": 13260 + }, + { + "epoch": 1.2875994566272073, + "grad_norm": 2.095035318310476, + "learning_rate": 2.8632617301577e-05, + "loss": 0.5466, + "step": 13270 + }, + { + "epoch": 1.288569765185329, + "grad_norm": 1.8562546228217478, + "learning_rate": 2.8616393017067948e-05, + "loss": 0.5393, + "step": 13280 + }, + { + "epoch": 1.2895400737434504, + "grad_norm": 1.960721539695441, + "learning_rate": 2.8600168732558898e-05, + "loss": 0.5737, + "step": 13290 + }, + { + "epoch": 1.290510382301572, + "grad_norm": 1.8998278241439668, + "learning_rate": 2.8583944448049844e-05, + "loss": 0.4965, + "step": 13300 + }, + { + "epoch": 1.2914806908596934, + "grad_norm": 1.648594607244266, + "learning_rate": 2.856772016354079e-05, + "loss": 0.5691, + "step": 13310 + }, + { + "epoch": 1.2924509994178148, + "grad_norm": 1.65007773003003, + "learning_rate": 2.8551495879031737e-05, + "loss": 0.5, + "step": 13320 + }, + { + "epoch": 1.2934213079759362, + "grad_norm": 2.0573651261935293, + "learning_rate": 2.8535271594522683e-05, + "loss": 0.4926, + "step": 13330 + }, + { + "epoch": 1.2943916165340579, + "grad_norm": 2.328992496419527, + "learning_rate": 2.851904731001363e-05, + "loss": 0.5221, + "step": 13340 + }, + { + "epoch": 1.2953619250921793, + "grad_norm": 1.8509833991470588, + "learning_rate": 2.8502823025504576e-05, + "loss": 0.5018, + "step": 13350 + }, + { + "epoch": 1.296332233650301, + "grad_norm": 1.895399492110669, + "learning_rate": 2.8486598740995522e-05, + "loss": 0.4724, + "step": 13360 + }, + { + "epoch": 1.2973025422084223, + "grad_norm": 1.667924410333677, + "learning_rate": 2.8470374456486472e-05, + "loss": 0.5165, + "step": 13370 + }, + { + "epoch": 1.2982728507665438, + "grad_norm": 2.1618438681975722, + "learning_rate": 2.845415017197742e-05, + "loss": 0.5208, + "step": 13380 + }, + { + "epoch": 1.2992431593246652, + "grad_norm": 2.0324515358778434, + "learning_rate": 2.8437925887468365e-05, + "loss": 0.4944, + "step": 13390 + }, + { + "epoch": 1.3002134678827868, + "grad_norm": 1.4374011148146266, + "learning_rate": 2.842170160295931e-05, + "loss": 0.556, + "step": 13400 + }, + { + "epoch": 1.3011837764409082, + "grad_norm": 2.0625833924885075, + "learning_rate": 2.8405477318450258e-05, + "loss": 0.533, + "step": 13410 + }, + { + "epoch": 1.3021540849990296, + "grad_norm": 1.7327381778160986, + "learning_rate": 2.8389253033941204e-05, + "loss": 0.5428, + "step": 13420 + }, + { + "epoch": 1.3031243935571513, + "grad_norm": 1.5980081031800506, + "learning_rate": 2.837302874943215e-05, + "loss": 0.5169, + "step": 13430 + }, + { + "epoch": 1.3040947021152727, + "grad_norm": 1.4133329538382045, + "learning_rate": 2.8356804464923097e-05, + "loss": 0.5365, + "step": 13440 + }, + { + "epoch": 1.305065010673394, + "grad_norm": 1.6052634633267397, + "learning_rate": 2.8340580180414043e-05, + "loss": 0.5062, + "step": 13450 + }, + { + "epoch": 1.3060353192315155, + "grad_norm": 1.8789186495289818, + "learning_rate": 2.8324355895904993e-05, + "loss": 0.5301, + "step": 13460 + }, + { + "epoch": 1.3070056277896371, + "grad_norm": 1.7539318856514114, + "learning_rate": 2.830813161139594e-05, + "loss": 0.5191, + "step": 13470 + }, + { + "epoch": 1.3079759363477585, + "grad_norm": 1.652081891063903, + "learning_rate": 2.8291907326886886e-05, + "loss": 0.5162, + "step": 13480 + }, + { + "epoch": 1.3089462449058802, + "grad_norm": 1.9042372320107943, + "learning_rate": 2.8275683042377832e-05, + "loss": 0.5202, + "step": 13490 + }, + { + "epoch": 1.3099165534640016, + "grad_norm": 1.9149924815733665, + "learning_rate": 2.825945875786878e-05, + "loss": 0.5785, + "step": 13500 + }, + { + "epoch": 1.3099165534640016, + "eval_loss": 0.6458503007888794, + "eval_runtime": 2472.3879, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.362, + "step": 13500 + }, + { + "epoch": 1.310886862022123, + "grad_norm": 1.6405768771538807, + "learning_rate": 2.8243234473359725e-05, + "loss": 0.5037, + "step": 13510 + }, + { + "epoch": 1.3118571705802444, + "grad_norm": 1.8018934132744455, + "learning_rate": 2.822701018885067e-05, + "loss": 0.5465, + "step": 13520 + }, + { + "epoch": 1.312827479138366, + "grad_norm": 2.013531086384291, + "learning_rate": 2.8210785904341618e-05, + "loss": 0.5011, + "step": 13530 + }, + { + "epoch": 1.3137977876964875, + "grad_norm": 2.022552703035907, + "learning_rate": 2.8194561619832567e-05, + "loss": 0.5657, + "step": 13540 + }, + { + "epoch": 1.3147680962546089, + "grad_norm": 1.734149403661694, + "learning_rate": 2.8178337335323514e-05, + "loss": 0.5074, + "step": 13550 + }, + { + "epoch": 1.3157384048127305, + "grad_norm": 2.1508604289357476, + "learning_rate": 2.816211305081446e-05, + "loss": 0.4873, + "step": 13560 + }, + { + "epoch": 1.316708713370852, + "grad_norm": 1.9266405893568426, + "learning_rate": 2.8145888766305406e-05, + "loss": 0.5231, + "step": 13570 + }, + { + "epoch": 1.3176790219289733, + "grad_norm": 1.8623995402610134, + "learning_rate": 2.8129664481796353e-05, + "loss": 0.5748, + "step": 13580 + }, + { + "epoch": 1.3186493304870948, + "grad_norm": 1.5283427217297032, + "learning_rate": 2.81134401972873e-05, + "loss": 0.5322, + "step": 13590 + }, + { + "epoch": 1.3196196390452164, + "grad_norm": 1.7148189442620572, + "learning_rate": 2.8097215912778246e-05, + "loss": 0.4896, + "step": 13600 + }, + { + "epoch": 1.3205899476033378, + "grad_norm": 1.4473714452022712, + "learning_rate": 2.8080991628269192e-05, + "loss": 0.4974, + "step": 13610 + }, + { + "epoch": 1.3215602561614594, + "grad_norm": 1.8643931328762333, + "learning_rate": 2.806476734376014e-05, + "loss": 0.5707, + "step": 13620 + }, + { + "epoch": 1.3225305647195809, + "grad_norm": 2.1475243574365264, + "learning_rate": 2.8048543059251088e-05, + "loss": 0.5167, + "step": 13630 + }, + { + "epoch": 1.3235008732777023, + "grad_norm": 1.6293038645225775, + "learning_rate": 2.8032318774742035e-05, + "loss": 0.5268, + "step": 13640 + }, + { + "epoch": 1.3244711818358237, + "grad_norm": 1.8987021880259773, + "learning_rate": 2.801609449023298e-05, + "loss": 0.5432, + "step": 13650 + }, + { + "epoch": 1.3254414903939453, + "grad_norm": 1.5753855821090716, + "learning_rate": 2.7999870205723927e-05, + "loss": 0.4827, + "step": 13660 + }, + { + "epoch": 1.3264117989520667, + "grad_norm": 1.350257936732941, + "learning_rate": 2.7983645921214874e-05, + "loss": 0.5544, + "step": 13670 + }, + { + "epoch": 1.3273821075101884, + "grad_norm": 1.9293055864878257, + "learning_rate": 2.796742163670582e-05, + "loss": 0.5441, + "step": 13680 + }, + { + "epoch": 1.3283524160683098, + "grad_norm": 2.2701614440224813, + "learning_rate": 2.7951197352196766e-05, + "loss": 0.5237, + "step": 13690 + }, + { + "epoch": 1.3293227246264312, + "grad_norm": 1.2019327117406085, + "learning_rate": 2.7934973067687713e-05, + "loss": 0.5434, + "step": 13700 + }, + { + "epoch": 1.3302930331845526, + "grad_norm": 1.406257025731888, + "learning_rate": 2.7918748783178663e-05, + "loss": 0.5063, + "step": 13710 + }, + { + "epoch": 1.3312633417426742, + "grad_norm": 1.666513597838276, + "learning_rate": 2.790252449866961e-05, + "loss": 0.5068, + "step": 13720 + }, + { + "epoch": 1.3322336503007957, + "grad_norm": 1.7668183417692156, + "learning_rate": 2.7886300214160555e-05, + "loss": 0.5141, + "step": 13730 + }, + { + "epoch": 1.333203958858917, + "grad_norm": 1.433593593379768, + "learning_rate": 2.7870075929651502e-05, + "loss": 0.4626, + "step": 13740 + }, + { + "epoch": 1.3341742674170387, + "grad_norm": 2.009965700363568, + "learning_rate": 2.7853851645142448e-05, + "loss": 0.5618, + "step": 13750 + }, + { + "epoch": 1.3351445759751601, + "grad_norm": 1.763668408586817, + "learning_rate": 2.7837627360633395e-05, + "loss": 0.5369, + "step": 13760 + }, + { + "epoch": 1.3361148845332815, + "grad_norm": 2.1411324301354053, + "learning_rate": 2.782140307612434e-05, + "loss": 0.5084, + "step": 13770 + }, + { + "epoch": 1.337085193091403, + "grad_norm": 1.436827138346562, + "learning_rate": 2.7805178791615287e-05, + "loss": 0.5032, + "step": 13780 + }, + { + "epoch": 1.3380555016495246, + "grad_norm": 1.9606877958295938, + "learning_rate": 2.778895450710624e-05, + "loss": 0.5327, + "step": 13790 + }, + { + "epoch": 1.339025810207646, + "grad_norm": 2.0522393469060396, + "learning_rate": 2.7772730222597187e-05, + "loss": 0.5398, + "step": 13800 + }, + { + "epoch": 1.3399961187657676, + "grad_norm": 1.9786107551791654, + "learning_rate": 2.7756505938088133e-05, + "loss": 0.5009, + "step": 13810 + }, + { + "epoch": 1.340966427323889, + "grad_norm": 2.090584551627846, + "learning_rate": 2.774028165357908e-05, + "loss": 0.5296, + "step": 13820 + }, + { + "epoch": 1.3419367358820105, + "grad_norm": 2.1298950160365613, + "learning_rate": 2.772405736907003e-05, + "loss": 0.4865, + "step": 13830 + }, + { + "epoch": 1.3429070444401319, + "grad_norm": 1.7629297128429193, + "learning_rate": 2.7707833084560976e-05, + "loss": 0.5324, + "step": 13840 + }, + { + "epoch": 1.3438773529982535, + "grad_norm": 1.8397197756904764, + "learning_rate": 2.7691608800051922e-05, + "loss": 0.5732, + "step": 13850 + }, + { + "epoch": 1.344847661556375, + "grad_norm": 1.8108747182833855, + "learning_rate": 2.767538451554287e-05, + "loss": 0.4947, + "step": 13860 + }, + { + "epoch": 1.3458179701144963, + "grad_norm": 1.8199614245355178, + "learning_rate": 2.7659160231033815e-05, + "loss": 0.5557, + "step": 13870 + }, + { + "epoch": 1.346788278672618, + "grad_norm": 1.3431248134057203, + "learning_rate": 2.764293594652476e-05, + "loss": 0.5404, + "step": 13880 + }, + { + "epoch": 1.3477585872307394, + "grad_norm": 1.5574090523973907, + "learning_rate": 2.7626711662015708e-05, + "loss": 0.568, + "step": 13890 + }, + { + "epoch": 1.3487288957888608, + "grad_norm": 1.4115013178507176, + "learning_rate": 2.7610487377506654e-05, + "loss": 0.4716, + "step": 13900 + }, + { + "epoch": 1.3496992043469822, + "grad_norm": 2.1040010286866444, + "learning_rate": 2.75942630929976e-05, + "loss": 0.4934, + "step": 13910 + }, + { + "epoch": 1.3506695129051038, + "grad_norm": 1.7498060957230301, + "learning_rate": 2.757803880848855e-05, + "loss": 0.5454, + "step": 13920 + }, + { + "epoch": 1.3516398214632253, + "grad_norm": 1.8910220759501428, + "learning_rate": 2.7561814523979497e-05, + "loss": 0.5641, + "step": 13930 + }, + { + "epoch": 1.352610130021347, + "grad_norm": 1.6170015266249442, + "learning_rate": 2.7545590239470443e-05, + "loss": 0.5378, + "step": 13940 + }, + { + "epoch": 1.3535804385794683, + "grad_norm": 1.7666178388963019, + "learning_rate": 2.752936595496139e-05, + "loss": 0.5448, + "step": 13950 + }, + { + "epoch": 1.3545507471375897, + "grad_norm": 2.052806059730038, + "learning_rate": 2.7513141670452336e-05, + "loss": 0.5158, + "step": 13960 + }, + { + "epoch": 1.3555210556957111, + "grad_norm": 1.9222501285118103, + "learning_rate": 2.7496917385943282e-05, + "loss": 0.4982, + "step": 13970 + }, + { + "epoch": 1.3564913642538328, + "grad_norm": 2.1370232742150903, + "learning_rate": 2.748069310143423e-05, + "loss": 0.5409, + "step": 13980 + }, + { + "epoch": 1.3574616728119542, + "grad_norm": 1.7396628585118512, + "learning_rate": 2.7464468816925175e-05, + "loss": 0.5327, + "step": 13990 + }, + { + "epoch": 1.3584319813700758, + "grad_norm": 1.990716927328341, + "learning_rate": 2.7448244532416125e-05, + "loss": 0.5291, + "step": 14000 + }, + { + "epoch": 1.3584319813700758, + "eval_loss": 0.6447646021842957, + "eval_runtime": 2468.8402, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 14000 + }, + { + "epoch": 1.3594022899281972, + "grad_norm": 1.7121198472067753, + "learning_rate": 2.743202024790707e-05, + "loss": 0.4758, + "step": 14010 + }, + { + "epoch": 1.3603725984863186, + "grad_norm": 1.6674489776942332, + "learning_rate": 2.7415795963398017e-05, + "loss": 0.5484, + "step": 14020 + }, + { + "epoch": 1.36134290704444, + "grad_norm": 1.458164078535125, + "learning_rate": 2.7399571678888964e-05, + "loss": 0.5523, + "step": 14030 + }, + { + "epoch": 1.3623132156025617, + "grad_norm": 1.8775916401381276, + "learning_rate": 2.738334739437991e-05, + "loss": 0.5254, + "step": 14040 + }, + { + "epoch": 1.363283524160683, + "grad_norm": 1.671470903392816, + "learning_rate": 2.7367123109870857e-05, + "loss": 0.5005, + "step": 14050 + }, + { + "epoch": 1.3642538327188045, + "grad_norm": 1.9806763146120538, + "learning_rate": 2.7350898825361803e-05, + "loss": 0.5134, + "step": 14060 + }, + { + "epoch": 1.3652241412769262, + "grad_norm": 1.514946401551448, + "learning_rate": 2.733467454085275e-05, + "loss": 0.5597, + "step": 14070 + }, + { + "epoch": 1.3661944498350476, + "grad_norm": 1.6635049106626887, + "learning_rate": 2.7318450256343696e-05, + "loss": 0.5377, + "step": 14080 + }, + { + "epoch": 1.367164758393169, + "grad_norm": 1.7564332843089718, + "learning_rate": 2.7302225971834645e-05, + "loss": 0.5027, + "step": 14090 + }, + { + "epoch": 1.3681350669512904, + "grad_norm": 2.0454872453162682, + "learning_rate": 2.7286001687325592e-05, + "loss": 0.5056, + "step": 14100 + }, + { + "epoch": 1.369105375509412, + "grad_norm": 2.0786210322749032, + "learning_rate": 2.7269777402816538e-05, + "loss": 0.5835, + "step": 14110 + }, + { + "epoch": 1.3700756840675334, + "grad_norm": 2.1275215610783555, + "learning_rate": 2.7253553118307485e-05, + "loss": 0.5235, + "step": 14120 + }, + { + "epoch": 1.371045992625655, + "grad_norm": 1.7013208723478948, + "learning_rate": 2.723732883379843e-05, + "loss": 0.5105, + "step": 14130 + }, + { + "epoch": 1.3720163011837765, + "grad_norm": 1.586441401545233, + "learning_rate": 2.7221104549289377e-05, + "loss": 0.5221, + "step": 14140 + }, + { + "epoch": 1.372986609741898, + "grad_norm": 1.533260185383275, + "learning_rate": 2.7204880264780324e-05, + "loss": 0.4714, + "step": 14150 + }, + { + "epoch": 1.3739569183000193, + "grad_norm": 1.99495942930363, + "learning_rate": 2.718865598027127e-05, + "loss": 0.5127, + "step": 14160 + }, + { + "epoch": 1.374927226858141, + "grad_norm": 1.5455163764317608, + "learning_rate": 2.717243169576222e-05, + "loss": 0.5331, + "step": 14170 + }, + { + "epoch": 1.3758975354162624, + "grad_norm": 2.542301246782172, + "learning_rate": 2.7156207411253166e-05, + "loss": 0.5196, + "step": 14180 + }, + { + "epoch": 1.3768678439743838, + "grad_norm": 2.367306381136639, + "learning_rate": 2.7139983126744113e-05, + "loss": 0.5173, + "step": 14190 + }, + { + "epoch": 1.3778381525325054, + "grad_norm": 1.748922567904988, + "learning_rate": 2.712375884223506e-05, + "loss": 0.5301, + "step": 14200 + }, + { + "epoch": 1.3788084610906268, + "grad_norm": 1.9185950864572987, + "learning_rate": 2.7107534557726005e-05, + "loss": 0.4242, + "step": 14210 + }, + { + "epoch": 1.3797787696487482, + "grad_norm": 1.7876319295098282, + "learning_rate": 2.7091310273216952e-05, + "loss": 0.5251, + "step": 14220 + }, + { + "epoch": 1.3807490782068697, + "grad_norm": 1.7638194028118903, + "learning_rate": 2.7075085988707898e-05, + "loss": 0.5087, + "step": 14230 + }, + { + "epoch": 1.3817193867649913, + "grad_norm": 1.781232728742025, + "learning_rate": 2.7058861704198845e-05, + "loss": 0.46, + "step": 14240 + }, + { + "epoch": 1.3826896953231127, + "grad_norm": 2.0454750065003564, + "learning_rate": 2.704263741968979e-05, + "loss": 0.542, + "step": 14250 + }, + { + "epoch": 1.3836600038812343, + "grad_norm": 2.2368401504756, + "learning_rate": 2.702641313518074e-05, + "loss": 0.501, + "step": 14260 + }, + { + "epoch": 1.3846303124393557, + "grad_norm": 1.4717190899560046, + "learning_rate": 2.7010188850671687e-05, + "loss": 0.5146, + "step": 14270 + }, + { + "epoch": 1.3856006209974772, + "grad_norm": 1.8752670210283335, + "learning_rate": 2.6993964566162634e-05, + "loss": 0.4999, + "step": 14280 + }, + { + "epoch": 1.3865709295555986, + "grad_norm": 2.065234795556922, + "learning_rate": 2.697774028165358e-05, + "loss": 0.4447, + "step": 14290 + }, + { + "epoch": 1.3875412381137202, + "grad_norm": 2.125070528749461, + "learning_rate": 2.6961515997144526e-05, + "loss": 0.4797, + "step": 14300 + }, + { + "epoch": 1.3885115466718416, + "grad_norm": 1.5859308448291956, + "learning_rate": 2.6945291712635473e-05, + "loss": 0.5336, + "step": 14310 + }, + { + "epoch": 1.3894818552299633, + "grad_norm": 2.861319646114515, + "learning_rate": 2.692906742812642e-05, + "loss": 0.4812, + "step": 14320 + }, + { + "epoch": 1.3904521637880847, + "grad_norm": 1.7597224976071428, + "learning_rate": 2.6912843143617365e-05, + "loss": 0.55, + "step": 14330 + }, + { + "epoch": 1.391422472346206, + "grad_norm": 1.681866583110354, + "learning_rate": 2.6896618859108312e-05, + "loss": 0.5046, + "step": 14340 + }, + { + "epoch": 1.3923927809043275, + "grad_norm": 1.5215374726522628, + "learning_rate": 2.688039457459926e-05, + "loss": 0.5579, + "step": 14350 + }, + { + "epoch": 1.3933630894624491, + "grad_norm": 2.079812021813028, + "learning_rate": 2.6864170290090208e-05, + "loss": 0.5495, + "step": 14360 + }, + { + "epoch": 1.3943333980205705, + "grad_norm": 1.617321032110613, + "learning_rate": 2.6847946005581154e-05, + "loss": 0.529, + "step": 14370 + }, + { + "epoch": 1.395303706578692, + "grad_norm": 1.9418293727425169, + "learning_rate": 2.68317217210721e-05, + "loss": 0.5503, + "step": 14380 + }, + { + "epoch": 1.3962740151368136, + "grad_norm": 1.6562011692834846, + "learning_rate": 2.6815497436563047e-05, + "loss": 0.5245, + "step": 14390 + }, + { + "epoch": 1.397244323694935, + "grad_norm": 1.8991877294518344, + "learning_rate": 2.6799273152053994e-05, + "loss": 0.4976, + "step": 14400 + }, + { + "epoch": 1.3982146322530564, + "grad_norm": 1.9301826202288486, + "learning_rate": 2.678304886754494e-05, + "loss": 0.4878, + "step": 14410 + }, + { + "epoch": 1.3991849408111778, + "grad_norm": 2.019905342024004, + "learning_rate": 2.6766824583035886e-05, + "loss": 0.5093, + "step": 14420 + }, + { + "epoch": 1.4001552493692995, + "grad_norm": 2.364843712084718, + "learning_rate": 2.6750600298526836e-05, + "loss": 0.5005, + "step": 14430 + }, + { + "epoch": 1.4011255579274209, + "grad_norm": 1.9751607681902115, + "learning_rate": 2.6734376014017782e-05, + "loss": 0.4958, + "step": 14440 + }, + { + "epoch": 1.4020958664855425, + "grad_norm": 1.9625121046714047, + "learning_rate": 2.671815172950873e-05, + "loss": 0.5251, + "step": 14450 + }, + { + "epoch": 1.403066175043664, + "grad_norm": 1.4828612193168078, + "learning_rate": 2.6701927444999675e-05, + "loss": 0.5496, + "step": 14460 + }, + { + "epoch": 1.4040364836017853, + "grad_norm": 1.5015720843906, + "learning_rate": 2.668570316049062e-05, + "loss": 0.535, + "step": 14470 + }, + { + "epoch": 1.4050067921599068, + "grad_norm": 1.3851757562899687, + "learning_rate": 2.6669478875981568e-05, + "loss": 0.5628, + "step": 14480 + }, + { + "epoch": 1.4059771007180284, + "grad_norm": 1.6822047277997916, + "learning_rate": 2.6653254591472514e-05, + "loss": 0.5605, + "step": 14490 + }, + { + "epoch": 1.4069474092761498, + "grad_norm": 1.9002135762249894, + "learning_rate": 2.663703030696346e-05, + "loss": 0.5348, + "step": 14500 + }, + { + "epoch": 1.4069474092761498, + "eval_loss": 0.6422961950302124, + "eval_runtime": 2474.9423, + "eval_samples_per_second": 0.724, + "eval_steps_per_second": 0.362, + "step": 14500 + }, + { + "epoch": 1.4079177178342712, + "grad_norm": 1.7272883282125664, + "learning_rate": 2.6620806022454407e-05, + "loss": 0.5518, + "step": 14510 + }, + { + "epoch": 1.4088880263923929, + "grad_norm": 1.7836182665219666, + "learning_rate": 2.6604581737945357e-05, + "loss": 0.5195, + "step": 14520 + }, + { + "epoch": 1.4098583349505143, + "grad_norm": 1.7860780128722327, + "learning_rate": 2.6588357453436303e-05, + "loss": 0.5061, + "step": 14530 + }, + { + "epoch": 1.4108286435086357, + "grad_norm": 1.8487543077145125, + "learning_rate": 2.657213316892725e-05, + "loss": 0.5261, + "step": 14540 + }, + { + "epoch": 1.411798952066757, + "grad_norm": 1.8960118987299062, + "learning_rate": 2.6555908884418196e-05, + "loss": 0.523, + "step": 14550 + }, + { + "epoch": 1.4127692606248787, + "grad_norm": 1.3767948578941858, + "learning_rate": 2.6539684599909142e-05, + "loss": 0.4654, + "step": 14560 + }, + { + "epoch": 1.4137395691830001, + "grad_norm": 1.7605168074790472, + "learning_rate": 2.652346031540009e-05, + "loss": 0.4711, + "step": 14570 + }, + { + "epoch": 1.4147098777411218, + "grad_norm": 1.424973688569186, + "learning_rate": 2.6507236030891042e-05, + "loss": 0.5639, + "step": 14580 + }, + { + "epoch": 1.4156801862992432, + "grad_norm": 1.9215824187230874, + "learning_rate": 2.649101174638199e-05, + "loss": 0.5137, + "step": 14590 + }, + { + "epoch": 1.4166504948573646, + "grad_norm": 1.439592415452285, + "learning_rate": 2.6474787461872935e-05, + "loss": 0.488, + "step": 14600 + }, + { + "epoch": 1.417620803415486, + "grad_norm": 1.6446340356434774, + "learning_rate": 2.645856317736388e-05, + "loss": 0.5324, + "step": 14610 + }, + { + "epoch": 1.4185911119736077, + "grad_norm": 1.9264081435604268, + "learning_rate": 2.6442338892854827e-05, + "loss": 0.5297, + "step": 14620 + }, + { + "epoch": 1.419561420531729, + "grad_norm": 1.5948112951701827, + "learning_rate": 2.6426114608345777e-05, + "loss": 0.5291, + "step": 14630 + }, + { + "epoch": 1.4205317290898507, + "grad_norm": 1.8654562276618851, + "learning_rate": 2.6409890323836724e-05, + "loss": 0.5588, + "step": 14640 + }, + { + "epoch": 1.4215020376479721, + "grad_norm": 1.8410472433514884, + "learning_rate": 2.639366603932767e-05, + "loss": 0.5167, + "step": 14650 + }, + { + "epoch": 1.4224723462060935, + "grad_norm": 1.9663683443045321, + "learning_rate": 2.6377441754818616e-05, + "loss": 0.5166, + "step": 14660 + }, + { + "epoch": 1.423442654764215, + "grad_norm": 1.77295818801796, + "learning_rate": 2.6361217470309563e-05, + "loss": 0.5557, + "step": 14670 + }, + { + "epoch": 1.4244129633223366, + "grad_norm": 1.542816761347417, + "learning_rate": 2.634499318580051e-05, + "loss": 0.5147, + "step": 14680 + }, + { + "epoch": 1.425383271880458, + "grad_norm": 2.13234320369828, + "learning_rate": 2.6328768901291456e-05, + "loss": 0.5421, + "step": 14690 + }, + { + "epoch": 1.4263535804385794, + "grad_norm": 1.4543036251195376, + "learning_rate": 2.6312544616782402e-05, + "loss": 0.488, + "step": 14700 + }, + { + "epoch": 1.427323888996701, + "grad_norm": 1.5267685947307574, + "learning_rate": 2.6296320332273348e-05, + "loss": 0.5352, + "step": 14710 + }, + { + "epoch": 1.4282941975548225, + "grad_norm": 2.05420183759504, + "learning_rate": 2.6280096047764298e-05, + "loss": 0.5489, + "step": 14720 + }, + { + "epoch": 1.4292645061129439, + "grad_norm": 1.5141763719248076, + "learning_rate": 2.6263871763255244e-05, + "loss": 0.5627, + "step": 14730 + }, + { + "epoch": 1.4302348146710653, + "grad_norm": 1.7887916433402153, + "learning_rate": 2.624764747874619e-05, + "loss": 0.5129, + "step": 14740 + }, + { + "epoch": 1.431205123229187, + "grad_norm": 1.9817836633125971, + "learning_rate": 2.6231423194237137e-05, + "loss": 0.5129, + "step": 14750 + }, + { + "epoch": 1.4321754317873083, + "grad_norm": 2.358539950484514, + "learning_rate": 2.6215198909728084e-05, + "loss": 0.482, + "step": 14760 + }, + { + "epoch": 1.43314574034543, + "grad_norm": 1.7444853271955691, + "learning_rate": 2.619897462521903e-05, + "loss": 0.4948, + "step": 14770 + }, + { + "epoch": 1.4341160489035514, + "grad_norm": 1.573301988778271, + "learning_rate": 2.6182750340709976e-05, + "loss": 0.4998, + "step": 14780 + }, + { + "epoch": 1.4350863574616728, + "grad_norm": 1.878570997397064, + "learning_rate": 2.6166526056200923e-05, + "loss": 0.489, + "step": 14790 + }, + { + "epoch": 1.4360566660197942, + "grad_norm": 2.0345633471458444, + "learning_rate": 2.615030177169187e-05, + "loss": 0.5395, + "step": 14800 + }, + { + "epoch": 1.4370269745779158, + "grad_norm": 1.7616258348677174, + "learning_rate": 2.613407748718282e-05, + "loss": 0.5575, + "step": 14810 + }, + { + "epoch": 1.4379972831360373, + "grad_norm": 2.202827489308336, + "learning_rate": 2.6117853202673765e-05, + "loss": 0.5378, + "step": 14820 + }, + { + "epoch": 1.4389675916941587, + "grad_norm": 2.0172801246892496, + "learning_rate": 2.610162891816471e-05, + "loss": 0.565, + "step": 14830 + }, + { + "epoch": 1.4399379002522803, + "grad_norm": 1.460733632748616, + "learning_rate": 2.6085404633655658e-05, + "loss": 0.4866, + "step": 14840 + }, + { + "epoch": 1.4409082088104017, + "grad_norm": 1.5702235924410088, + "learning_rate": 2.6069180349146604e-05, + "loss": 0.4814, + "step": 14850 + }, + { + "epoch": 1.4418785173685231, + "grad_norm": 2.3366982495878013, + "learning_rate": 2.605295606463755e-05, + "loss": 0.5204, + "step": 14860 + }, + { + "epoch": 1.4428488259266445, + "grad_norm": 2.0441313559069805, + "learning_rate": 2.6036731780128497e-05, + "loss": 0.4861, + "step": 14870 + }, + { + "epoch": 1.4438191344847662, + "grad_norm": 2.2095151787420417, + "learning_rate": 2.6020507495619444e-05, + "loss": 0.5376, + "step": 14880 + }, + { + "epoch": 1.4447894430428876, + "grad_norm": 1.9134281253559753, + "learning_rate": 2.6004283211110393e-05, + "loss": 0.5102, + "step": 14890 + }, + { + "epoch": 1.4457597516010092, + "grad_norm": 1.7832517565992747, + "learning_rate": 2.598805892660134e-05, + "loss": 0.5303, + "step": 14900 + }, + { + "epoch": 1.4467300601591306, + "grad_norm": 1.7558924047130664, + "learning_rate": 2.5971834642092286e-05, + "loss": 0.4994, + "step": 14910 + }, + { + "epoch": 1.447700368717252, + "grad_norm": 1.7883915957856114, + "learning_rate": 2.5955610357583233e-05, + "loss": 0.4677, + "step": 14920 + }, + { + "epoch": 1.4486706772753735, + "grad_norm": 2.077257189743679, + "learning_rate": 2.593938607307418e-05, + "loss": 0.515, + "step": 14930 + }, + { + "epoch": 1.449640985833495, + "grad_norm": 1.9045010104323963, + "learning_rate": 2.5923161788565125e-05, + "loss": 0.5024, + "step": 14940 + }, + { + "epoch": 1.4506112943916165, + "grad_norm": 2.106289689638874, + "learning_rate": 2.590693750405607e-05, + "loss": 0.5242, + "step": 14950 + }, + { + "epoch": 1.4515816029497381, + "grad_norm": 1.9221754156642648, + "learning_rate": 2.5890713219547018e-05, + "loss": 0.4712, + "step": 14960 + }, + { + "epoch": 1.4525519115078596, + "grad_norm": 1.8046920974227167, + "learning_rate": 2.5874488935037964e-05, + "loss": 0.5017, + "step": 14970 + }, + { + "epoch": 1.453522220065981, + "grad_norm": 1.855646189779827, + "learning_rate": 2.5858264650528914e-05, + "loss": 0.5146, + "step": 14980 + }, + { + "epoch": 1.4544925286241024, + "grad_norm": 1.8308672850602437, + "learning_rate": 2.584204036601986e-05, + "loss": 0.5293, + "step": 14990 + }, + { + "epoch": 1.455462837182224, + "grad_norm": 1.6235231403062087, + "learning_rate": 2.5825816081510807e-05, + "loss": 0.4968, + "step": 15000 + }, + { + "epoch": 1.455462837182224, + "eval_loss": 0.6395026445388794, + "eval_runtime": 2472.4459, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.362, + "step": 15000 + }, + { + "epoch": 1.4564331457403454, + "grad_norm": 1.9674866651938567, + "learning_rate": 2.5809591797001753e-05, + "loss": 0.5402, + "step": 15010 + }, + { + "epoch": 1.4574034542984668, + "grad_norm": 1.7667679072452773, + "learning_rate": 2.57933675124927e-05, + "loss": 0.5598, + "step": 15020 + }, + { + "epoch": 1.4583737628565885, + "grad_norm": 1.6236668551346622, + "learning_rate": 2.5777143227983646e-05, + "loss": 0.4819, + "step": 15030 + }, + { + "epoch": 1.45934407141471, + "grad_norm": 1.5162521396018838, + "learning_rate": 2.5760918943474592e-05, + "loss": 0.5609, + "step": 15040 + }, + { + "epoch": 1.4603143799728313, + "grad_norm": 2.248488105603888, + "learning_rate": 2.574469465896554e-05, + "loss": 0.4987, + "step": 15050 + }, + { + "epoch": 1.4612846885309527, + "grad_norm": 1.5876501247710053, + "learning_rate": 2.572847037445649e-05, + "loss": 0.4794, + "step": 15060 + }, + { + "epoch": 1.4622549970890744, + "grad_norm": 1.9484672186526921, + "learning_rate": 2.5712246089947435e-05, + "loss": 0.5115, + "step": 15070 + }, + { + "epoch": 1.4632253056471958, + "grad_norm": 1.9931906967691606, + "learning_rate": 2.569602180543838e-05, + "loss": 0.4848, + "step": 15080 + }, + { + "epoch": 1.4641956142053174, + "grad_norm": 1.9798564517490436, + "learning_rate": 2.5679797520929328e-05, + "loss": 0.5136, + "step": 15090 + }, + { + "epoch": 1.4651659227634388, + "grad_norm": 1.7543081259876927, + "learning_rate": 2.5663573236420274e-05, + "loss": 0.5179, + "step": 15100 + }, + { + "epoch": 1.4661362313215602, + "grad_norm": 1.797610941181892, + "learning_rate": 2.564734895191122e-05, + "loss": 0.5134, + "step": 15110 + }, + { + "epoch": 1.4671065398796816, + "grad_norm": 1.6254067495518276, + "learning_rate": 2.5631124667402167e-05, + "loss": 0.497, + "step": 15120 + }, + { + "epoch": 1.4680768484378033, + "grad_norm": 2.0916697354749743, + "learning_rate": 2.5614900382893113e-05, + "loss": 0.5035, + "step": 15130 + }, + { + "epoch": 1.4690471569959247, + "grad_norm": 1.7408478320862355, + "learning_rate": 2.559867609838406e-05, + "loss": 0.5165, + "step": 15140 + }, + { + "epoch": 1.470017465554046, + "grad_norm": 1.6971971097300078, + "learning_rate": 2.558245181387501e-05, + "loss": 0.5416, + "step": 15150 + }, + { + "epoch": 1.4709877741121677, + "grad_norm": 1.7615798102754638, + "learning_rate": 2.5566227529365956e-05, + "loss": 0.5028, + "step": 15160 + }, + { + "epoch": 1.4719580826702892, + "grad_norm": 2.126034120697344, + "learning_rate": 2.5550003244856902e-05, + "loss": 0.4983, + "step": 15170 + }, + { + "epoch": 1.4729283912284106, + "grad_norm": 1.7930301180931063, + "learning_rate": 2.553377896034785e-05, + "loss": 0.4836, + "step": 15180 + }, + { + "epoch": 1.473898699786532, + "grad_norm": 1.945479896374108, + "learning_rate": 2.5517554675838795e-05, + "loss": 0.4874, + "step": 15190 + }, + { + "epoch": 1.4748690083446536, + "grad_norm": 1.659537976782885, + "learning_rate": 2.550133039132974e-05, + "loss": 0.485, + "step": 15200 + }, + { + "epoch": 1.475839316902775, + "grad_norm": 1.7852824594767274, + "learning_rate": 2.5485106106820688e-05, + "loss": 0.5314, + "step": 15210 + }, + { + "epoch": 1.4768096254608967, + "grad_norm": 1.7757412059616349, + "learning_rate": 2.5468881822311634e-05, + "loss": 0.4906, + "step": 15220 + }, + { + "epoch": 1.477779934019018, + "grad_norm": 1.7940563485432668, + "learning_rate": 2.5452657537802584e-05, + "loss": 0.4973, + "step": 15230 + }, + { + "epoch": 1.4787502425771395, + "grad_norm": 1.8223131394278327, + "learning_rate": 2.543643325329353e-05, + "loss": 0.5122, + "step": 15240 + }, + { + "epoch": 1.479720551135261, + "grad_norm": 1.9217304666232693, + "learning_rate": 2.5420208968784477e-05, + "loss": 0.5497, + "step": 15250 + }, + { + "epoch": 1.4806908596933825, + "grad_norm": 1.6084194486566938, + "learning_rate": 2.5403984684275423e-05, + "loss": 0.4971, + "step": 15260 + }, + { + "epoch": 1.481661168251504, + "grad_norm": 2.1056710345080827, + "learning_rate": 2.538776039976637e-05, + "loss": 0.5379, + "step": 15270 + }, + { + "epoch": 1.4826314768096256, + "grad_norm": 2.2545586744739015, + "learning_rate": 2.5371536115257316e-05, + "loss": 0.4239, + "step": 15280 + }, + { + "epoch": 1.483601785367747, + "grad_norm": 2.0015642103213063, + "learning_rate": 2.5355311830748262e-05, + "loss": 0.5283, + "step": 15290 + }, + { + "epoch": 1.4845720939258684, + "grad_norm": 2.0206628423435387, + "learning_rate": 2.533908754623921e-05, + "loss": 0.4533, + "step": 15300 + }, + { + "epoch": 1.4855424024839898, + "grad_norm": 2.0392064010865263, + "learning_rate": 2.5322863261730155e-05, + "loss": 0.516, + "step": 15310 + }, + { + "epoch": 1.4865127110421115, + "grad_norm": 1.6397938987534753, + "learning_rate": 2.5306638977221105e-05, + "loss": 0.4915, + "step": 15320 + }, + { + "epoch": 1.4874830196002329, + "grad_norm": 2.0519457829568615, + "learning_rate": 2.529041469271205e-05, + "loss": 0.4938, + "step": 15330 + }, + { + "epoch": 1.4884533281583543, + "grad_norm": 1.883771979267065, + "learning_rate": 2.5274190408202998e-05, + "loss": 0.5454, + "step": 15340 + }, + { + "epoch": 1.489423636716476, + "grad_norm": 1.8963017429804823, + "learning_rate": 2.5257966123693944e-05, + "loss": 0.4733, + "step": 15350 + }, + { + "epoch": 1.4903939452745973, + "grad_norm": 2.005144587549119, + "learning_rate": 2.5241741839184897e-05, + "loss": 0.4967, + "step": 15360 + }, + { + "epoch": 1.4913642538327188, + "grad_norm": 2.151826141060965, + "learning_rate": 2.5225517554675843e-05, + "loss": 0.5466, + "step": 15370 + }, + { + "epoch": 1.4923345623908402, + "grad_norm": 1.6751197267270117, + "learning_rate": 2.520929327016679e-05, + "loss": 0.5456, + "step": 15380 + }, + { + "epoch": 1.4933048709489618, + "grad_norm": 1.9009493553059222, + "learning_rate": 2.5193068985657736e-05, + "loss": 0.483, + "step": 15390 + }, + { + "epoch": 1.4942751795070832, + "grad_norm": 2.1457921969425757, + "learning_rate": 2.5176844701148683e-05, + "loss": 0.5458, + "step": 15400 + }, + { + "epoch": 1.4952454880652049, + "grad_norm": 1.9369152546010822, + "learning_rate": 2.516062041663963e-05, + "loss": 0.5477, + "step": 15410 + }, + { + "epoch": 1.4962157966233263, + "grad_norm": 1.9226350314538543, + "learning_rate": 2.5144396132130575e-05, + "loss": 0.5469, + "step": 15420 + }, + { + "epoch": 1.4971861051814477, + "grad_norm": 1.768970891771466, + "learning_rate": 2.5128171847621522e-05, + "loss": 0.4789, + "step": 15430 + }, + { + "epoch": 1.498156413739569, + "grad_norm": 1.6324753131013463, + "learning_rate": 2.511194756311247e-05, + "loss": 0.4984, + "step": 15440 + }, + { + "epoch": 1.4991267222976907, + "grad_norm": 1.5500314116241656, + "learning_rate": 2.5095723278603418e-05, + "loss": 0.5313, + "step": 15450 + }, + { + "epoch": 1.5000970308558121, + "grad_norm": 1.8222773979858036, + "learning_rate": 2.5079498994094364e-05, + "loss": 0.4935, + "step": 15460 + }, + { + "epoch": 1.5010673394139338, + "grad_norm": 1.6457715681286798, + "learning_rate": 2.506327470958531e-05, + "loss": 0.4739, + "step": 15470 + }, + { + "epoch": 1.5020376479720552, + "grad_norm": 1.6810719922194195, + "learning_rate": 2.5047050425076257e-05, + "loss": 0.478, + "step": 15480 + }, + { + "epoch": 1.5030079565301766, + "grad_norm": 1.9891968136548206, + "learning_rate": 2.5030826140567203e-05, + "loss": 0.5365, + "step": 15490 + }, + { + "epoch": 1.503978265088298, + "grad_norm": 1.836679974736217, + "learning_rate": 2.501460185605815e-05, + "loss": 0.5497, + "step": 15500 + }, + { + "epoch": 1.503978265088298, + "eval_loss": 0.6403182148933411, + "eval_runtime": 2470.3613, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.363, + "step": 15500 + }, + { + "epoch": 1.5049485736464194, + "grad_norm": 1.9808212956686209, + "learning_rate": 2.4998377571549096e-05, + "loss": 0.5195, + "step": 15510 + }, + { + "epoch": 1.505918882204541, + "grad_norm": 1.696799638403568, + "learning_rate": 2.4982153287040043e-05, + "loss": 0.481, + "step": 15520 + }, + { + "epoch": 1.5068891907626625, + "grad_norm": 1.559487018818546, + "learning_rate": 2.496592900253099e-05, + "loss": 0.4849, + "step": 15530 + }, + { + "epoch": 1.5078594993207841, + "grad_norm": 2.0106819741019635, + "learning_rate": 2.4949704718021935e-05, + "loss": 0.5068, + "step": 15540 + }, + { + "epoch": 1.5088298078789055, + "grad_norm": 1.554558550656832, + "learning_rate": 2.4933480433512885e-05, + "loss": 0.5542, + "step": 15550 + }, + { + "epoch": 1.509800116437027, + "grad_norm": 2.1570334538409597, + "learning_rate": 2.491725614900383e-05, + "loss": 0.5537, + "step": 15560 + }, + { + "epoch": 1.5107704249951484, + "grad_norm": 2.082418119992958, + "learning_rate": 2.4901031864494778e-05, + "loss": 0.4495, + "step": 15570 + }, + { + "epoch": 1.51174073355327, + "grad_norm": 1.7903747855746062, + "learning_rate": 2.4884807579985724e-05, + "loss": 0.4755, + "step": 15580 + }, + { + "epoch": 1.5127110421113914, + "grad_norm": 1.8907675365997436, + "learning_rate": 2.486858329547667e-05, + "loss": 0.5377, + "step": 15590 + }, + { + "epoch": 1.513681350669513, + "grad_norm": 1.6823616833089232, + "learning_rate": 2.4852359010967617e-05, + "loss": 0.4747, + "step": 15600 + }, + { + "epoch": 1.5146516592276345, + "grad_norm": 1.8263050453987495, + "learning_rate": 2.4836134726458567e-05, + "loss": 0.4962, + "step": 15610 + }, + { + "epoch": 1.5156219677857559, + "grad_norm": 1.855857638521949, + "learning_rate": 2.4819910441949513e-05, + "loss": 0.5306, + "step": 15620 + }, + { + "epoch": 1.5165922763438773, + "grad_norm": 1.7196056977172254, + "learning_rate": 2.480368615744046e-05, + "loss": 0.507, + "step": 15630 + }, + { + "epoch": 1.5175625849019987, + "grad_norm": 2.2274627334622354, + "learning_rate": 2.4787461872931406e-05, + "loss": 0.4741, + "step": 15640 + }, + { + "epoch": 1.5185328934601203, + "grad_norm": 1.7506693514893716, + "learning_rate": 2.4771237588422352e-05, + "loss": 0.4512, + "step": 15650 + }, + { + "epoch": 1.519503202018242, + "grad_norm": 1.5102515980183955, + "learning_rate": 2.47550133039133e-05, + "loss": 0.5656, + "step": 15660 + }, + { + "epoch": 1.5204735105763634, + "grad_norm": 1.7638756809733325, + "learning_rate": 2.4738789019404245e-05, + "loss": 0.5209, + "step": 15670 + }, + { + "epoch": 1.5214438191344848, + "grad_norm": 1.67804925991046, + "learning_rate": 2.472256473489519e-05, + "loss": 0.5442, + "step": 15680 + }, + { + "epoch": 1.5224141276926062, + "grad_norm": 1.6503421070503612, + "learning_rate": 2.470634045038614e-05, + "loss": 0.5516, + "step": 15690 + }, + { + "epoch": 1.5233844362507276, + "grad_norm": 1.7388522657721959, + "learning_rate": 2.4690116165877088e-05, + "loss": 0.4894, + "step": 15700 + }, + { + "epoch": 1.5243547448088492, + "grad_norm": 1.955414988998346, + "learning_rate": 2.4673891881368034e-05, + "loss": 0.5493, + "step": 15710 + }, + { + "epoch": 1.5253250533669707, + "grad_norm": 2.1089020897683324, + "learning_rate": 2.465766759685898e-05, + "loss": 0.5173, + "step": 15720 + }, + { + "epoch": 1.5262953619250923, + "grad_norm": 2.1627868553616416, + "learning_rate": 2.4641443312349927e-05, + "loss": 0.5165, + "step": 15730 + }, + { + "epoch": 1.5272656704832137, + "grad_norm": 1.9351405181934673, + "learning_rate": 2.4625219027840873e-05, + "loss": 0.4873, + "step": 15740 + }, + { + "epoch": 1.5282359790413351, + "grad_norm": 1.8488349390401109, + "learning_rate": 2.460899474333182e-05, + "loss": 0.5121, + "step": 15750 + }, + { + "epoch": 1.5292062875994565, + "grad_norm": 1.9814673588646732, + "learning_rate": 2.4592770458822766e-05, + "loss": 0.4942, + "step": 15760 + }, + { + "epoch": 1.530176596157578, + "grad_norm": 1.8985936905786858, + "learning_rate": 2.4576546174313712e-05, + "loss": 0.5227, + "step": 15770 + }, + { + "epoch": 1.5311469047156996, + "grad_norm": 2.144939705295627, + "learning_rate": 2.4560321889804662e-05, + "loss": 0.5584, + "step": 15780 + }, + { + "epoch": 1.5321172132738212, + "grad_norm": 2.196125539672085, + "learning_rate": 2.454409760529561e-05, + "loss": 0.4592, + "step": 15790 + }, + { + "epoch": 1.5330875218319426, + "grad_norm": 1.5896290702039733, + "learning_rate": 2.4527873320786555e-05, + "loss": 0.5183, + "step": 15800 + }, + { + "epoch": 1.534057830390064, + "grad_norm": 1.3926787086096604, + "learning_rate": 2.45116490362775e-05, + "loss": 0.4929, + "step": 15810 + }, + { + "epoch": 1.5350281389481855, + "grad_norm": 1.8947619224673768, + "learning_rate": 2.4495424751768448e-05, + "loss": 0.487, + "step": 15820 + }, + { + "epoch": 1.5359984475063069, + "grad_norm": 1.9366715293465946, + "learning_rate": 2.4479200467259394e-05, + "loss": 0.5126, + "step": 15830 + }, + { + "epoch": 1.5369687560644285, + "grad_norm": 1.8854258468503662, + "learning_rate": 2.446297618275034e-05, + "loss": 0.4732, + "step": 15840 + }, + { + "epoch": 1.53793906462255, + "grad_norm": 1.7897154496692322, + "learning_rate": 2.4446751898241287e-05, + "loss": 0.4869, + "step": 15850 + }, + { + "epoch": 1.5389093731806716, + "grad_norm": 1.9691769621461568, + "learning_rate": 2.4430527613732233e-05, + "loss": 0.5204, + "step": 15860 + }, + { + "epoch": 1.539879681738793, + "grad_norm": 1.5159729039901195, + "learning_rate": 2.4414303329223183e-05, + "loss": 0.5067, + "step": 15870 + }, + { + "epoch": 1.5408499902969144, + "grad_norm": 2.0085743327171364, + "learning_rate": 2.439807904471413e-05, + "loss": 0.5855, + "step": 15880 + }, + { + "epoch": 1.5418202988550358, + "grad_norm": 1.808826518921202, + "learning_rate": 2.4381854760205076e-05, + "loss": 0.511, + "step": 15890 + }, + { + "epoch": 1.5427906074131574, + "grad_norm": 2.144838816182226, + "learning_rate": 2.4365630475696022e-05, + "loss": 0.5151, + "step": 15900 + }, + { + "epoch": 1.5437609159712788, + "grad_norm": 1.9134282869959454, + "learning_rate": 2.434940619118697e-05, + "loss": 0.547, + "step": 15910 + }, + { + "epoch": 1.5447312245294005, + "grad_norm": 1.4718585956659067, + "learning_rate": 2.4333181906677915e-05, + "loss": 0.5035, + "step": 15920 + }, + { + "epoch": 1.545701533087522, + "grad_norm": 1.9846910792449015, + "learning_rate": 2.431695762216886e-05, + "loss": 0.4603, + "step": 15930 + }, + { + "epoch": 1.5466718416456433, + "grad_norm": 2.237689406521008, + "learning_rate": 2.430073333765981e-05, + "loss": 0.4835, + "step": 15940 + }, + { + "epoch": 1.5476421502037647, + "grad_norm": 1.8521877806585876, + "learning_rate": 2.4284509053150757e-05, + "loss": 0.486, + "step": 15950 + }, + { + "epoch": 1.5486124587618861, + "grad_norm": 1.6410601205984716, + "learning_rate": 2.4268284768641704e-05, + "loss": 0.5362, + "step": 15960 + }, + { + "epoch": 1.5495827673200078, + "grad_norm": 2.0223928767474524, + "learning_rate": 2.4252060484132653e-05, + "loss": 0.5954, + "step": 15970 + }, + { + "epoch": 1.5505530758781294, + "grad_norm": 2.531432809831189, + "learning_rate": 2.42358361996236e-05, + "loss": 0.5618, + "step": 15980 + }, + { + "epoch": 1.5515233844362508, + "grad_norm": 2.0740258540735788, + "learning_rate": 2.4219611915114546e-05, + "loss": 0.4733, + "step": 15990 + }, + { + "epoch": 1.5524936929943722, + "grad_norm": 1.7587497399634056, + "learning_rate": 2.4203387630605493e-05, + "loss": 0.5099, + "step": 16000 + }, + { + "epoch": 1.5524936929943722, + "eval_loss": 0.635991096496582, + "eval_runtime": 2468.0382, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 16000 + }, + { + "epoch": 1.5534640015524936, + "grad_norm": 1.670336846718715, + "learning_rate": 2.418716334609644e-05, + "loss": 0.4553, + "step": 16010 + }, + { + "epoch": 1.554434310110615, + "grad_norm": 2.0177796015243237, + "learning_rate": 2.4170939061587385e-05, + "loss": 0.5115, + "step": 16020 + }, + { + "epoch": 1.5554046186687367, + "grad_norm": 1.8858883624862002, + "learning_rate": 2.4154714777078332e-05, + "loss": 0.5172, + "step": 16030 + }, + { + "epoch": 1.556374927226858, + "grad_norm": 2.0359679981766647, + "learning_rate": 2.4138490492569278e-05, + "loss": 0.4589, + "step": 16040 + }, + { + "epoch": 1.5573452357849797, + "grad_norm": 1.2444792958823963, + "learning_rate": 2.4122266208060228e-05, + "loss": 0.5134, + "step": 16050 + }, + { + "epoch": 1.5583155443431012, + "grad_norm": 1.5062153000531946, + "learning_rate": 2.4106041923551174e-05, + "loss": 0.4869, + "step": 16060 + }, + { + "epoch": 1.5592858529012226, + "grad_norm": 2.2139374575219364, + "learning_rate": 2.408981763904212e-05, + "loss": 0.4811, + "step": 16070 + }, + { + "epoch": 1.560256161459344, + "grad_norm": 1.7542756535220294, + "learning_rate": 2.4073593354533067e-05, + "loss": 0.4895, + "step": 16080 + }, + { + "epoch": 1.5612264700174654, + "grad_norm": 1.9663056975292839, + "learning_rate": 2.4057369070024013e-05, + "loss": 0.5046, + "step": 16090 + }, + { + "epoch": 1.562196778575587, + "grad_norm": 1.760446721510282, + "learning_rate": 2.404114478551496e-05, + "loss": 0.495, + "step": 16100 + }, + { + "epoch": 1.5631670871337087, + "grad_norm": 1.6132518828061082, + "learning_rate": 2.4024920501005906e-05, + "loss": 0.4985, + "step": 16110 + }, + { + "epoch": 1.56413739569183, + "grad_norm": 1.7640461059330637, + "learning_rate": 2.4008696216496853e-05, + "loss": 0.5186, + "step": 16120 + }, + { + "epoch": 1.5651077042499515, + "grad_norm": 1.769814216957158, + "learning_rate": 2.39924719319878e-05, + "loss": 0.5413, + "step": 16130 + }, + { + "epoch": 1.566078012808073, + "grad_norm": 2.293241964432802, + "learning_rate": 2.397624764747875e-05, + "loss": 0.4809, + "step": 16140 + }, + { + "epoch": 1.5670483213661943, + "grad_norm": 1.8759559371438301, + "learning_rate": 2.3960023362969695e-05, + "loss": 0.5262, + "step": 16150 + }, + { + "epoch": 1.568018629924316, + "grad_norm": 2.3087090615314114, + "learning_rate": 2.394379907846064e-05, + "loss": 0.4841, + "step": 16160 + }, + { + "epoch": 1.5689889384824374, + "grad_norm": 1.6108810314362396, + "learning_rate": 2.3927574793951588e-05, + "loss": 0.5188, + "step": 16170 + }, + { + "epoch": 1.569959247040559, + "grad_norm": 2.241869048530712, + "learning_rate": 2.3911350509442534e-05, + "loss": 0.5005, + "step": 16180 + }, + { + "epoch": 1.5709295555986804, + "grad_norm": 2.1127428890150215, + "learning_rate": 2.389512622493348e-05, + "loss": 0.5063, + "step": 16190 + }, + { + "epoch": 1.5718998641568018, + "grad_norm": 1.7046555426171013, + "learning_rate": 2.3878901940424427e-05, + "loss": 0.5154, + "step": 16200 + }, + { + "epoch": 1.5728701727149232, + "grad_norm": 1.6974188480153705, + "learning_rate": 2.3862677655915373e-05, + "loss": 0.4775, + "step": 16210 + }, + { + "epoch": 1.5738404812730449, + "grad_norm": 1.9773772891509165, + "learning_rate": 2.384645337140632e-05, + "loss": 0.489, + "step": 16220 + }, + { + "epoch": 1.5748107898311663, + "grad_norm": 2.3323506244488645, + "learning_rate": 2.383022908689727e-05, + "loss": 0.5252, + "step": 16230 + }, + { + "epoch": 1.575781098389288, + "grad_norm": 1.4580240462073564, + "learning_rate": 2.3814004802388216e-05, + "loss": 0.4809, + "step": 16240 + }, + { + "epoch": 1.5767514069474093, + "grad_norm": 1.9799244985970428, + "learning_rate": 2.3797780517879162e-05, + "loss": 0.4938, + "step": 16250 + }, + { + "epoch": 1.5777217155055308, + "grad_norm": 2.2967773637028315, + "learning_rate": 2.378155623337011e-05, + "loss": 0.48, + "step": 16260 + }, + { + "epoch": 1.5786920240636522, + "grad_norm": 1.6715177397086511, + "learning_rate": 2.3765331948861055e-05, + "loss": 0.5112, + "step": 16270 + }, + { + "epoch": 1.5796623326217736, + "grad_norm": 1.7543224867437088, + "learning_rate": 2.3749107664352e-05, + "loss": 0.5355, + "step": 16280 + }, + { + "epoch": 1.5806326411798952, + "grad_norm": 1.7059823329477148, + "learning_rate": 2.3732883379842948e-05, + "loss": 0.4885, + "step": 16290 + }, + { + "epoch": 1.5816029497380169, + "grad_norm": 1.808906067511985, + "learning_rate": 2.3716659095333894e-05, + "loss": 0.4883, + "step": 16300 + }, + { + "epoch": 1.5825732582961383, + "grad_norm": 1.456280506138187, + "learning_rate": 2.3700434810824844e-05, + "loss": 0.4986, + "step": 16310 + }, + { + "epoch": 1.5835435668542597, + "grad_norm": 1.7844142401171645, + "learning_rate": 2.368421052631579e-05, + "loss": 0.5726, + "step": 16320 + }, + { + "epoch": 1.584513875412381, + "grad_norm": 1.8511812344459693, + "learning_rate": 2.3667986241806737e-05, + "loss": 0.5418, + "step": 16330 + }, + { + "epoch": 1.5854841839705025, + "grad_norm": 1.6886781249735945, + "learning_rate": 2.3651761957297687e-05, + "loss": 0.4871, + "step": 16340 + }, + { + "epoch": 1.5864544925286241, + "grad_norm": 1.6386080299672316, + "learning_rate": 2.3635537672788633e-05, + "loss": 0.5502, + "step": 16350 + }, + { + "epoch": 1.5874248010867456, + "grad_norm": 1.6152930914041828, + "learning_rate": 2.361931338827958e-05, + "loss": 0.4602, + "step": 16360 + }, + { + "epoch": 1.5883951096448672, + "grad_norm": 1.9896290423806842, + "learning_rate": 2.3603089103770526e-05, + "loss": 0.5517, + "step": 16370 + }, + { + "epoch": 1.5893654182029886, + "grad_norm": 1.785358677833909, + "learning_rate": 2.3586864819261472e-05, + "loss": 0.5108, + "step": 16380 + }, + { + "epoch": 1.59033572676111, + "grad_norm": 1.8919452155428813, + "learning_rate": 2.357064053475242e-05, + "loss": 0.5586, + "step": 16390 + }, + { + "epoch": 1.5913060353192314, + "grad_norm": 2.1765937296152376, + "learning_rate": 2.3554416250243365e-05, + "loss": 0.4978, + "step": 16400 + }, + { + "epoch": 1.5922763438773528, + "grad_norm": 1.9463613536174769, + "learning_rate": 2.3538191965734315e-05, + "loss": 0.534, + "step": 16410 + }, + { + "epoch": 1.5932466524354745, + "grad_norm": 1.483428953982187, + "learning_rate": 2.352196768122526e-05, + "loss": 0.4481, + "step": 16420 + }, + { + "epoch": 1.594216960993596, + "grad_norm": 1.8946130151442728, + "learning_rate": 2.3505743396716207e-05, + "loss": 0.4499, + "step": 16430 + }, + { + "epoch": 1.5951872695517175, + "grad_norm": 1.496172632655443, + "learning_rate": 2.3489519112207154e-05, + "loss": 0.4719, + "step": 16440 + }, + { + "epoch": 1.596157578109839, + "grad_norm": 1.5582678654830617, + "learning_rate": 2.34732948276981e-05, + "loss": 0.5023, + "step": 16450 + }, + { + "epoch": 1.5971278866679604, + "grad_norm": 1.9101490466966036, + "learning_rate": 2.3457070543189047e-05, + "loss": 0.4442, + "step": 16460 + }, + { + "epoch": 1.5980981952260818, + "grad_norm": 2.5712850986928757, + "learning_rate": 2.3440846258679993e-05, + "loss": 0.5008, + "step": 16470 + }, + { + "epoch": 1.5990685037842034, + "grad_norm": 2.1177587573552805, + "learning_rate": 2.342462197417094e-05, + "loss": 0.5014, + "step": 16480 + }, + { + "epoch": 1.6000388123423248, + "grad_norm": 2.4990786716780455, + "learning_rate": 2.3408397689661886e-05, + "loss": 0.4583, + "step": 16490 + }, + { + "epoch": 1.6010091209004464, + "grad_norm": 1.91427095412827, + "learning_rate": 2.3392173405152835e-05, + "loss": 0.5037, + "step": 16500 + }, + { + "epoch": 1.6010091209004464, + "eval_loss": 0.6359612345695496, + "eval_runtime": 2467.4852, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 16500 + }, + { + "epoch": 1.6019794294585679, + "grad_norm": 1.8370649227416649, + "learning_rate": 2.3375949120643782e-05, + "loss": 0.5399, + "step": 16510 + }, + { + "epoch": 1.6029497380166893, + "grad_norm": 1.6932084838539212, + "learning_rate": 2.3359724836134728e-05, + "loss": 0.5212, + "step": 16520 + }, + { + "epoch": 1.6039200465748107, + "grad_norm": 1.68790954641985, + "learning_rate": 2.3343500551625675e-05, + "loss": 0.4837, + "step": 16530 + }, + { + "epoch": 1.6048903551329323, + "grad_norm": 1.5773573029174093, + "learning_rate": 2.332727626711662e-05, + "loss": 0.5125, + "step": 16540 + }, + { + "epoch": 1.6058606636910537, + "grad_norm": 1.7640635185794997, + "learning_rate": 2.3311051982607567e-05, + "loss": 0.5013, + "step": 16550 + }, + { + "epoch": 1.6068309722491754, + "grad_norm": 1.7477056146726457, + "learning_rate": 2.3294827698098514e-05, + "loss": 0.5095, + "step": 16560 + }, + { + "epoch": 1.6078012808072968, + "grad_norm": 1.0733428522302542, + "learning_rate": 2.327860341358946e-05, + "loss": 0.5, + "step": 16570 + }, + { + "epoch": 1.6087715893654182, + "grad_norm": 1.9564292094193194, + "learning_rate": 2.326237912908041e-05, + "loss": 0.5058, + "step": 16580 + }, + { + "epoch": 1.6097418979235396, + "grad_norm": 2.1045720941333466, + "learning_rate": 2.3246154844571356e-05, + "loss": 0.5101, + "step": 16590 + }, + { + "epoch": 1.610712206481661, + "grad_norm": 2.171187966701804, + "learning_rate": 2.3229930560062303e-05, + "loss": 0.5069, + "step": 16600 + }, + { + "epoch": 1.6116825150397827, + "grad_norm": 1.9610313890988438, + "learning_rate": 2.321370627555325e-05, + "loss": 0.4904, + "step": 16610 + }, + { + "epoch": 1.6126528235979043, + "grad_norm": 1.941252888287077, + "learning_rate": 2.3197481991044195e-05, + "loss": 0.4865, + "step": 16620 + }, + { + "epoch": 1.6136231321560257, + "grad_norm": 1.897218836962928, + "learning_rate": 2.3181257706535142e-05, + "loss": 0.4966, + "step": 16630 + }, + { + "epoch": 1.6145934407141471, + "grad_norm": 2.201820668610594, + "learning_rate": 2.3165033422026088e-05, + "loss": 0.4895, + "step": 16640 + }, + { + "epoch": 1.6155637492722685, + "grad_norm": 1.5720192215380684, + "learning_rate": 2.3148809137517035e-05, + "loss": 0.4918, + "step": 16650 + }, + { + "epoch": 1.61653405783039, + "grad_norm": 2.227704713223705, + "learning_rate": 2.313258485300798e-05, + "loss": 0.5251, + "step": 16660 + }, + { + "epoch": 1.6175043663885116, + "grad_norm": 2.0170271312798707, + "learning_rate": 2.311636056849893e-05, + "loss": 0.4965, + "step": 16670 + }, + { + "epoch": 1.618474674946633, + "grad_norm": 1.9688156858912216, + "learning_rate": 2.3100136283989877e-05, + "loss": 0.4685, + "step": 16680 + }, + { + "epoch": 1.6194449835047546, + "grad_norm": 2.0958524562959675, + "learning_rate": 2.3083911999480824e-05, + "loss": 0.4884, + "step": 16690 + }, + { + "epoch": 1.620415292062876, + "grad_norm": 2.0973339157843447, + "learning_rate": 2.306768771497177e-05, + "loss": 0.4394, + "step": 16700 + }, + { + "epoch": 1.6213856006209975, + "grad_norm": 1.5852203386480506, + "learning_rate": 2.3051463430462716e-05, + "loss": 0.5068, + "step": 16710 + }, + { + "epoch": 1.6223559091791189, + "grad_norm": 1.907182215766834, + "learning_rate": 2.3035239145953663e-05, + "loss": 0.461, + "step": 16720 + }, + { + "epoch": 1.6233262177372403, + "grad_norm": 1.9903511844679715, + "learning_rate": 2.3019014861444612e-05, + "loss": 0.4861, + "step": 16730 + }, + { + "epoch": 1.624296526295362, + "grad_norm": 1.8734845445192632, + "learning_rate": 2.300279057693556e-05, + "loss": 0.5203, + "step": 16740 + }, + { + "epoch": 1.6252668348534836, + "grad_norm": 1.4830887784438085, + "learning_rate": 2.2986566292426505e-05, + "loss": 0.5504, + "step": 16750 + }, + { + "epoch": 1.626237143411605, + "grad_norm": 1.9852817046238187, + "learning_rate": 2.297034200791745e-05, + "loss": 0.5727, + "step": 16760 + }, + { + "epoch": 1.6272074519697264, + "grad_norm": 1.9574779506866022, + "learning_rate": 2.29541177234084e-05, + "loss": 0.4932, + "step": 16770 + }, + { + "epoch": 1.6281777605278478, + "grad_norm": 1.8045639100087034, + "learning_rate": 2.2937893438899348e-05, + "loss": 0.4874, + "step": 16780 + }, + { + "epoch": 1.6291480690859692, + "grad_norm": 1.9467881794111201, + "learning_rate": 2.2921669154390294e-05, + "loss": 0.4231, + "step": 16790 + }, + { + "epoch": 1.6301183776440908, + "grad_norm": 2.115958262042766, + "learning_rate": 2.290544486988124e-05, + "loss": 0.5085, + "step": 16800 + }, + { + "epoch": 1.6310886862022123, + "grad_norm": 1.9242055352600025, + "learning_rate": 2.2889220585372187e-05, + "loss": 0.4961, + "step": 16810 + }, + { + "epoch": 1.632058994760334, + "grad_norm": 1.7473969264432643, + "learning_rate": 2.2872996300863133e-05, + "loss": 0.4896, + "step": 16820 + }, + { + "epoch": 1.6330293033184553, + "grad_norm": 1.9086620383723776, + "learning_rate": 2.285677201635408e-05, + "loss": 0.5793, + "step": 16830 + }, + { + "epoch": 1.6339996118765767, + "grad_norm": 1.8179786231819581, + "learning_rate": 2.2840547731845026e-05, + "loss": 0.47, + "step": 16840 + }, + { + "epoch": 1.6349699204346981, + "grad_norm": 2.3742959935176993, + "learning_rate": 2.2824323447335972e-05, + "loss": 0.4601, + "step": 16850 + }, + { + "epoch": 1.6359402289928198, + "grad_norm": 1.5831090068828182, + "learning_rate": 2.2808099162826922e-05, + "loss": 0.527, + "step": 16860 + }, + { + "epoch": 1.6369105375509412, + "grad_norm": 2.429466466556868, + "learning_rate": 2.279187487831787e-05, + "loss": 0.5041, + "step": 16870 + }, + { + "epoch": 1.6378808461090628, + "grad_norm": 2.0666827078950676, + "learning_rate": 2.2775650593808815e-05, + "loss": 0.5093, + "step": 16880 + }, + { + "epoch": 1.6388511546671842, + "grad_norm": 1.7376049636619024, + "learning_rate": 2.275942630929976e-05, + "loss": 0.4899, + "step": 16890 + }, + { + "epoch": 1.6398214632253056, + "grad_norm": 1.9743952968077716, + "learning_rate": 2.2743202024790708e-05, + "loss": 0.4483, + "step": 16900 + }, + { + "epoch": 1.640791771783427, + "grad_norm": 1.7020982614990035, + "learning_rate": 2.2726977740281654e-05, + "loss": 0.4746, + "step": 16910 + }, + { + "epoch": 1.6417620803415485, + "grad_norm": 1.5288620918579647, + "learning_rate": 2.27107534557726e-05, + "loss": 0.5386, + "step": 16920 + }, + { + "epoch": 1.64273238889967, + "grad_norm": 2.2418459177194725, + "learning_rate": 2.2694529171263547e-05, + "loss": 0.5221, + "step": 16930 + }, + { + "epoch": 1.6437026974577917, + "grad_norm": 1.7801442444217204, + "learning_rate": 2.2678304886754497e-05, + "loss": 0.502, + "step": 16940 + }, + { + "epoch": 1.6446730060159132, + "grad_norm": 2.750428317407271, + "learning_rate": 2.2662080602245443e-05, + "loss": 0.4947, + "step": 16950 + }, + { + "epoch": 1.6456433145740346, + "grad_norm": 1.5310655899660852, + "learning_rate": 2.264585631773639e-05, + "loss": 0.5228, + "step": 16960 + }, + { + "epoch": 1.646613623132156, + "grad_norm": 1.7325660800102027, + "learning_rate": 2.2629632033227336e-05, + "loss": 0.4724, + "step": 16970 + }, + { + "epoch": 1.6475839316902774, + "grad_norm": 3.028000821794418, + "learning_rate": 2.2613407748718282e-05, + "loss": 0.5444, + "step": 16980 + }, + { + "epoch": 1.648554240248399, + "grad_norm": 1.8847091529095503, + "learning_rate": 2.259718346420923e-05, + "loss": 0.5095, + "step": 16990 + }, + { + "epoch": 1.6495245488065204, + "grad_norm": 1.584479445600283, + "learning_rate": 2.2580959179700175e-05, + "loss": 0.5471, + "step": 17000 + }, + { + "epoch": 1.6495245488065204, + "eval_loss": 0.6354050636291504, + "eval_runtime": 2468.5352, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.363, + "step": 17000 + }, + { + "epoch": 1.650494857364642, + "grad_norm": 1.668904824652418, + "learning_rate": 2.256473489519112e-05, + "loss": 0.4838, + "step": 17010 + }, + { + "epoch": 1.6514651659227635, + "grad_norm": 1.7782439469811153, + "learning_rate": 2.2548510610682068e-05, + "loss": 0.5225, + "step": 17020 + }, + { + "epoch": 1.652435474480885, + "grad_norm": 2.1175538832822576, + "learning_rate": 2.2532286326173017e-05, + "loss": 0.5438, + "step": 17030 + }, + { + "epoch": 1.6534057830390063, + "grad_norm": 1.9031650594744414, + "learning_rate": 2.2516062041663964e-05, + "loss": 0.4456, + "step": 17040 + }, + { + "epoch": 1.6543760915971277, + "grad_norm": 1.8418596497226953, + "learning_rate": 2.249983775715491e-05, + "loss": 0.5148, + "step": 17050 + }, + { + "epoch": 1.6553464001552494, + "grad_norm": 2.001058155829245, + "learning_rate": 2.2483613472645857e-05, + "loss": 0.5381, + "step": 17060 + }, + { + "epoch": 1.656316708713371, + "grad_norm": 1.6070553409553883, + "learning_rate": 2.2467389188136803e-05, + "loss": 0.5199, + "step": 17070 + }, + { + "epoch": 1.6572870172714924, + "grad_norm": 1.8977480591408646, + "learning_rate": 2.245116490362775e-05, + "loss": 0.5132, + "step": 17080 + }, + { + "epoch": 1.6582573258296138, + "grad_norm": 1.8011894993916608, + "learning_rate": 2.2434940619118696e-05, + "loss": 0.5448, + "step": 17090 + }, + { + "epoch": 1.6592276343877352, + "grad_norm": 2.0284255015217565, + "learning_rate": 2.2418716334609642e-05, + "loss": 0.4717, + "step": 17100 + }, + { + "epoch": 1.6601979429458567, + "grad_norm": 1.885999888526123, + "learning_rate": 2.2402492050100592e-05, + "loss": 0.5183, + "step": 17110 + }, + { + "epoch": 1.6611682515039783, + "grad_norm": 1.9056970812089797, + "learning_rate": 2.2386267765591538e-05, + "loss": 0.4393, + "step": 17120 + }, + { + "epoch": 1.6621385600620997, + "grad_norm": 2.267841170403769, + "learning_rate": 2.2370043481082488e-05, + "loss": 0.4882, + "step": 17130 + }, + { + "epoch": 1.6631088686202213, + "grad_norm": 1.5488822989395437, + "learning_rate": 2.2353819196573434e-05, + "loss": 0.5294, + "step": 17140 + }, + { + "epoch": 1.6640791771783427, + "grad_norm": 1.992034372217963, + "learning_rate": 2.233759491206438e-05, + "loss": 0.4925, + "step": 17150 + }, + { + "epoch": 1.6650494857364642, + "grad_norm": 1.7050665334433257, + "learning_rate": 2.2321370627555327e-05, + "loss": 0.5237, + "step": 17160 + }, + { + "epoch": 1.6660197942945856, + "grad_norm": 1.8164813364072325, + "learning_rate": 2.2305146343046274e-05, + "loss": 0.5128, + "step": 17170 + }, + { + "epoch": 1.6669901028527072, + "grad_norm": 1.7407964429381908, + "learning_rate": 2.228892205853722e-05, + "loss": 0.4812, + "step": 17180 + }, + { + "epoch": 1.6679604114108286, + "grad_norm": 2.220803029174025, + "learning_rate": 2.2272697774028166e-05, + "loss": 0.4936, + "step": 17190 + }, + { + "epoch": 1.6689307199689503, + "grad_norm": 1.7697338691239146, + "learning_rate": 2.2256473489519113e-05, + "loss": 0.5208, + "step": 17200 + }, + { + "epoch": 1.6699010285270717, + "grad_norm": 2.2487670732225085, + "learning_rate": 2.224024920501006e-05, + "loss": 0.4833, + "step": 17210 + }, + { + "epoch": 1.670871337085193, + "grad_norm": 1.888912866564453, + "learning_rate": 2.222402492050101e-05, + "loss": 0.5287, + "step": 17220 + }, + { + "epoch": 1.6718416456433145, + "grad_norm": 1.7902985235611324, + "learning_rate": 2.2207800635991955e-05, + "loss": 0.5271, + "step": 17230 + }, + { + "epoch": 1.672811954201436, + "grad_norm": 2.2626922106136607, + "learning_rate": 2.21915763514829e-05, + "loss": 0.5062, + "step": 17240 + }, + { + "epoch": 1.6737822627595575, + "grad_norm": 2.0509489741448923, + "learning_rate": 2.2175352066973848e-05, + "loss": 0.4586, + "step": 17250 + }, + { + "epoch": 1.6747525713176792, + "grad_norm": 1.7028309180471788, + "learning_rate": 2.2159127782464794e-05, + "loss": 0.5591, + "step": 17260 + }, + { + "epoch": 1.6757228798758006, + "grad_norm": 1.8148060135409945, + "learning_rate": 2.214290349795574e-05, + "loss": 0.4708, + "step": 17270 + }, + { + "epoch": 1.676693188433922, + "grad_norm": 1.7415357367009103, + "learning_rate": 2.2126679213446687e-05, + "loss": 0.4877, + "step": 17280 + }, + { + "epoch": 1.6776634969920434, + "grad_norm": 1.6879714610876217, + "learning_rate": 2.2110454928937634e-05, + "loss": 0.4838, + "step": 17290 + }, + { + "epoch": 1.6786338055501648, + "grad_norm": 1.3869597913004987, + "learning_rate": 2.2094230644428583e-05, + "loss": 0.496, + "step": 17300 + }, + { + "epoch": 1.6796041141082865, + "grad_norm": 1.4730335934961858, + "learning_rate": 2.207800635991953e-05, + "loss": 0.4926, + "step": 17310 + }, + { + "epoch": 1.6805744226664079, + "grad_norm": 1.771999456693497, + "learning_rate": 2.2061782075410476e-05, + "loss": 0.4993, + "step": 17320 + }, + { + "epoch": 1.6815447312245295, + "grad_norm": 1.6689193723022826, + "learning_rate": 2.2045557790901423e-05, + "loss": 0.4815, + "step": 17330 + }, + { + "epoch": 1.682515039782651, + "grad_norm": 2.1543699729043717, + "learning_rate": 2.202933350639237e-05, + "loss": 0.4772, + "step": 17340 + }, + { + "epoch": 1.6834853483407723, + "grad_norm": 1.7964447035669882, + "learning_rate": 2.2013109221883315e-05, + "loss": 0.4872, + "step": 17350 + }, + { + "epoch": 1.6844556568988938, + "grad_norm": 1.5499702179216124, + "learning_rate": 2.199688493737426e-05, + "loss": 0.5115, + "step": 17360 + }, + { + "epoch": 1.6854259654570152, + "grad_norm": 1.736034409254174, + "learning_rate": 2.1980660652865208e-05, + "loss": 0.5004, + "step": 17370 + }, + { + "epoch": 1.6863962740151368, + "grad_norm": 1.6670883800573928, + "learning_rate": 2.1964436368356154e-05, + "loss": 0.5172, + "step": 17380 + }, + { + "epoch": 1.6873665825732584, + "grad_norm": 1.8976878178656118, + "learning_rate": 2.1948212083847104e-05, + "loss": 0.5056, + "step": 17390 + }, + { + "epoch": 1.6883368911313799, + "grad_norm": 1.7825783485749558, + "learning_rate": 2.193198779933805e-05, + "loss": 0.5079, + "step": 17400 + }, + { + "epoch": 1.6893071996895013, + "grad_norm": 1.963783922685446, + "learning_rate": 2.1915763514828997e-05, + "loss": 0.5595, + "step": 17410 + }, + { + "epoch": 1.6902775082476227, + "grad_norm": 1.577100524669314, + "learning_rate": 2.1899539230319943e-05, + "loss": 0.5122, + "step": 17420 + }, + { + "epoch": 1.691247816805744, + "grad_norm": 1.9224227001223828, + "learning_rate": 2.188331494581089e-05, + "loss": 0.5179, + "step": 17430 + }, + { + "epoch": 1.6922181253638657, + "grad_norm": 1.968807335226898, + "learning_rate": 2.1867090661301836e-05, + "loss": 0.4811, + "step": 17440 + }, + { + "epoch": 1.6931884339219871, + "grad_norm": 1.8656524316794496, + "learning_rate": 2.1850866376792782e-05, + "loss": 0.5562, + "step": 17450 + }, + { + "epoch": 1.6941587424801088, + "grad_norm": 1.936995738098722, + "learning_rate": 2.183464209228373e-05, + "loss": 0.5128, + "step": 17460 + }, + { + "epoch": 1.6951290510382302, + "grad_norm": 1.716335988732862, + "learning_rate": 2.181841780777468e-05, + "loss": 0.4878, + "step": 17470 + }, + { + "epoch": 1.6960993595963516, + "grad_norm": 1.8671035370037838, + "learning_rate": 2.1802193523265625e-05, + "loss": 0.5631, + "step": 17480 + }, + { + "epoch": 1.697069668154473, + "grad_norm": 2.10031146751523, + "learning_rate": 2.178596923875657e-05, + "loss": 0.4911, + "step": 17490 + }, + { + "epoch": 1.6980399767125947, + "grad_norm": 1.4651955164387034, + "learning_rate": 2.1769744954247518e-05, + "loss": 0.4841, + "step": 17500 + }, + { + "epoch": 1.6980399767125947, + "eval_loss": 0.6339951157569885, + "eval_runtime": 2473.3585, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.362, + "step": 17500 + }, + { + "epoch": 1.699010285270716, + "grad_norm": 1.9647331919080824, + "learning_rate": 2.1753520669738468e-05, + "loss": 0.4788, + "step": 17510 + }, + { + "epoch": 1.6999805938288377, + "grad_norm": 1.9100412693293238, + "learning_rate": 2.1737296385229414e-05, + "loss": 0.475, + "step": 17520 + }, + { + "epoch": 1.7009509023869591, + "grad_norm": 1.7020406495492606, + "learning_rate": 2.172107210072036e-05, + "loss": 0.4908, + "step": 17530 + }, + { + "epoch": 1.7019212109450805, + "grad_norm": 1.4911580170253027, + "learning_rate": 2.1704847816211307e-05, + "loss": 0.4832, + "step": 17540 + }, + { + "epoch": 1.702891519503202, + "grad_norm": 2.086292759031188, + "learning_rate": 2.1688623531702253e-05, + "loss": 0.5129, + "step": 17550 + }, + { + "epoch": 1.7038618280613234, + "grad_norm": 1.5359738880009668, + "learning_rate": 2.16723992471932e-05, + "loss": 0.4512, + "step": 17560 + }, + { + "epoch": 1.704832136619445, + "grad_norm": 1.850242190602371, + "learning_rate": 2.165617496268415e-05, + "loss": 0.5224, + "step": 17570 + }, + { + "epoch": 1.7058024451775666, + "grad_norm": 1.2439323678887082, + "learning_rate": 2.1639950678175096e-05, + "loss": 0.4625, + "step": 17580 + }, + { + "epoch": 1.706772753735688, + "grad_norm": 1.5186947478659967, + "learning_rate": 2.1623726393666042e-05, + "loss": 0.5527, + "step": 17590 + }, + { + "epoch": 1.7077430622938095, + "grad_norm": 2.1869553971014035, + "learning_rate": 2.160750210915699e-05, + "loss": 0.499, + "step": 17600 + }, + { + "epoch": 1.7087133708519309, + "grad_norm": 2.1896368464295968, + "learning_rate": 2.1591277824647935e-05, + "loss": 0.5182, + "step": 17610 + }, + { + "epoch": 1.7096836794100523, + "grad_norm": 2.5508627899244853, + "learning_rate": 2.157505354013888e-05, + "loss": 0.529, + "step": 17620 + }, + { + "epoch": 1.710653987968174, + "grad_norm": 2.0451418684052527, + "learning_rate": 2.1558829255629828e-05, + "loss": 0.4789, + "step": 17630 + }, + { + "epoch": 1.7116242965262953, + "grad_norm": 2.358852269881603, + "learning_rate": 2.1542604971120774e-05, + "loss": 0.4701, + "step": 17640 + }, + { + "epoch": 1.712594605084417, + "grad_norm": 1.65876380363299, + "learning_rate": 2.152638068661172e-05, + "loss": 0.5078, + "step": 17650 + }, + { + "epoch": 1.7135649136425384, + "grad_norm": 1.819260137292149, + "learning_rate": 2.151015640210267e-05, + "loss": 0.5115, + "step": 17660 + }, + { + "epoch": 1.7145352222006598, + "grad_norm": 1.9278786735506124, + "learning_rate": 2.1493932117593616e-05, + "loss": 0.526, + "step": 17670 + }, + { + "epoch": 1.7155055307587812, + "grad_norm": 1.647908501988862, + "learning_rate": 2.1477707833084563e-05, + "loss": 0.4455, + "step": 17680 + }, + { + "epoch": 1.7164758393169026, + "grad_norm": 2.211325829098572, + "learning_rate": 2.146148354857551e-05, + "loss": 0.5251, + "step": 17690 + }, + { + "epoch": 1.7174461478750243, + "grad_norm": 2.068602886260178, + "learning_rate": 2.1445259264066456e-05, + "loss": 0.5173, + "step": 17700 + }, + { + "epoch": 1.718416456433146, + "grad_norm": 1.7976369940318933, + "learning_rate": 2.1429034979557402e-05, + "loss": 0.4997, + "step": 17710 + }, + { + "epoch": 1.7193867649912673, + "grad_norm": 1.8518665259593716, + "learning_rate": 2.141281069504835e-05, + "loss": 0.5221, + "step": 17720 + }, + { + "epoch": 1.7203570735493887, + "grad_norm": 2.162010082710502, + "learning_rate": 2.1396586410539295e-05, + "loss": 0.5096, + "step": 17730 + }, + { + "epoch": 1.7213273821075101, + "grad_norm": 2.3175061118466704, + "learning_rate": 2.138036212603024e-05, + "loss": 0.5203, + "step": 17740 + }, + { + "epoch": 1.7222976906656315, + "grad_norm": 2.0036851029306364, + "learning_rate": 2.136413784152119e-05, + "loss": 0.5312, + "step": 17750 + }, + { + "epoch": 1.7232679992237532, + "grad_norm": 2.232799745735125, + "learning_rate": 2.1347913557012137e-05, + "loss": 0.477, + "step": 17760 + }, + { + "epoch": 1.7242383077818746, + "grad_norm": 2.0579250461700225, + "learning_rate": 2.1331689272503084e-05, + "loss": 0.5104, + "step": 17770 + }, + { + "epoch": 1.7252086163399962, + "grad_norm": 1.9067412132990194, + "learning_rate": 2.131546498799403e-05, + "loss": 0.4663, + "step": 17780 + }, + { + "epoch": 1.7261789248981176, + "grad_norm": 1.6166777364493794, + "learning_rate": 2.1299240703484976e-05, + "loss": 0.5091, + "step": 17790 + }, + { + "epoch": 1.727149233456239, + "grad_norm": 1.8400065423176315, + "learning_rate": 2.1283016418975923e-05, + "loss": 0.4577, + "step": 17800 + }, + { + "epoch": 1.7281195420143605, + "grad_norm": 1.8343636819724598, + "learning_rate": 2.126679213446687e-05, + "loss": 0.5171, + "step": 17810 + }, + { + "epoch": 1.729089850572482, + "grad_norm": 2.035066789255769, + "learning_rate": 2.1250567849957816e-05, + "loss": 0.4809, + "step": 17820 + }, + { + "epoch": 1.7300601591306035, + "grad_norm": 1.9750470965118032, + "learning_rate": 2.1234343565448765e-05, + "loss": 0.4681, + "step": 17830 + }, + { + "epoch": 1.7310304676887251, + "grad_norm": 1.707449728088738, + "learning_rate": 2.1218119280939712e-05, + "loss": 0.4768, + "step": 17840 + }, + { + "epoch": 1.7320007762468466, + "grad_norm": 2.091683187172791, + "learning_rate": 2.1201894996430658e-05, + "loss": 0.4967, + "step": 17850 + }, + { + "epoch": 1.732971084804968, + "grad_norm": 1.8317230891625513, + "learning_rate": 2.1185670711921604e-05, + "loss": 0.4617, + "step": 17860 + }, + { + "epoch": 1.7339413933630894, + "grad_norm": 1.8581590049450867, + "learning_rate": 2.116944642741255e-05, + "loss": 0.453, + "step": 17870 + }, + { + "epoch": 1.7349117019212108, + "grad_norm": 2.394138871900483, + "learning_rate": 2.1153222142903497e-05, + "loss": 0.5231, + "step": 17880 + }, + { + "epoch": 1.7358820104793324, + "grad_norm": 1.8303455092887013, + "learning_rate": 2.1136997858394444e-05, + "loss": 0.5045, + "step": 17890 + }, + { + "epoch": 1.736852319037454, + "grad_norm": 1.7040025873966649, + "learning_rate": 2.1120773573885393e-05, + "loss": 0.5185, + "step": 17900 + }, + { + "epoch": 1.7378226275955755, + "grad_norm": 1.721795865054807, + "learning_rate": 2.110454928937634e-05, + "loss": 0.5153, + "step": 17910 + }, + { + "epoch": 1.738792936153697, + "grad_norm": 2.195542603436347, + "learning_rate": 2.1088325004867286e-05, + "loss": 0.515, + "step": 17920 + }, + { + "epoch": 1.7397632447118183, + "grad_norm": 1.9475497865392444, + "learning_rate": 2.1072100720358236e-05, + "loss": 0.6134, + "step": 17930 + }, + { + "epoch": 1.7407335532699397, + "grad_norm": 2.007827581925297, + "learning_rate": 2.1055876435849182e-05, + "loss": 0.4483, + "step": 17940 + }, + { + "epoch": 1.7417038618280614, + "grad_norm": 2.427289180607889, + "learning_rate": 2.103965215134013e-05, + "loss": 0.4791, + "step": 17950 + }, + { + "epoch": 1.7426741703861828, + "grad_norm": 2.090548009394418, + "learning_rate": 2.1023427866831075e-05, + "loss": 0.4485, + "step": 17960 + }, + { + "epoch": 1.7436444789443044, + "grad_norm": 1.4231148878110762, + "learning_rate": 2.100720358232202e-05, + "loss": 0.4819, + "step": 17970 + }, + { + "epoch": 1.7446147875024258, + "grad_norm": 1.952695177483555, + "learning_rate": 2.0990979297812968e-05, + "loss": 0.4615, + "step": 17980 + }, + { + "epoch": 1.7455850960605472, + "grad_norm": 1.5076062020543768, + "learning_rate": 2.0974755013303914e-05, + "loss": 0.5145, + "step": 17990 + }, + { + "epoch": 1.7465554046186686, + "grad_norm": 2.385317508868708, + "learning_rate": 2.095853072879486e-05, + "loss": 0.4667, + "step": 18000 + }, + { + "epoch": 1.7465554046186686, + "eval_loss": 0.6351094841957092, + "eval_runtime": 2471.0101, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.363, + "step": 18000 + }, + { + "epoch": 1.74752571317679, + "grad_norm": 1.5204879350818516, + "learning_rate": 2.0942306444285807e-05, + "loss": 0.4808, + "step": 18010 + }, + { + "epoch": 1.7484960217349117, + "grad_norm": 2.363404313519829, + "learning_rate": 2.0926082159776757e-05, + "loss": 0.4829, + "step": 18020 + }, + { + "epoch": 1.7494663302930333, + "grad_norm": 1.771660432749116, + "learning_rate": 2.0909857875267703e-05, + "loss": 0.4838, + "step": 18030 + }, + { + "epoch": 1.7504366388511547, + "grad_norm": 1.754309450094576, + "learning_rate": 2.089363359075865e-05, + "loss": 0.5289, + "step": 18040 + }, + { + "epoch": 1.7514069474092762, + "grad_norm": 1.7394710721825013, + "learning_rate": 2.0877409306249596e-05, + "loss": 0.4681, + "step": 18050 + }, + { + "epoch": 1.7523772559673976, + "grad_norm": 1.903002291309086, + "learning_rate": 2.0861185021740542e-05, + "loss": 0.4352, + "step": 18060 + }, + { + "epoch": 1.753347564525519, + "grad_norm": 1.8599137833426156, + "learning_rate": 2.084496073723149e-05, + "loss": 0.5158, + "step": 18070 + }, + { + "epoch": 1.7543178730836406, + "grad_norm": 2.416977234001108, + "learning_rate": 2.0828736452722435e-05, + "loss": 0.458, + "step": 18080 + }, + { + "epoch": 1.755288181641762, + "grad_norm": 2.0471286021548876, + "learning_rate": 2.081251216821338e-05, + "loss": 0.5093, + "step": 18090 + }, + { + "epoch": 1.7562584901998837, + "grad_norm": 1.676028789821819, + "learning_rate": 2.079628788370433e-05, + "loss": 0.4682, + "step": 18100 + }, + { + "epoch": 1.757228798758005, + "grad_norm": 2.27723774043769, + "learning_rate": 2.0780063599195278e-05, + "loss": 0.4844, + "step": 18110 + }, + { + "epoch": 1.7581991073161265, + "grad_norm": 1.9065062110447546, + "learning_rate": 2.0763839314686224e-05, + "loss": 0.4913, + "step": 18120 + }, + { + "epoch": 1.759169415874248, + "grad_norm": 1.499218120899005, + "learning_rate": 2.074761503017717e-05, + "loss": 0.4764, + "step": 18130 + }, + { + "epoch": 1.7601397244323695, + "grad_norm": 1.7111231535151605, + "learning_rate": 2.0731390745668117e-05, + "loss": 0.4376, + "step": 18140 + }, + { + "epoch": 1.761110032990491, + "grad_norm": 2.1731054208801406, + "learning_rate": 2.0715166461159063e-05, + "loss": 0.5115, + "step": 18150 + }, + { + "epoch": 1.7620803415486126, + "grad_norm": 2.537816428144228, + "learning_rate": 2.069894217665001e-05, + "loss": 0.4872, + "step": 18160 + }, + { + "epoch": 1.763050650106734, + "grad_norm": 1.8860270703852153, + "learning_rate": 2.0682717892140956e-05, + "loss": 0.544, + "step": 18170 + }, + { + "epoch": 1.7640209586648554, + "grad_norm": 2.021054656113362, + "learning_rate": 2.0666493607631902e-05, + "loss": 0.4595, + "step": 18180 + }, + { + "epoch": 1.7649912672229768, + "grad_norm": 1.7916097460233942, + "learning_rate": 2.0650269323122852e-05, + "loss": 0.5098, + "step": 18190 + }, + { + "epoch": 1.7659615757810982, + "grad_norm": 2.3100480138537, + "learning_rate": 2.06340450386138e-05, + "loss": 0.4801, + "step": 18200 + }, + { + "epoch": 1.7669318843392199, + "grad_norm": 2.4377132473167413, + "learning_rate": 2.0617820754104745e-05, + "loss": 0.4507, + "step": 18210 + }, + { + "epoch": 1.7679021928973415, + "grad_norm": 2.303644526731782, + "learning_rate": 2.060159646959569e-05, + "loss": 0.4557, + "step": 18220 + }, + { + "epoch": 1.768872501455463, + "grad_norm": 2.104570073246246, + "learning_rate": 2.0585372185086638e-05, + "loss": 0.4657, + "step": 18230 + }, + { + "epoch": 1.7698428100135843, + "grad_norm": 1.902037691736978, + "learning_rate": 2.0569147900577584e-05, + "loss": 0.4885, + "step": 18240 + }, + { + "epoch": 1.7708131185717058, + "grad_norm": 1.8321649978057852, + "learning_rate": 2.055292361606853e-05, + "loss": 0.4953, + "step": 18250 + }, + { + "epoch": 1.7717834271298272, + "grad_norm": 1.7318686323178791, + "learning_rate": 2.0536699331559477e-05, + "loss": 0.4993, + "step": 18260 + }, + { + "epoch": 1.7727537356879488, + "grad_norm": 1.6339592524860076, + "learning_rate": 2.0520475047050423e-05, + "loss": 0.5311, + "step": 18270 + }, + { + "epoch": 1.7737240442460702, + "grad_norm": 2.0377735402561448, + "learning_rate": 2.0504250762541373e-05, + "loss": 0.4785, + "step": 18280 + }, + { + "epoch": 1.7746943528041919, + "grad_norm": 1.4282663698707199, + "learning_rate": 2.0488026478032323e-05, + "loss": 0.5077, + "step": 18290 + }, + { + "epoch": 1.7756646613623133, + "grad_norm": 1.9149293437143238, + "learning_rate": 2.047180219352327e-05, + "loss": 0.5319, + "step": 18300 + }, + { + "epoch": 1.7766349699204347, + "grad_norm": 2.1578236484875077, + "learning_rate": 2.0455577909014215e-05, + "loss": 0.4928, + "step": 18310 + }, + { + "epoch": 1.777605278478556, + "grad_norm": 2.107005500307039, + "learning_rate": 2.0439353624505162e-05, + "loss": 0.4969, + "step": 18320 + }, + { + "epoch": 1.7785755870366775, + "grad_norm": 1.9793180118959242, + "learning_rate": 2.0423129339996108e-05, + "loss": 0.4585, + "step": 18330 + }, + { + "epoch": 1.7795458955947991, + "grad_norm": 1.8333976168598185, + "learning_rate": 2.0406905055487055e-05, + "loss": 0.4982, + "step": 18340 + }, + { + "epoch": 1.7805162041529208, + "grad_norm": 1.9049818949759387, + "learning_rate": 2.0390680770978e-05, + "loss": 0.494, + "step": 18350 + }, + { + "epoch": 1.7814865127110422, + "grad_norm": 1.9993776566406598, + "learning_rate": 2.0374456486468947e-05, + "loss": 0.4918, + "step": 18360 + }, + { + "epoch": 1.7824568212691636, + "grad_norm": 1.901331422597336, + "learning_rate": 2.0358232201959894e-05, + "loss": 0.5138, + "step": 18370 + }, + { + "epoch": 1.783427129827285, + "grad_norm": 1.6972755513639741, + "learning_rate": 2.0342007917450843e-05, + "loss": 0.4715, + "step": 18380 + }, + { + "epoch": 1.7843974383854064, + "grad_norm": 1.7683176039213586, + "learning_rate": 2.032578363294179e-05, + "loss": 0.4917, + "step": 18390 + }, + { + "epoch": 1.785367746943528, + "grad_norm": 2.473174371442726, + "learning_rate": 2.0309559348432736e-05, + "loss": 0.5202, + "step": 18400 + }, + { + "epoch": 1.7863380555016495, + "grad_norm": 1.5926803956510256, + "learning_rate": 2.0293335063923683e-05, + "loss": 0.4988, + "step": 18410 + }, + { + "epoch": 1.7873083640597711, + "grad_norm": 2.291647967076523, + "learning_rate": 2.027711077941463e-05, + "loss": 0.4824, + "step": 18420 + }, + { + "epoch": 1.7882786726178925, + "grad_norm": 1.8562307329324135, + "learning_rate": 2.0260886494905575e-05, + "loss": 0.4563, + "step": 18430 + }, + { + "epoch": 1.789248981176014, + "grad_norm": 2.0376982504169225, + "learning_rate": 2.0244662210396522e-05, + "loss": 0.5306, + "step": 18440 + }, + { + "epoch": 1.7902192897341354, + "grad_norm": 2.2229061869152287, + "learning_rate": 2.0228437925887468e-05, + "loss": 0.4992, + "step": 18450 + }, + { + "epoch": 1.791189598292257, + "grad_norm": 1.9381177105445806, + "learning_rate": 2.0212213641378418e-05, + "loss": 0.5163, + "step": 18460 + }, + { + "epoch": 1.7921599068503784, + "grad_norm": 2.2038889339934276, + "learning_rate": 2.0195989356869364e-05, + "loss": 0.4872, + "step": 18470 + }, + { + "epoch": 1.7931302154085, + "grad_norm": 1.7467002610372642, + "learning_rate": 2.017976507236031e-05, + "loss": 0.4876, + "step": 18480 + }, + { + "epoch": 1.7941005239666215, + "grad_norm": 1.80516559021005, + "learning_rate": 2.0163540787851257e-05, + "loss": 0.5155, + "step": 18490 + }, + { + "epoch": 1.7950708325247429, + "grad_norm": 1.6841944257951464, + "learning_rate": 2.0147316503342203e-05, + "loss": 0.485, + "step": 18500 + }, + { + "epoch": 1.7950708325247429, + "eval_loss": 0.634519636631012, + "eval_runtime": 2476.7893, + "eval_samples_per_second": 0.724, + "eval_steps_per_second": 0.362, + "step": 18500 + }, + { + "epoch": 1.7960411410828643, + "grad_norm": 1.9439021855874303, + "learning_rate": 2.013109221883315e-05, + "loss": 0.452, + "step": 18510 + }, + { + "epoch": 1.7970114496409857, + "grad_norm": 2.2513317080100226, + "learning_rate": 2.0114867934324096e-05, + "loss": 0.5325, + "step": 18520 + }, + { + "epoch": 1.7979817581991073, + "grad_norm": 2.08111292912013, + "learning_rate": 2.0098643649815043e-05, + "loss": 0.5473, + "step": 18530 + }, + { + "epoch": 1.798952066757229, + "grad_norm": 1.5733834399311872, + "learning_rate": 2.008241936530599e-05, + "loss": 0.477, + "step": 18540 + }, + { + "epoch": 1.7999223753153504, + "grad_norm": 2.8502682562631954, + "learning_rate": 2.006619508079694e-05, + "loss": 0.5206, + "step": 18550 + }, + { + "epoch": 1.8008926838734718, + "grad_norm": 1.753275957528235, + "learning_rate": 2.0049970796287885e-05, + "loss": 0.4547, + "step": 18560 + }, + { + "epoch": 1.8018629924315932, + "grad_norm": 2.1314672834115473, + "learning_rate": 2.003374651177883e-05, + "loss": 0.5419, + "step": 18570 + }, + { + "epoch": 1.8028333009897146, + "grad_norm": 1.9602167097784182, + "learning_rate": 2.0017522227269778e-05, + "loss": 0.5155, + "step": 18580 + }, + { + "epoch": 1.8038036095478363, + "grad_norm": 1.938753313860514, + "learning_rate": 2.0001297942760724e-05, + "loss": 0.5068, + "step": 18590 + }, + { + "epoch": 1.8047739181059577, + "grad_norm": 2.5245908920622924, + "learning_rate": 1.998507365825167e-05, + "loss": 0.4823, + "step": 18600 + }, + { + "epoch": 1.8057442266640793, + "grad_norm": 1.8472353172834934, + "learning_rate": 1.9968849373742617e-05, + "loss": 0.5307, + "step": 18610 + }, + { + "epoch": 1.8067145352222007, + "grad_norm": 2.119846040052731, + "learning_rate": 1.9952625089233563e-05, + "loss": 0.5497, + "step": 18620 + }, + { + "epoch": 1.8076848437803221, + "grad_norm": 1.8254352104689588, + "learning_rate": 1.9936400804724513e-05, + "loss": 0.5343, + "step": 18630 + }, + { + "epoch": 1.8086551523384435, + "grad_norm": 2.286926975318207, + "learning_rate": 1.992017652021546e-05, + "loss": 0.5079, + "step": 18640 + }, + { + "epoch": 1.809625460896565, + "grad_norm": 2.145456846814811, + "learning_rate": 1.9903952235706406e-05, + "loss": 0.4792, + "step": 18650 + }, + { + "epoch": 1.8105957694546866, + "grad_norm": 2.1237990360529615, + "learning_rate": 1.9887727951197352e-05, + "loss": 0.5221, + "step": 18660 + }, + { + "epoch": 1.8115660780128082, + "grad_norm": 1.9365349043963929, + "learning_rate": 1.98715036666883e-05, + "loss": 0.4994, + "step": 18670 + }, + { + "epoch": 1.8125363865709296, + "grad_norm": 2.1477510328366543, + "learning_rate": 1.985527938217925e-05, + "loss": 0.5051, + "step": 18680 + }, + { + "epoch": 1.813506695129051, + "grad_norm": 2.24009021245378, + "learning_rate": 1.9839055097670195e-05, + "loss": 0.4411, + "step": 18690 + }, + { + "epoch": 1.8144770036871725, + "grad_norm": 2.161147622755333, + "learning_rate": 1.982283081316114e-05, + "loss": 0.4947, + "step": 18700 + }, + { + "epoch": 1.8154473122452939, + "grad_norm": 2.5777752480302296, + "learning_rate": 1.9806606528652088e-05, + "loss": 0.5027, + "step": 18710 + }, + { + "epoch": 1.8164176208034155, + "grad_norm": 1.5289437881489423, + "learning_rate": 1.9790382244143034e-05, + "loss": 0.5276, + "step": 18720 + }, + { + "epoch": 1.817387929361537, + "grad_norm": 1.808692912848697, + "learning_rate": 1.977415795963398e-05, + "loss": 0.5155, + "step": 18730 + }, + { + "epoch": 1.8183582379196586, + "grad_norm": 1.7568825751279589, + "learning_rate": 1.975793367512493e-05, + "loss": 0.4658, + "step": 18740 + }, + { + "epoch": 1.81932854647778, + "grad_norm": 1.5226619183552335, + "learning_rate": 1.9741709390615877e-05, + "loss": 0.4764, + "step": 18750 + }, + { + "epoch": 1.8202988550359014, + "grad_norm": 1.5664856097734037, + "learning_rate": 1.9725485106106823e-05, + "loss": 0.4503, + "step": 18760 + }, + { + "epoch": 1.8212691635940228, + "grad_norm": 1.7127846075920363, + "learning_rate": 1.970926082159777e-05, + "loss": 0.4558, + "step": 18770 + }, + { + "epoch": 1.8222394721521444, + "grad_norm": 2.1030557687163007, + "learning_rate": 1.9693036537088716e-05, + "loss": 0.5578, + "step": 18780 + }, + { + "epoch": 1.8232097807102658, + "grad_norm": 2.0907960296944474, + "learning_rate": 1.9676812252579662e-05, + "loss": 0.4974, + "step": 18790 + }, + { + "epoch": 1.8241800892683875, + "grad_norm": 1.4612012638589904, + "learning_rate": 1.966058796807061e-05, + "loss": 0.4931, + "step": 18800 + }, + { + "epoch": 1.825150397826509, + "grad_norm": 1.6970216305431254, + "learning_rate": 1.9644363683561555e-05, + "loss": 0.4691, + "step": 18810 + }, + { + "epoch": 1.8261207063846303, + "grad_norm": 2.4415425125364125, + "learning_rate": 1.9628139399052505e-05, + "loss": 0.5249, + "step": 18820 + }, + { + "epoch": 1.8270910149427517, + "grad_norm": 1.7771990898290009, + "learning_rate": 1.961191511454345e-05, + "loss": 0.5019, + "step": 18830 + }, + { + "epoch": 1.8280613235008731, + "grad_norm": 1.7839717190667712, + "learning_rate": 1.9595690830034397e-05, + "loss": 0.5052, + "step": 18840 + }, + { + "epoch": 1.8290316320589948, + "grad_norm": 2.209198913639542, + "learning_rate": 1.9579466545525344e-05, + "loss": 0.5235, + "step": 18850 + }, + { + "epoch": 1.8300019406171164, + "grad_norm": 2.200681320097298, + "learning_rate": 1.956324226101629e-05, + "loss": 0.5302, + "step": 18860 + }, + { + "epoch": 1.8309722491752378, + "grad_norm": 2.000921134970765, + "learning_rate": 1.9547017976507237e-05, + "loss": 0.4648, + "step": 18870 + }, + { + "epoch": 1.8319425577333592, + "grad_norm": 2.623565257277865, + "learning_rate": 1.9530793691998183e-05, + "loss": 0.4855, + "step": 18880 + }, + { + "epoch": 1.8329128662914806, + "grad_norm": 1.8550961862060569, + "learning_rate": 1.951456940748913e-05, + "loss": 0.5135, + "step": 18890 + }, + { + "epoch": 1.833883174849602, + "grad_norm": 2.0777083761268718, + "learning_rate": 1.9498345122980076e-05, + "loss": 0.5236, + "step": 18900 + }, + { + "epoch": 1.8348534834077237, + "grad_norm": 2.2339320492583337, + "learning_rate": 1.9482120838471025e-05, + "loss": 0.5336, + "step": 18910 + }, + { + "epoch": 1.835823791965845, + "grad_norm": 2.2130179741978173, + "learning_rate": 1.9465896553961972e-05, + "loss": 0.4468, + "step": 18920 + }, + { + "epoch": 1.8367941005239667, + "grad_norm": 1.9136609274036684, + "learning_rate": 1.9449672269452918e-05, + "loss": 0.516, + "step": 18930 + }, + { + "epoch": 1.8377644090820882, + "grad_norm": 1.9319944175941055, + "learning_rate": 1.9433447984943865e-05, + "loss": 0.4837, + "step": 18940 + }, + { + "epoch": 1.8387347176402096, + "grad_norm": 1.8144243996167448, + "learning_rate": 1.941722370043481e-05, + "loss": 0.5388, + "step": 18950 + }, + { + "epoch": 1.839705026198331, + "grad_norm": 1.9871922674209272, + "learning_rate": 1.9400999415925757e-05, + "loss": 0.5543, + "step": 18960 + }, + { + "epoch": 1.8406753347564524, + "grad_norm": 2.1436846647503707, + "learning_rate": 1.9384775131416704e-05, + "loss": 0.4787, + "step": 18970 + }, + { + "epoch": 1.841645643314574, + "grad_norm": 1.926685455696982, + "learning_rate": 1.936855084690765e-05, + "loss": 0.5096, + "step": 18980 + }, + { + "epoch": 1.8426159518726957, + "grad_norm": 2.342596116742962, + "learning_rate": 1.93523265623986e-05, + "loss": 0.522, + "step": 18990 + }, + { + "epoch": 1.843586260430817, + "grad_norm": 2.255501885540124, + "learning_rate": 1.9336102277889546e-05, + "loss": 0.5377, + "step": 19000 + }, + { + "epoch": 1.843586260430817, + "eval_loss": 0.6318312883377075, + "eval_runtime": 2473.7549, + "eval_samples_per_second": 0.724, + "eval_steps_per_second": 0.362, + "step": 19000 + }, + { + "epoch": 1.8445565689889385, + "grad_norm": 2.1596583204757884, + "learning_rate": 1.9319877993380493e-05, + "loss": 0.5629, + "step": 19010 + }, + { + "epoch": 1.84552687754706, + "grad_norm": 1.8190379524057834, + "learning_rate": 1.930365370887144e-05, + "loss": 0.4796, + "step": 19020 + }, + { + "epoch": 1.8464971861051813, + "grad_norm": 1.019855687196459, + "learning_rate": 1.9287429424362385e-05, + "loss": 0.471, + "step": 19030 + }, + { + "epoch": 1.847467494663303, + "grad_norm": 1.9569552509020551, + "learning_rate": 1.9271205139853332e-05, + "loss": 0.5126, + "step": 19040 + }, + { + "epoch": 1.8484378032214244, + "grad_norm": 2.1870238918367524, + "learning_rate": 1.9254980855344278e-05, + "loss": 0.4865, + "step": 19050 + }, + { + "epoch": 1.849408111779546, + "grad_norm": 1.594729244733264, + "learning_rate": 1.9238756570835225e-05, + "loss": 0.4802, + "step": 19060 + }, + { + "epoch": 1.8503784203376674, + "grad_norm": 2.1426009139912843, + "learning_rate": 1.9222532286326174e-05, + "loss": 0.5303, + "step": 19070 + }, + { + "epoch": 1.8513487288957888, + "grad_norm": 1.7760248540972265, + "learning_rate": 1.920630800181712e-05, + "loss": 0.4948, + "step": 19080 + }, + { + "epoch": 1.8523190374539102, + "grad_norm": 1.6725671761260958, + "learning_rate": 1.919008371730807e-05, + "loss": 0.4922, + "step": 19090 + }, + { + "epoch": 1.8532893460120319, + "grad_norm": 1.9565774055440965, + "learning_rate": 1.9173859432799017e-05, + "loss": 0.5658, + "step": 19100 + }, + { + "epoch": 1.8542596545701533, + "grad_norm": 2.231350831039019, + "learning_rate": 1.9157635148289963e-05, + "loss": 0.4906, + "step": 19110 + }, + { + "epoch": 1.855229963128275, + "grad_norm": 1.5993699848453529, + "learning_rate": 1.914141086378091e-05, + "loss": 0.4796, + "step": 19120 + }, + { + "epoch": 1.8562002716863963, + "grad_norm": 2.0780226359466387, + "learning_rate": 1.9125186579271856e-05, + "loss": 0.4773, + "step": 19130 + }, + { + "epoch": 1.8571705802445178, + "grad_norm": 2.01763881138038, + "learning_rate": 1.9108962294762802e-05, + "loss": 0.4984, + "step": 19140 + }, + { + "epoch": 1.8581408888026392, + "grad_norm": 2.240723917017317, + "learning_rate": 1.909273801025375e-05, + "loss": 0.4559, + "step": 19150 + }, + { + "epoch": 1.8591111973607606, + "grad_norm": 1.849249605812625, + "learning_rate": 1.9076513725744695e-05, + "loss": 0.5445, + "step": 19160 + }, + { + "epoch": 1.8600815059188822, + "grad_norm": 1.7656331962779384, + "learning_rate": 1.906028944123564e-05, + "loss": 0.466, + "step": 19170 + }, + { + "epoch": 1.8610518144770039, + "grad_norm": 1.9723478771428924, + "learning_rate": 1.904406515672659e-05, + "loss": 0.4951, + "step": 19180 + }, + { + "epoch": 1.8620221230351253, + "grad_norm": 2.133741060165629, + "learning_rate": 1.9027840872217538e-05, + "loss": 0.5064, + "step": 19190 + }, + { + "epoch": 1.8629924315932467, + "grad_norm": 2.267317304504692, + "learning_rate": 1.9011616587708484e-05, + "loss": 0.4774, + "step": 19200 + }, + { + "epoch": 1.863962740151368, + "grad_norm": 1.6590470147649787, + "learning_rate": 1.899539230319943e-05, + "loss": 0.4684, + "step": 19210 + }, + { + "epoch": 1.8649330487094895, + "grad_norm": 1.7428570168415043, + "learning_rate": 1.8979168018690377e-05, + "loss": 0.494, + "step": 19220 + }, + { + "epoch": 1.8659033572676111, + "grad_norm": 1.6212375134091286, + "learning_rate": 1.8962943734181323e-05, + "loss": 0.481, + "step": 19230 + }, + { + "epoch": 1.8668736658257326, + "grad_norm": 2.0261745446913717, + "learning_rate": 1.894671944967227e-05, + "loss": 0.5107, + "step": 19240 + }, + { + "epoch": 1.8678439743838542, + "grad_norm": 1.7751731877262686, + "learning_rate": 1.8930495165163216e-05, + "loss": 0.4615, + "step": 19250 + }, + { + "epoch": 1.8688142829419756, + "grad_norm": 1.5566178962662132, + "learning_rate": 1.8914270880654162e-05, + "loss": 0.5061, + "step": 19260 + }, + { + "epoch": 1.869784591500097, + "grad_norm": 2.0648415585072137, + "learning_rate": 1.8898046596145112e-05, + "loss": 0.4721, + "step": 19270 + }, + { + "epoch": 1.8707549000582184, + "grad_norm": 1.8255664078089486, + "learning_rate": 1.888182231163606e-05, + "loss": 0.4652, + "step": 19280 + }, + { + "epoch": 1.8717252086163398, + "grad_norm": 2.144758151637292, + "learning_rate": 1.8865598027127005e-05, + "loss": 0.4391, + "step": 19290 + }, + { + "epoch": 1.8726955171744615, + "grad_norm": 1.803874603824675, + "learning_rate": 1.884937374261795e-05, + "loss": 0.492, + "step": 19300 + }, + { + "epoch": 1.8736658257325831, + "grad_norm": 2.1603319792398685, + "learning_rate": 1.8833149458108898e-05, + "loss": 0.4602, + "step": 19310 + }, + { + "epoch": 1.8746361342907045, + "grad_norm": 1.931469165672542, + "learning_rate": 1.8816925173599844e-05, + "loss": 0.479, + "step": 19320 + }, + { + "epoch": 1.875606442848826, + "grad_norm": 2.289215850770512, + "learning_rate": 1.880070088909079e-05, + "loss": 0.475, + "step": 19330 + }, + { + "epoch": 1.8765767514069474, + "grad_norm": 2.0548701793141384, + "learning_rate": 1.8784476604581737e-05, + "loss": 0.4523, + "step": 19340 + }, + { + "epoch": 1.8775470599650688, + "grad_norm": 1.6304292402973444, + "learning_rate": 1.8768252320072687e-05, + "loss": 0.4704, + "step": 19350 + }, + { + "epoch": 1.8785173685231904, + "grad_norm": 1.3760981254681826, + "learning_rate": 1.8752028035563633e-05, + "loss": 0.509, + "step": 19360 + }, + { + "epoch": 1.8794876770813118, + "grad_norm": 2.1389308936606364, + "learning_rate": 1.873580375105458e-05, + "loss": 0.4791, + "step": 19370 + }, + { + "epoch": 1.8804579856394334, + "grad_norm": 2.0673076161036663, + "learning_rate": 1.8719579466545526e-05, + "loss": 0.4902, + "step": 19380 + }, + { + "epoch": 1.8814282941975549, + "grad_norm": 1.861431210462014, + "learning_rate": 1.8703355182036472e-05, + "loss": 0.4719, + "step": 19390 + }, + { + "epoch": 1.8823986027556763, + "grad_norm": 1.9788908151587594, + "learning_rate": 1.868713089752742e-05, + "loss": 0.5127, + "step": 19400 + }, + { + "epoch": 1.8833689113137977, + "grad_norm": 1.9487695948736958, + "learning_rate": 1.8670906613018365e-05, + "loss": 0.4469, + "step": 19410 + }, + { + "epoch": 1.8843392198719193, + "grad_norm": 1.8048413444236069, + "learning_rate": 1.865468232850931e-05, + "loss": 0.5345, + "step": 19420 + }, + { + "epoch": 1.8853095284300407, + "grad_norm": 1.8168395167881652, + "learning_rate": 1.8638458044000258e-05, + "loss": 0.4957, + "step": 19430 + }, + { + "epoch": 1.8862798369881624, + "grad_norm": 1.9900462235084369, + "learning_rate": 1.8622233759491207e-05, + "loss": 0.4928, + "step": 19440 + }, + { + "epoch": 1.8872501455462838, + "grad_norm": 1.9978639659195843, + "learning_rate": 1.8606009474982154e-05, + "loss": 0.5085, + "step": 19450 + }, + { + "epoch": 1.8882204541044052, + "grad_norm": 1.9870398133754268, + "learning_rate": 1.8589785190473104e-05, + "loss": 0.4857, + "step": 19460 + }, + { + "epoch": 1.8891907626625266, + "grad_norm": 1.9723654086733247, + "learning_rate": 1.857356090596405e-05, + "loss": 0.4658, + "step": 19470 + }, + { + "epoch": 1.890161071220648, + "grad_norm": 1.8141064421371964, + "learning_rate": 1.8557336621454996e-05, + "loss": 0.4881, + "step": 19480 + }, + { + "epoch": 1.8911313797787697, + "grad_norm": 1.6779273277493256, + "learning_rate": 1.8541112336945943e-05, + "loss": 0.4798, + "step": 19490 + }, + { + "epoch": 1.8921016883368913, + "grad_norm": 1.901625964641795, + "learning_rate": 1.852488805243689e-05, + "loss": 0.5199, + "step": 19500 + }, + { + "epoch": 1.8921016883368913, + "eval_loss": 0.631912350654602, + "eval_runtime": 2473.0661, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.362, + "step": 19500 + }, + { + "epoch": 1.8930719968950127, + "grad_norm": 1.7019367377384935, + "learning_rate": 1.8508663767927836e-05, + "loss": 0.4693, + "step": 19510 + }, + { + "epoch": 1.8940423054531341, + "grad_norm": 1.9777898503801572, + "learning_rate": 1.8492439483418782e-05, + "loss": 0.4907, + "step": 19520 + }, + { + "epoch": 1.8950126140112555, + "grad_norm": 2.613311730337197, + "learning_rate": 1.8476215198909728e-05, + "loss": 0.5179, + "step": 19530 + }, + { + "epoch": 1.895982922569377, + "grad_norm": 1.3976847929675515, + "learning_rate": 1.8459990914400678e-05, + "loss": 0.4316, + "step": 19540 + }, + { + "epoch": 1.8969532311274986, + "grad_norm": 2.1428783623843715, + "learning_rate": 1.8443766629891624e-05, + "loss": 0.5275, + "step": 19550 + }, + { + "epoch": 1.89792353968562, + "grad_norm": 1.6878966830886573, + "learning_rate": 1.842754234538257e-05, + "loss": 0.5061, + "step": 19560 + }, + { + "epoch": 1.8988938482437416, + "grad_norm": 1.6660923395832288, + "learning_rate": 1.8411318060873517e-05, + "loss": 0.4846, + "step": 19570 + }, + { + "epoch": 1.899864156801863, + "grad_norm": 1.88350077160719, + "learning_rate": 1.8395093776364464e-05, + "loss": 0.4558, + "step": 19580 + }, + { + "epoch": 1.9008344653599845, + "grad_norm": 1.8180585161160603, + "learning_rate": 1.837886949185541e-05, + "loss": 0.4462, + "step": 19590 + }, + { + "epoch": 1.9018047739181059, + "grad_norm": 2.17218483201875, + "learning_rate": 1.8362645207346356e-05, + "loss": 0.4732, + "step": 19600 + }, + { + "epoch": 1.9027750824762273, + "grad_norm": 2.150072249394045, + "learning_rate": 1.8346420922837303e-05, + "loss": 0.4807, + "step": 19610 + }, + { + "epoch": 1.903745391034349, + "grad_norm": 2.186590190383392, + "learning_rate": 1.8330196638328253e-05, + "loss": 0.5064, + "step": 19620 + }, + { + "epoch": 1.9047156995924706, + "grad_norm": 1.9773135303028604, + "learning_rate": 1.83139723538192e-05, + "loss": 0.4987, + "step": 19630 + }, + { + "epoch": 1.905686008150592, + "grad_norm": 1.7722699958849875, + "learning_rate": 1.8297748069310145e-05, + "loss": 0.493, + "step": 19640 + }, + { + "epoch": 1.9066563167087134, + "grad_norm": 2.398748653045921, + "learning_rate": 1.828152378480109e-05, + "loss": 0.4632, + "step": 19650 + }, + { + "epoch": 1.9076266252668348, + "grad_norm": 1.9381638418073313, + "learning_rate": 1.8265299500292038e-05, + "loss": 0.4879, + "step": 19660 + }, + { + "epoch": 1.9085969338249562, + "grad_norm": 1.806142449327403, + "learning_rate": 1.8249075215782984e-05, + "loss": 0.4426, + "step": 19670 + }, + { + "epoch": 1.9095672423830778, + "grad_norm": 1.8375555769454368, + "learning_rate": 1.823285093127393e-05, + "loss": 0.5133, + "step": 19680 + }, + { + "epoch": 1.9105375509411993, + "grad_norm": 2.1900711143018703, + "learning_rate": 1.8216626646764877e-05, + "loss": 0.4783, + "step": 19690 + }, + { + "epoch": 1.911507859499321, + "grad_norm": 1.7860565394856096, + "learning_rate": 1.8200402362255824e-05, + "loss": 0.5432, + "step": 19700 + }, + { + "epoch": 1.9124781680574423, + "grad_norm": 1.5287991043658957, + "learning_rate": 1.8184178077746773e-05, + "loss": 0.4818, + "step": 19710 + }, + { + "epoch": 1.9134484766155637, + "grad_norm": 1.9162344859036078, + "learning_rate": 1.816795379323772e-05, + "loss": 0.5134, + "step": 19720 + }, + { + "epoch": 1.9144187851736851, + "grad_norm": 1.9361948714776511, + "learning_rate": 1.8151729508728666e-05, + "loss": 0.4705, + "step": 19730 + }, + { + "epoch": 1.9153890937318068, + "grad_norm": 1.7948070446854587, + "learning_rate": 1.8135505224219613e-05, + "loss": 0.4713, + "step": 19740 + }, + { + "epoch": 1.9163594022899282, + "grad_norm": 1.94331048872498, + "learning_rate": 1.811928093971056e-05, + "loss": 0.4636, + "step": 19750 + }, + { + "epoch": 1.9173297108480498, + "grad_norm": 1.9083792067669565, + "learning_rate": 1.8103056655201505e-05, + "loss": 0.4985, + "step": 19760 + }, + { + "epoch": 1.9183000194061712, + "grad_norm": 1.7889544788537133, + "learning_rate": 1.808683237069245e-05, + "loss": 0.465, + "step": 19770 + }, + { + "epoch": 1.9192703279642926, + "grad_norm": 2.1133403229015606, + "learning_rate": 1.8070608086183398e-05, + "loss": 0.4962, + "step": 19780 + }, + { + "epoch": 1.920240636522414, + "grad_norm": 2.119511574698095, + "learning_rate": 1.8054383801674344e-05, + "loss": 0.5265, + "step": 19790 + }, + { + "epoch": 1.9212109450805355, + "grad_norm": 1.8530048672687303, + "learning_rate": 1.8038159517165294e-05, + "loss": 0.4674, + "step": 19800 + }, + { + "epoch": 1.922181253638657, + "grad_norm": 2.060163549018839, + "learning_rate": 1.802193523265624e-05, + "loss": 0.5232, + "step": 19810 + }, + { + "epoch": 1.9231515621967787, + "grad_norm": 1.582662573057051, + "learning_rate": 1.8005710948147187e-05, + "loss": 0.5146, + "step": 19820 + }, + { + "epoch": 1.9241218707549002, + "grad_norm": 1.9973478575975316, + "learning_rate": 1.7989486663638133e-05, + "loss": 0.5294, + "step": 19830 + }, + { + "epoch": 1.9250921793130216, + "grad_norm": 2.2324903918022088, + "learning_rate": 1.797326237912908e-05, + "loss": 0.5039, + "step": 19840 + }, + { + "epoch": 1.926062487871143, + "grad_norm": 2.24550841489829, + "learning_rate": 1.7957038094620026e-05, + "loss": 0.4763, + "step": 19850 + }, + { + "epoch": 1.9270327964292644, + "grad_norm": 1.9055133702740896, + "learning_rate": 1.7940813810110976e-05, + "loss": 0.4978, + "step": 19860 + }, + { + "epoch": 1.928003104987386, + "grad_norm": 1.6202922943730298, + "learning_rate": 1.7924589525601922e-05, + "loss": 0.4467, + "step": 19870 + }, + { + "epoch": 1.9289734135455074, + "grad_norm": 1.7946217889300042, + "learning_rate": 1.790836524109287e-05, + "loss": 0.4975, + "step": 19880 + }, + { + "epoch": 1.929943722103629, + "grad_norm": 2.0837037474907776, + "learning_rate": 1.7892140956583815e-05, + "loss": 0.4488, + "step": 19890 + }, + { + "epoch": 1.9309140306617505, + "grad_norm": 2.284301097020319, + "learning_rate": 1.7875916672074765e-05, + "loss": 0.5315, + "step": 19900 + }, + { + "epoch": 1.931884339219872, + "grad_norm": 1.6410720035809963, + "learning_rate": 1.785969238756571e-05, + "loss": 0.5064, + "step": 19910 + }, + { + "epoch": 1.9328546477779933, + "grad_norm": 1.913707413180516, + "learning_rate": 1.7843468103056658e-05, + "loss": 0.5226, + "step": 19920 + }, + { + "epoch": 1.9338249563361147, + "grad_norm": 1.972749697853565, + "learning_rate": 1.7827243818547604e-05, + "loss": 0.4794, + "step": 19930 + }, + { + "epoch": 1.9347952648942364, + "grad_norm": 2.4245035874963663, + "learning_rate": 1.781101953403855e-05, + "loss": 0.472, + "step": 19940 + }, + { + "epoch": 1.935765573452358, + "grad_norm": 1.6532981118899497, + "learning_rate": 1.7794795249529497e-05, + "loss": 0.4926, + "step": 19950 + }, + { + "epoch": 1.9367358820104794, + "grad_norm": 2.301368908039538, + "learning_rate": 1.7778570965020443e-05, + "loss": 0.4847, + "step": 19960 + }, + { + "epoch": 1.9377061905686008, + "grad_norm": 1.9798901797296686, + "learning_rate": 1.776234668051139e-05, + "loss": 0.5041, + "step": 19970 + }, + { + "epoch": 1.9386764991267222, + "grad_norm": 2.0914880764103825, + "learning_rate": 1.774612239600234e-05, + "loss": 0.4543, + "step": 19980 + }, + { + "epoch": 1.9396468076848437, + "grad_norm": 1.1710098992705869, + "learning_rate": 1.7729898111493286e-05, + "loss": 0.4568, + "step": 19990 + }, + { + "epoch": 1.9406171162429653, + "grad_norm": 2.475821372597333, + "learning_rate": 1.7713673826984232e-05, + "loss": 0.503, + "step": 20000 + }, + { + "epoch": 1.9406171162429653, + "eval_loss": 0.6306876540184021, + "eval_runtime": 2470.8815, + "eval_samples_per_second": 0.725, + "eval_steps_per_second": 0.363, + "step": 20000 + }, + { + "epoch": 1.9415874248010867, + "grad_norm": 1.8135605973580828, + "learning_rate": 1.769744954247518e-05, + "loss": 0.5225, + "step": 20010 + }, + { + "epoch": 1.9425577333592083, + "grad_norm": 1.8818540999249187, + "learning_rate": 1.7681225257966125e-05, + "loss": 0.4411, + "step": 20020 + }, + { + "epoch": 1.9435280419173298, + "grad_norm": 2.3455403519185767, + "learning_rate": 1.766500097345707e-05, + "loss": 0.4618, + "step": 20030 + }, + { + "epoch": 1.9444983504754512, + "grad_norm": 1.8781392474090728, + "learning_rate": 1.7648776688948018e-05, + "loss": 0.4757, + "step": 20040 + }, + { + "epoch": 1.9454686590335726, + "grad_norm": 2.4837262255821004, + "learning_rate": 1.7632552404438964e-05, + "loss": 0.4983, + "step": 20050 + }, + { + "epoch": 1.9464389675916942, + "grad_norm": 2.0371146667611963, + "learning_rate": 1.761632811992991e-05, + "loss": 0.4684, + "step": 20060 + }, + { + "epoch": 1.9474092761498156, + "grad_norm": 1.9070841836100068, + "learning_rate": 1.760010383542086e-05, + "loss": 0.5148, + "step": 20070 + }, + { + "epoch": 1.9483795847079373, + "grad_norm": 1.6764022613616618, + "learning_rate": 1.7583879550911806e-05, + "loss": 0.5129, + "step": 20080 + }, + { + "epoch": 1.9493498932660587, + "grad_norm": 1.801575932413747, + "learning_rate": 1.7567655266402753e-05, + "loss": 0.4924, + "step": 20090 + }, + { + "epoch": 1.95032020182418, + "grad_norm": 1.417352324977605, + "learning_rate": 1.75514309818937e-05, + "loss": 0.4871, + "step": 20100 + }, + { + "epoch": 1.9512905103823015, + "grad_norm": 1.7430495978746263, + "learning_rate": 1.7535206697384646e-05, + "loss": 0.4906, + "step": 20110 + }, + { + "epoch": 1.952260818940423, + "grad_norm": 1.9378606596579797, + "learning_rate": 1.7518982412875592e-05, + "loss": 0.4441, + "step": 20120 + }, + { + "epoch": 1.9532311274985446, + "grad_norm": 2.024032521983931, + "learning_rate": 1.750275812836654e-05, + "loss": 0.4574, + "step": 20130 + }, + { + "epoch": 1.9542014360566662, + "grad_norm": 1.9748616447898866, + "learning_rate": 1.7486533843857485e-05, + "loss": 0.4955, + "step": 20140 + }, + { + "epoch": 1.9551717446147876, + "grad_norm": 2.0789037317300876, + "learning_rate": 1.7470309559348435e-05, + "loss": 0.5176, + "step": 20150 + }, + { + "epoch": 1.956142053172909, + "grad_norm": 1.7669158731638315, + "learning_rate": 1.745408527483938e-05, + "loss": 0.4242, + "step": 20160 + }, + { + "epoch": 1.9571123617310304, + "grad_norm": 2.15917406316717, + "learning_rate": 1.7437860990330327e-05, + "loss": 0.457, + "step": 20170 + }, + { + "epoch": 1.9580826702891518, + "grad_norm": 1.7798687601178151, + "learning_rate": 1.7421636705821274e-05, + "loss": 0.467, + "step": 20180 + }, + { + "epoch": 1.9590529788472735, + "grad_norm": 2.2050151044492154, + "learning_rate": 1.740541242131222e-05, + "loss": 0.4562, + "step": 20190 + }, + { + "epoch": 1.9600232874053949, + "grad_norm": 1.7571600556165854, + "learning_rate": 1.7389188136803166e-05, + "loss": 0.4951, + "step": 20200 + }, + { + "epoch": 1.9609935959635165, + "grad_norm": 2.370912740726444, + "learning_rate": 1.7372963852294113e-05, + "loss": 0.5177, + "step": 20210 + }, + { + "epoch": 1.961963904521638, + "grad_norm": 1.9598530660426143, + "learning_rate": 1.735673956778506e-05, + "loss": 0.4898, + "step": 20220 + }, + { + "epoch": 1.9629342130797593, + "grad_norm": 1.8458335418058476, + "learning_rate": 1.7340515283276006e-05, + "loss": 0.4685, + "step": 20230 + }, + { + "epoch": 1.9639045216378808, + "grad_norm": 2.086603579490736, + "learning_rate": 1.7324290998766955e-05, + "loss": 0.4655, + "step": 20240 + }, + { + "epoch": 1.9648748301960022, + "grad_norm": 1.8861003799697185, + "learning_rate": 1.7308066714257902e-05, + "loss": 0.5012, + "step": 20250 + }, + { + "epoch": 1.9658451387541238, + "grad_norm": 2.0410888085121655, + "learning_rate": 1.729184242974885e-05, + "loss": 0.4686, + "step": 20260 + }, + { + "epoch": 1.9668154473122454, + "grad_norm": 2.0486563420463293, + "learning_rate": 1.7275618145239798e-05, + "loss": 0.4971, + "step": 20270 + }, + { + "epoch": 1.9677857558703669, + "grad_norm": 2.053946424813733, + "learning_rate": 1.7259393860730744e-05, + "loss": 0.5002, + "step": 20280 + }, + { + "epoch": 1.9687560644284883, + "grad_norm": 1.933061743356704, + "learning_rate": 1.724316957622169e-05, + "loss": 0.5072, + "step": 20290 + }, + { + "epoch": 1.9697263729866097, + "grad_norm": 2.066749720982141, + "learning_rate": 1.7226945291712637e-05, + "loss": 0.4928, + "step": 20300 + }, + { + "epoch": 1.970696681544731, + "grad_norm": 1.55984280015245, + "learning_rate": 1.7210721007203583e-05, + "loss": 0.4681, + "step": 20310 + }, + { + "epoch": 1.9716669901028527, + "grad_norm": 2.0658531156309254, + "learning_rate": 1.719449672269453e-05, + "loss": 0.4451, + "step": 20320 + }, + { + "epoch": 1.9726372986609741, + "grad_norm": 1.56362708496437, + "learning_rate": 1.7178272438185476e-05, + "loss": 0.5221, + "step": 20330 + }, + { + "epoch": 1.9736076072190958, + "grad_norm": 2.161774232597424, + "learning_rate": 1.7162048153676426e-05, + "loss": 0.5527, + "step": 20340 + }, + { + "epoch": 1.9745779157772172, + "grad_norm": 2.411930244708304, + "learning_rate": 1.7145823869167372e-05, + "loss": 0.4742, + "step": 20350 + }, + { + "epoch": 1.9755482243353386, + "grad_norm": 2.0274695839961803, + "learning_rate": 1.712959958465832e-05, + "loss": 0.5055, + "step": 20360 + }, + { + "epoch": 1.97651853289346, + "grad_norm": 1.9046814027988292, + "learning_rate": 1.7113375300149265e-05, + "loss": 0.5018, + "step": 20370 + }, + { + "epoch": 1.9774888414515817, + "grad_norm": 2.6745574351193437, + "learning_rate": 1.709715101564021e-05, + "loss": 0.5013, + "step": 20380 + }, + { + "epoch": 1.978459150009703, + "grad_norm": 1.983809422276857, + "learning_rate": 1.7080926731131158e-05, + "loss": 0.506, + "step": 20390 + }, + { + "epoch": 1.9794294585678247, + "grad_norm": 1.8313417501684919, + "learning_rate": 1.7064702446622104e-05, + "loss": 0.4913, + "step": 20400 + }, + { + "epoch": 1.9803997671259461, + "grad_norm": 1.9488632706180251, + "learning_rate": 1.704847816211305e-05, + "loss": 0.4753, + "step": 20410 + }, + { + "epoch": 1.9813700756840675, + "grad_norm": 2.1235067779704386, + "learning_rate": 1.7032253877603997e-05, + "loss": 0.5076, + "step": 20420 + }, + { + "epoch": 1.982340384242189, + "grad_norm": 1.9628606937438253, + "learning_rate": 1.7016029593094947e-05, + "loss": 0.4488, + "step": 20430 + }, + { + "epoch": 1.9833106928003104, + "grad_norm": 2.053405187246396, + "learning_rate": 1.6999805308585893e-05, + "loss": 0.4492, + "step": 20440 + }, + { + "epoch": 1.984281001358432, + "grad_norm": 2.027877392955927, + "learning_rate": 1.698358102407684e-05, + "loss": 0.4481, + "step": 20450 + }, + { + "epoch": 1.9852513099165536, + "grad_norm": 2.6424653212507585, + "learning_rate": 1.6967356739567786e-05, + "loss": 0.4866, + "step": 20460 + }, + { + "epoch": 1.986221618474675, + "grad_norm": 1.798153398364296, + "learning_rate": 1.6951132455058732e-05, + "loss": 0.4901, + "step": 20470 + }, + { + "epoch": 1.9871919270327965, + "grad_norm": 1.9375715199504966, + "learning_rate": 1.693490817054968e-05, + "loss": 0.5145, + "step": 20480 + }, + { + "epoch": 1.9881622355909179, + "grad_norm": 2.227384783436997, + "learning_rate": 1.6918683886040625e-05, + "loss": 0.5234, + "step": 20490 + }, + { + "epoch": 1.9891325441490393, + "grad_norm": 1.6406328006727198, + "learning_rate": 1.690245960153157e-05, + "loss": 0.4938, + "step": 20500 + }, + { + "epoch": 1.9891325441490393, + "eval_loss": 0.6277603507041931, + "eval_runtime": 3080.184, + "eval_samples_per_second": 0.582, + "eval_steps_per_second": 0.291, + "step": 20500 + }, + { + "epoch": 1.990102852707161, + "grad_norm": 2.189175771584087, + "learning_rate": 1.688623531702252e-05, + "loss": 0.4815, + "step": 20510 + }, + { + "epoch": 1.9910731612652823, + "grad_norm": 1.9037268932948403, + "learning_rate": 1.6870011032513468e-05, + "loss": 0.4675, + "step": 20520 + }, + { + "epoch": 1.992043469823404, + "grad_norm": 2.1370328259985096, + "learning_rate": 1.6853786748004414e-05, + "loss": 0.4696, + "step": 20530 + }, + { + "epoch": 1.9930137783815254, + "grad_norm": 2.407631343670518, + "learning_rate": 1.683756246349536e-05, + "loss": 0.4904, + "step": 20540 + }, + { + "epoch": 1.9939840869396468, + "grad_norm": 2.113566676023369, + "learning_rate": 1.6821338178986307e-05, + "loss": 0.4713, + "step": 20550 + }, + { + "epoch": 1.9949543954977682, + "grad_norm": 1.7998412815701594, + "learning_rate": 1.6805113894477253e-05, + "loss": 0.4638, + "step": 20560 + }, + { + "epoch": 1.9959247040558896, + "grad_norm": 2.129222373307167, + "learning_rate": 1.67888896099682e-05, + "loss": 0.4567, + "step": 20570 + }, + { + "epoch": 1.9968950126140113, + "grad_norm": 2.0439893320145197, + "learning_rate": 1.6772665325459146e-05, + "loss": 0.499, + "step": 20580 + }, + { + "epoch": 1.997865321172133, + "grad_norm": 1.996718548398611, + "learning_rate": 1.6756441040950092e-05, + "loss": 0.4657, + "step": 20590 + }, + { + "epoch": 1.9988356297302543, + "grad_norm": 1.8047941252901814, + "learning_rate": 1.6740216756441042e-05, + "loss": 0.4723, + "step": 20600 + }, + { + "epoch": 1.9998059382883757, + "grad_norm": 1.8452676210065997, + "learning_rate": 1.672399247193199e-05, + "loss": 0.4802, + "step": 20610 + }, + { + "epoch": 2.000776246846497, + "grad_norm": 2.2446601261564254, + "learning_rate": 1.6707768187422935e-05, + "loss": 0.501, + "step": 20620 + }, + { + "epoch": 2.0017465554046185, + "grad_norm": 1.6471605415361736, + "learning_rate": 1.669154390291388e-05, + "loss": 0.4735, + "step": 20630 + }, + { + "epoch": 2.00271686396274, + "grad_norm": 2.038536155314213, + "learning_rate": 1.667531961840483e-05, + "loss": 0.4575, + "step": 20640 + }, + { + "epoch": 2.003687172520862, + "grad_norm": 2.1416733149321834, + "learning_rate": 1.6659095333895777e-05, + "loss": 0.5152, + "step": 20650 + }, + { + "epoch": 2.0046574810789832, + "grad_norm": 1.9639254031456526, + "learning_rate": 1.6642871049386724e-05, + "loss": 0.4424, + "step": 20660 + }, + { + "epoch": 2.0056277896371046, + "grad_norm": 2.1201945824283777, + "learning_rate": 1.662664676487767e-05, + "loss": 0.5194, + "step": 20670 + }, + { + "epoch": 2.006598098195226, + "grad_norm": 2.0259029186156616, + "learning_rate": 1.6610422480368617e-05, + "loss": 0.5085, + "step": 20680 + }, + { + "epoch": 2.0075684067533475, + "grad_norm": 1.5035201214590004, + "learning_rate": 1.6594198195859563e-05, + "loss": 0.445, + "step": 20690 + }, + { + "epoch": 2.008538715311469, + "grad_norm": 1.655200868630425, + "learning_rate": 1.6577973911350513e-05, + "loss": 0.4247, + "step": 20700 + }, + { + "epoch": 2.0095090238695907, + "grad_norm": 2.1707069905375764, + "learning_rate": 1.656174962684146e-05, + "loss": 0.4783, + "step": 20710 + }, + { + "epoch": 2.010479332427712, + "grad_norm": 1.8212536925620644, + "learning_rate": 1.6545525342332405e-05, + "loss": 0.4972, + "step": 20720 + }, + { + "epoch": 2.0114496409858336, + "grad_norm": 2.126405409204561, + "learning_rate": 1.6529301057823352e-05, + "loss": 0.4878, + "step": 20730 + }, + { + "epoch": 2.012419949543955, + "grad_norm": 2.077712603387953, + "learning_rate": 1.6513076773314298e-05, + "loss": 0.4571, + "step": 20740 + }, + { + "epoch": 2.0133902581020764, + "grad_norm": 2.1605056660181345, + "learning_rate": 1.6496852488805245e-05, + "loss": 0.5012, + "step": 20750 + }, + { + "epoch": 2.014360566660198, + "grad_norm": 1.8021397618450556, + "learning_rate": 1.648062820429619e-05, + "loss": 0.5049, + "step": 20760 + }, + { + "epoch": 2.015330875218319, + "grad_norm": 1.8857630619437384, + "learning_rate": 1.6464403919787137e-05, + "loss": 0.4726, + "step": 20770 + }, + { + "epoch": 2.016301183776441, + "grad_norm": 1.7074209121094917, + "learning_rate": 1.6448179635278084e-05, + "loss": 0.4914, + "step": 20780 + }, + { + "epoch": 2.0172714923345625, + "grad_norm": 2.6019622648688614, + "learning_rate": 1.6431955350769033e-05, + "loss": 0.4239, + "step": 20790 + }, + { + "epoch": 2.018241800892684, + "grad_norm": 1.6879446237580993, + "learning_rate": 1.641573106625998e-05, + "loss": 0.4657, + "step": 20800 + }, + { + "epoch": 2.0192121094508053, + "grad_norm": 1.4603799114693456, + "learning_rate": 1.6399506781750926e-05, + "loss": 0.4503, + "step": 20810 + }, + { + "epoch": 2.0201824180089267, + "grad_norm": 1.948639651180534, + "learning_rate": 1.6383282497241873e-05, + "loss": 0.4544, + "step": 20820 + }, + { + "epoch": 2.021152726567048, + "grad_norm": 2.1458744835535617, + "learning_rate": 1.636705821273282e-05, + "loss": 0.4632, + "step": 20830 + }, + { + "epoch": 2.02212303512517, + "grad_norm": 2.3393805927580997, + "learning_rate": 1.6350833928223765e-05, + "loss": 0.4686, + "step": 20840 + }, + { + "epoch": 2.0230933436832914, + "grad_norm": 1.8274491944101148, + "learning_rate": 1.6334609643714712e-05, + "loss": 0.4509, + "step": 20850 + }, + { + "epoch": 2.024063652241413, + "grad_norm": 2.5507345485151767, + "learning_rate": 1.6318385359205658e-05, + "loss": 0.5145, + "step": 20860 + }, + { + "epoch": 2.0250339607995342, + "grad_norm": 1.878444628312939, + "learning_rate": 1.6302161074696608e-05, + "loss": 0.449, + "step": 20870 + }, + { + "epoch": 2.0260042693576557, + "grad_norm": 2.1096395212213235, + "learning_rate": 1.6285936790187554e-05, + "loss": 0.409, + "step": 20880 + }, + { + "epoch": 2.026974577915777, + "grad_norm": 1.9597465473748303, + "learning_rate": 1.62697125056785e-05, + "loss": 0.4909, + "step": 20890 + }, + { + "epoch": 2.0279448864738985, + "grad_norm": 1.4262456717146317, + "learning_rate": 1.6253488221169447e-05, + "loss": 0.4933, + "step": 20900 + }, + { + "epoch": 2.0289151950320203, + "grad_norm": 1.7254635997166883, + "learning_rate": 1.6237263936660393e-05, + "loss": 0.4581, + "step": 20910 + }, + { + "epoch": 2.0298855035901417, + "grad_norm": 1.8255091897283775, + "learning_rate": 1.622103965215134e-05, + "loss": 0.462, + "step": 20920 + }, + { + "epoch": 2.030855812148263, + "grad_norm": 1.592461465195459, + "learning_rate": 1.6204815367642286e-05, + "loss": 0.5343, + "step": 20930 + }, + { + "epoch": 2.0318261207063846, + "grad_norm": 2.4124278045088445, + "learning_rate": 1.6188591083133233e-05, + "loss": 0.4738, + "step": 20940 + }, + { + "epoch": 2.032796429264506, + "grad_norm": 1.4707790381425363, + "learning_rate": 1.617236679862418e-05, + "loss": 0.4183, + "step": 20950 + }, + { + "epoch": 2.0337667378226274, + "grad_norm": 1.7383346761795593, + "learning_rate": 1.615614251411513e-05, + "loss": 0.4351, + "step": 20960 + }, + { + "epoch": 2.0347370463807493, + "grad_norm": 2.292481354882268, + "learning_rate": 1.6139918229606075e-05, + "loss": 0.4294, + "step": 20970 + }, + { + "epoch": 2.0357073549388707, + "grad_norm": 2.158168642341278, + "learning_rate": 1.612369394509702e-05, + "loss": 0.4703, + "step": 20980 + }, + { + "epoch": 2.036677663496992, + "grad_norm": 2.283161751864384, + "learning_rate": 1.6107469660587968e-05, + "loss": 0.5128, + "step": 20990 + }, + { + "epoch": 2.0376479720551135, + "grad_norm": 2.125582882132147, + "learning_rate": 1.6091245376078914e-05, + "loss": 0.4652, + "step": 21000 + }, + { + "epoch": 2.0376479720551135, + "eval_loss": 0.6309866905212402, + "eval_runtime": 3136.9547, + "eval_samples_per_second": 0.571, + "eval_steps_per_second": 0.286, + "step": 21000 + }, + { + "epoch": 2.038618280613235, + "grad_norm": 1.9414092633891158, + "learning_rate": 1.607502109156986e-05, + "loss": 0.4451, + "step": 21010 + }, + { + "epoch": 2.0395885891713563, + "grad_norm": 2.0603594802374863, + "learning_rate": 1.6058796807060807e-05, + "loss": 0.4569, + "step": 21020 + }, + { + "epoch": 2.040558897729478, + "grad_norm": 1.6323777472844119, + "learning_rate": 1.6042572522551757e-05, + "loss": 0.4764, + "step": 21030 + }, + { + "epoch": 2.0415292062875996, + "grad_norm": 1.8941882942793062, + "learning_rate": 1.6026348238042703e-05, + "loss": 0.4514, + "step": 21040 + }, + { + "epoch": 2.042499514845721, + "grad_norm": 1.905361739266791, + "learning_rate": 1.601012395353365e-05, + "loss": 0.4681, + "step": 21050 + }, + { + "epoch": 2.0434698234038424, + "grad_norm": 2.357513508114439, + "learning_rate": 1.59938996690246e-05, + "loss": 0.5029, + "step": 21060 + }, + { + "epoch": 2.044440131961964, + "grad_norm": 2.045526641253892, + "learning_rate": 1.5977675384515546e-05, + "loss": 0.5031, + "step": 21070 + }, + { + "epoch": 2.0454104405200852, + "grad_norm": 1.8018214238085957, + "learning_rate": 1.5961451100006492e-05, + "loss": 0.475, + "step": 21080 + }, + { + "epoch": 2.0463807490782067, + "grad_norm": 2.157846458503379, + "learning_rate": 1.594522681549744e-05, + "loss": 0.4922, + "step": 21090 + }, + { + "epoch": 2.0473510576363285, + "grad_norm": 2.2821214882561773, + "learning_rate": 1.5929002530988385e-05, + "loss": 0.4361, + "step": 21100 + }, + { + "epoch": 2.04832136619445, + "grad_norm": 2.4410635323771395, + "learning_rate": 1.591277824647933e-05, + "loss": 0.4426, + "step": 21110 + }, + { + "epoch": 2.0492916747525713, + "grad_norm": 1.847883527949176, + "learning_rate": 1.5896553961970278e-05, + "loss": 0.4998, + "step": 21120 + }, + { + "epoch": 2.0502619833106928, + "grad_norm": 2.086929164640455, + "learning_rate": 1.5880329677461224e-05, + "loss": 0.5218, + "step": 21130 + }, + { + "epoch": 2.051232291868814, + "grad_norm": 1.8955619202400775, + "learning_rate": 1.5864105392952174e-05, + "loss": 0.519, + "step": 21140 + }, + { + "epoch": 2.0522026004269356, + "grad_norm": 2.43252621811095, + "learning_rate": 1.584788110844312e-05, + "loss": 0.4439, + "step": 21150 + }, + { + "epoch": 2.0531729089850574, + "grad_norm": 2.0827457082492464, + "learning_rate": 1.5831656823934067e-05, + "loss": 0.5047, + "step": 21160 + }, + { + "epoch": 2.054143217543179, + "grad_norm": 1.9212860192837349, + "learning_rate": 1.5815432539425013e-05, + "loss": 0.4655, + "step": 21170 + }, + { + "epoch": 2.0551135261013003, + "grad_norm": 2.090169452444841, + "learning_rate": 1.579920825491596e-05, + "loss": 0.4421, + "step": 21180 + }, + { + "epoch": 2.0560838346594217, + "grad_norm": 1.9503749019739738, + "learning_rate": 1.5782983970406906e-05, + "loss": 0.442, + "step": 21190 + }, + { + "epoch": 2.057054143217543, + "grad_norm": 1.724201210329928, + "learning_rate": 1.5766759685897852e-05, + "loss": 0.3953, + "step": 21200 + }, + { + "epoch": 2.0580244517756645, + "grad_norm": 2.081302027118628, + "learning_rate": 1.57505354013888e-05, + "loss": 0.4846, + "step": 21210 + }, + { + "epoch": 2.0589947603337864, + "grad_norm": 2.5592097491782546, + "learning_rate": 1.5734311116879745e-05, + "loss": 0.4428, + "step": 21220 + }, + { + "epoch": 2.059965068891908, + "grad_norm": 1.9807395474177856, + "learning_rate": 1.5718086832370695e-05, + "loss": 0.4843, + "step": 21230 + }, + { + "epoch": 2.060935377450029, + "grad_norm": 1.946678515294379, + "learning_rate": 1.570186254786164e-05, + "loss": 0.5123, + "step": 21240 + }, + { + "epoch": 2.0619056860081506, + "grad_norm": 1.704583779463954, + "learning_rate": 1.5685638263352587e-05, + "loss": 0.5018, + "step": 21250 + }, + { + "epoch": 2.062875994566272, + "grad_norm": 1.9509206749604142, + "learning_rate": 1.5669413978843534e-05, + "loss": 0.4156, + "step": 21260 + }, + { + "epoch": 2.0638463031243934, + "grad_norm": 2.0838442571499365, + "learning_rate": 1.565318969433448e-05, + "loss": 0.4632, + "step": 21270 + }, + { + "epoch": 2.064816611682515, + "grad_norm": 1.9349794015053616, + "learning_rate": 1.5636965409825427e-05, + "loss": 0.4493, + "step": 21280 + }, + { + "epoch": 2.0657869202406367, + "grad_norm": 2.188378390390559, + "learning_rate": 1.5620741125316373e-05, + "loss": 0.5279, + "step": 21290 + }, + { + "epoch": 2.066757228798758, + "grad_norm": 1.8328420572385127, + "learning_rate": 1.560451684080732e-05, + "loss": 0.4611, + "step": 21300 + }, + { + "epoch": 2.0677275373568795, + "grad_norm": 1.563376875838364, + "learning_rate": 1.5588292556298266e-05, + "loss": 0.4449, + "step": 21310 + }, + { + "epoch": 2.068697845915001, + "grad_norm": 2.231697202375159, + "learning_rate": 1.5572068271789215e-05, + "loss": 0.4763, + "step": 21320 + }, + { + "epoch": 2.0696681544731224, + "grad_norm": 2.116198927861082, + "learning_rate": 1.5555843987280162e-05, + "loss": 0.4313, + "step": 21330 + }, + { + "epoch": 2.0706384630312438, + "grad_norm": 2.3801041085422763, + "learning_rate": 1.5539619702771108e-05, + "loss": 0.4643, + "step": 21340 + }, + { + "epoch": 2.0716087715893656, + "grad_norm": 1.8571501672365118, + "learning_rate": 1.5523395418262055e-05, + "loss": 0.4423, + "step": 21350 + }, + { + "epoch": 2.072579080147487, + "grad_norm": 1.6559131513123913, + "learning_rate": 1.5507171133753e-05, + "loss": 0.4437, + "step": 21360 + }, + { + "epoch": 2.0735493887056085, + "grad_norm": 2.137893390544345, + "learning_rate": 1.5490946849243947e-05, + "loss": 0.4732, + "step": 21370 + }, + { + "epoch": 2.07451969726373, + "grad_norm": 1.4742723623250382, + "learning_rate": 1.5474722564734894e-05, + "loss": 0.412, + "step": 21380 + }, + { + "epoch": 2.0754900058218513, + "grad_norm": 2.1318829799319543, + "learning_rate": 1.545849828022584e-05, + "loss": 0.4916, + "step": 21390 + }, + { + "epoch": 2.0764603143799727, + "grad_norm": 2.307110089522299, + "learning_rate": 1.544227399571679e-05, + "loss": 0.4124, + "step": 21400 + }, + { + "epoch": 2.077430622938094, + "grad_norm": 2.3097275184207637, + "learning_rate": 1.5426049711207736e-05, + "loss": 0.4799, + "step": 21410 + }, + { + "epoch": 2.078400931496216, + "grad_norm": 1.5854013780070428, + "learning_rate": 1.5409825426698686e-05, + "loss": 0.4759, + "step": 21420 + }, + { + "epoch": 2.0793712400543374, + "grad_norm": 2.008961619138051, + "learning_rate": 1.5393601142189632e-05, + "loss": 0.4844, + "step": 21430 + }, + { + "epoch": 2.080341548612459, + "grad_norm": 1.848635541993877, + "learning_rate": 1.537737685768058e-05, + "loss": 0.4558, + "step": 21440 + }, + { + "epoch": 2.08131185717058, + "grad_norm": 2.3862164923195217, + "learning_rate": 1.5361152573171525e-05, + "loss": 0.4514, + "step": 21450 + }, + { + "epoch": 2.0822821657287016, + "grad_norm": 2.322912088388782, + "learning_rate": 1.534492828866247e-05, + "loss": 0.4507, + "step": 21460 + }, + { + "epoch": 2.083252474286823, + "grad_norm": 2.1765100761424074, + "learning_rate": 1.5328704004153418e-05, + "loss": 0.4114, + "step": 21470 + }, + { + "epoch": 2.084222782844945, + "grad_norm": 1.915448768034906, + "learning_rate": 1.5312479719644364e-05, + "loss": 0.4794, + "step": 21480 + }, + { + "epoch": 2.0851930914030663, + "grad_norm": 1.955422403883371, + "learning_rate": 1.529625543513531e-05, + "loss": 0.485, + "step": 21490 + }, + { + "epoch": 2.0861633999611877, + "grad_norm": 1.5866416015413762, + "learning_rate": 1.528003115062626e-05, + "loss": 0.4961, + "step": 21500 + }, + { + "epoch": 2.0861633999611877, + "eval_loss": 0.6304420232772827, + "eval_runtime": 3075.3205, + "eval_samples_per_second": 0.583, + "eval_steps_per_second": 0.291, + "step": 21500 + }, + { + "epoch": 2.087133708519309, + "grad_norm": 1.8961317659006691, + "learning_rate": 1.5263806866117207e-05, + "loss": 0.4803, + "step": 21510 + }, + { + "epoch": 2.0881040170774305, + "grad_norm": 2.1574061097476633, + "learning_rate": 1.5247582581608153e-05, + "loss": 0.4897, + "step": 21520 + }, + { + "epoch": 2.089074325635552, + "grad_norm": 2.080433640436673, + "learning_rate": 1.52313582970991e-05, + "loss": 0.4623, + "step": 21530 + }, + { + "epoch": 2.0900446341936734, + "grad_norm": 2.274765795002327, + "learning_rate": 1.5215134012590046e-05, + "loss": 0.5149, + "step": 21540 + }, + { + "epoch": 2.0910149427517952, + "grad_norm": 2.514598169769852, + "learning_rate": 1.5198909728080992e-05, + "loss": 0.4028, + "step": 21550 + }, + { + "epoch": 2.0919852513099166, + "grad_norm": 2.1324330512224057, + "learning_rate": 1.5182685443571939e-05, + "loss": 0.5213, + "step": 21560 + }, + { + "epoch": 2.092955559868038, + "grad_norm": 2.0829108139421106, + "learning_rate": 1.5166461159062887e-05, + "loss": 0.4338, + "step": 21570 + }, + { + "epoch": 2.0939258684261595, + "grad_norm": 1.9606921161193192, + "learning_rate": 1.5150236874553833e-05, + "loss": 0.5347, + "step": 21580 + }, + { + "epoch": 2.094896176984281, + "grad_norm": 1.9007868794459526, + "learning_rate": 1.513401259004478e-05, + "loss": 0.4698, + "step": 21590 + }, + { + "epoch": 2.0958664855424023, + "grad_norm": 2.2070560808992385, + "learning_rate": 1.5117788305535726e-05, + "loss": 0.4992, + "step": 21600 + }, + { + "epoch": 2.096836794100524, + "grad_norm": 1.7052502823407865, + "learning_rate": 1.5101564021026674e-05, + "loss": 0.4448, + "step": 21610 + }, + { + "epoch": 2.0978071026586456, + "grad_norm": 1.9838818965293705, + "learning_rate": 1.508533973651762e-05, + "loss": 0.452, + "step": 21620 + }, + { + "epoch": 2.098777411216767, + "grad_norm": 2.361870038528201, + "learning_rate": 1.5069115452008567e-05, + "loss": 0.4772, + "step": 21630 + }, + { + "epoch": 2.0997477197748884, + "grad_norm": 2.3190708510226767, + "learning_rate": 1.5052891167499513e-05, + "loss": 0.4732, + "step": 21640 + }, + { + "epoch": 2.10071802833301, + "grad_norm": 2.0341040427547967, + "learning_rate": 1.5036666882990461e-05, + "loss": 0.5013, + "step": 21650 + }, + { + "epoch": 2.101688336891131, + "grad_norm": 2.2854031285851337, + "learning_rate": 1.5020442598481408e-05, + "loss": 0.4587, + "step": 21660 + }, + { + "epoch": 2.102658645449253, + "grad_norm": 2.171534197336386, + "learning_rate": 1.5004218313972354e-05, + "loss": 0.4685, + "step": 21670 + }, + { + "epoch": 2.1036289540073745, + "grad_norm": 2.237731272885117, + "learning_rate": 1.49879940294633e-05, + "loss": 0.4714, + "step": 21680 + }, + { + "epoch": 2.104599262565496, + "grad_norm": 1.83625835750722, + "learning_rate": 1.4971769744954247e-05, + "loss": 0.4858, + "step": 21690 + }, + { + "epoch": 2.1055695711236173, + "grad_norm": 2.1226910702010935, + "learning_rate": 1.4955545460445195e-05, + "loss": 0.4735, + "step": 21700 + }, + { + "epoch": 2.1065398796817387, + "grad_norm": 2.4412848671448764, + "learning_rate": 1.4939321175936141e-05, + "loss": 0.4955, + "step": 21710 + }, + { + "epoch": 2.10751018823986, + "grad_norm": 2.171686191103959, + "learning_rate": 1.4923096891427088e-05, + "loss": 0.4882, + "step": 21720 + }, + { + "epoch": 2.1084804967979816, + "grad_norm": 1.9028000906788562, + "learning_rate": 1.4906872606918034e-05, + "loss": 0.5008, + "step": 21730 + }, + { + "epoch": 2.1094508053561034, + "grad_norm": 2.032206148469331, + "learning_rate": 1.4890648322408982e-05, + "loss": 0.4688, + "step": 21740 + }, + { + "epoch": 2.110421113914225, + "grad_norm": 2.2254255720592457, + "learning_rate": 1.4874424037899929e-05, + "loss": 0.434, + "step": 21750 + }, + { + "epoch": 2.1113914224723462, + "grad_norm": 2.013615294714695, + "learning_rate": 1.4858199753390875e-05, + "loss": 0.4429, + "step": 21760 + }, + { + "epoch": 2.1123617310304676, + "grad_norm": 1.625353905024779, + "learning_rate": 1.4841975468881821e-05, + "loss": 0.4197, + "step": 21770 + }, + { + "epoch": 2.113332039588589, + "grad_norm": 2.6956426206670074, + "learning_rate": 1.482575118437277e-05, + "loss": 0.4412, + "step": 21780 + }, + { + "epoch": 2.1143023481467105, + "grad_norm": 1.642670039289611, + "learning_rate": 1.4809526899863716e-05, + "loss": 0.4365, + "step": 21790 + }, + { + "epoch": 2.1152726567048323, + "grad_norm": 2.047556877474702, + "learning_rate": 1.4793302615354662e-05, + "loss": 0.4882, + "step": 21800 + }, + { + "epoch": 2.1162429652629537, + "grad_norm": 2.666095406948268, + "learning_rate": 1.4777078330845612e-05, + "loss": 0.4885, + "step": 21810 + }, + { + "epoch": 2.117213273821075, + "grad_norm": 2.0148741202390736, + "learning_rate": 1.4760854046336558e-05, + "loss": 0.5092, + "step": 21820 + }, + { + "epoch": 2.1181835823791966, + "grad_norm": 1.5220212421711388, + "learning_rate": 1.4744629761827505e-05, + "loss": 0.5007, + "step": 21830 + }, + { + "epoch": 2.119153890937318, + "grad_norm": 2.2566622563684446, + "learning_rate": 1.4728405477318453e-05, + "loss": 0.4386, + "step": 21840 + }, + { + "epoch": 2.1201241994954394, + "grad_norm": 2.13286935915719, + "learning_rate": 1.47121811928094e-05, + "loss": 0.4559, + "step": 21850 + }, + { + "epoch": 2.1210945080535613, + "grad_norm": 1.8220197546440986, + "learning_rate": 1.4695956908300346e-05, + "loss": 0.451, + "step": 21860 + }, + { + "epoch": 2.1220648166116827, + "grad_norm": 2.2468599066846653, + "learning_rate": 1.4679732623791292e-05, + "loss": 0.446, + "step": 21870 + }, + { + "epoch": 2.123035125169804, + "grad_norm": 1.8918874944552218, + "learning_rate": 1.466350833928224e-05, + "loss": 0.459, + "step": 21880 + }, + { + "epoch": 2.1240054337279255, + "grad_norm": 2.247255412788798, + "learning_rate": 1.4647284054773186e-05, + "loss": 0.5151, + "step": 21890 + }, + { + "epoch": 2.124975742286047, + "grad_norm": 1.8291853119382993, + "learning_rate": 1.4631059770264133e-05, + "loss": 0.4557, + "step": 21900 + }, + { + "epoch": 2.1259460508441683, + "grad_norm": 2.2000080164154654, + "learning_rate": 1.461483548575508e-05, + "loss": 0.5106, + "step": 21910 + }, + { + "epoch": 2.1269163594022897, + "grad_norm": 1.608928034740487, + "learning_rate": 1.4598611201246026e-05, + "loss": 0.4696, + "step": 21920 + }, + { + "epoch": 2.1278866679604116, + "grad_norm": 1.7796242187219558, + "learning_rate": 1.4582386916736974e-05, + "loss": 0.4372, + "step": 21930 + }, + { + "epoch": 2.128856976518533, + "grad_norm": 1.9355986409124974, + "learning_rate": 1.456616263222792e-05, + "loss": 0.4601, + "step": 21940 + }, + { + "epoch": 2.1298272850766544, + "grad_norm": 2.5044478412060776, + "learning_rate": 1.4549938347718866e-05, + "loss": 0.5153, + "step": 21950 + }, + { + "epoch": 2.130797593634776, + "grad_norm": 2.1644310902185264, + "learning_rate": 1.4533714063209813e-05, + "loss": 0.4755, + "step": 21960 + }, + { + "epoch": 2.1317679021928972, + "grad_norm": 1.9759401921373174, + "learning_rate": 1.451748977870076e-05, + "loss": 0.5306, + "step": 21970 + }, + { + "epoch": 2.1327382107510187, + "grad_norm": 2.5662041065947827, + "learning_rate": 1.4501265494191707e-05, + "loss": 0.4936, + "step": 21980 + }, + { + "epoch": 2.13370851930914, + "grad_norm": 1.834252286116916, + "learning_rate": 1.4485041209682654e-05, + "loss": 0.4248, + "step": 21990 + }, + { + "epoch": 2.134678827867262, + "grad_norm": 2.048099727707521, + "learning_rate": 1.44688169251736e-05, + "loss": 0.4573, + "step": 22000 + }, + { + "epoch": 2.134678827867262, + "eval_loss": 0.6303107142448425, + "eval_runtime": 3417.8488, + "eval_samples_per_second": 0.524, + "eval_steps_per_second": 0.262, + "step": 22000 + }, + { + "epoch": 2.1356491364253833, + "grad_norm": 2.032462630747061, + "learning_rate": 1.4452592640664548e-05, + "loss": 0.4496, + "step": 22010 + }, + { + "epoch": 2.1366194449835048, + "grad_norm": 2.0538977423462548, + "learning_rate": 1.4436368356155494e-05, + "loss": 0.4512, + "step": 22020 + }, + { + "epoch": 2.137589753541626, + "grad_norm": 2.0559328662146403, + "learning_rate": 1.442014407164644e-05, + "loss": 0.4469, + "step": 22030 + }, + { + "epoch": 2.1385600620997476, + "grad_norm": 1.8412343311132753, + "learning_rate": 1.4403919787137387e-05, + "loss": 0.449, + "step": 22040 + }, + { + "epoch": 2.139530370657869, + "grad_norm": 2.0513694288656765, + "learning_rate": 1.4387695502628334e-05, + "loss": 0.4709, + "step": 22050 + }, + { + "epoch": 2.140500679215991, + "grad_norm": 1.7890079195029862, + "learning_rate": 1.4371471218119282e-05, + "loss": 0.4419, + "step": 22060 + }, + { + "epoch": 2.1414709877741123, + "grad_norm": 2.535357630446327, + "learning_rate": 1.4355246933610228e-05, + "loss": 0.4305, + "step": 22070 + }, + { + "epoch": 2.1424412963322337, + "grad_norm": 2.2977476166849957, + "learning_rate": 1.4339022649101174e-05, + "loss": 0.4664, + "step": 22080 + }, + { + "epoch": 2.143411604890355, + "grad_norm": 1.8778176122292005, + "learning_rate": 1.432279836459212e-05, + "loss": 0.4321, + "step": 22090 + }, + { + "epoch": 2.1443819134484765, + "grad_norm": 2.2740974899219526, + "learning_rate": 1.4306574080083069e-05, + "loss": 0.4582, + "step": 22100 + }, + { + "epoch": 2.145352222006598, + "grad_norm": 1.7411166330088808, + "learning_rate": 1.4290349795574015e-05, + "loss": 0.4703, + "step": 22110 + }, + { + "epoch": 2.1463225305647198, + "grad_norm": 2.270025807621109, + "learning_rate": 1.4274125511064962e-05, + "loss": 0.462, + "step": 22120 + }, + { + "epoch": 2.147292839122841, + "grad_norm": 2.047404098376008, + "learning_rate": 1.4257901226555908e-05, + "loss": 0.4572, + "step": 22130 + }, + { + "epoch": 2.1482631476809626, + "grad_norm": 1.9025461949345281, + "learning_rate": 1.4241676942046856e-05, + "loss": 0.4523, + "step": 22140 + }, + { + "epoch": 2.149233456239084, + "grad_norm": 2.41657182633304, + "learning_rate": 1.4225452657537803e-05, + "loss": 0.4485, + "step": 22150 + }, + { + "epoch": 2.1502037647972054, + "grad_norm": 2.1984178815028246, + "learning_rate": 1.4209228373028749e-05, + "loss": 0.4744, + "step": 22160 + }, + { + "epoch": 2.151174073355327, + "grad_norm": 2.1671082483900044, + "learning_rate": 1.4193004088519695e-05, + "loss": 0.4277, + "step": 22170 + }, + { + "epoch": 2.1521443819134483, + "grad_norm": 2.2526638629347118, + "learning_rate": 1.4176779804010643e-05, + "loss": 0.4078, + "step": 22180 + }, + { + "epoch": 2.15311469047157, + "grad_norm": 2.4484821608494665, + "learning_rate": 1.416055551950159e-05, + "loss": 0.4726, + "step": 22190 + }, + { + "epoch": 2.1540849990296915, + "grad_norm": 2.145545596752741, + "learning_rate": 1.414433123499254e-05, + "loss": 0.5181, + "step": 22200 + }, + { + "epoch": 2.155055307587813, + "grad_norm": 2.2593836123374, + "learning_rate": 1.4128106950483486e-05, + "loss": 0.4986, + "step": 22210 + }, + { + "epoch": 2.1560256161459344, + "grad_norm": 1.8007881267775863, + "learning_rate": 1.4111882665974432e-05, + "loss": 0.4483, + "step": 22220 + }, + { + "epoch": 2.1569959247040558, + "grad_norm": 1.348362904445229, + "learning_rate": 1.4095658381465379e-05, + "loss": 0.4637, + "step": 22230 + }, + { + "epoch": 2.157966233262177, + "grad_norm": 2.1469770203677445, + "learning_rate": 1.4079434096956327e-05, + "loss": 0.494, + "step": 22240 + }, + { + "epoch": 2.158936541820299, + "grad_norm": 2.337402150331418, + "learning_rate": 1.4063209812447273e-05, + "loss": 0.4761, + "step": 22250 + }, + { + "epoch": 2.1599068503784205, + "grad_norm": 1.7961706912599362, + "learning_rate": 1.404698552793822e-05, + "loss": 0.4683, + "step": 22260 + }, + { + "epoch": 2.160877158936542, + "grad_norm": 1.6086385155901042, + "learning_rate": 1.4030761243429166e-05, + "loss": 0.4305, + "step": 22270 + }, + { + "epoch": 2.1618474674946633, + "grad_norm": 1.853121432266295, + "learning_rate": 1.4014536958920112e-05, + "loss": 0.4808, + "step": 22280 + }, + { + "epoch": 2.1628177760527847, + "grad_norm": 2.1594671768519094, + "learning_rate": 1.399831267441106e-05, + "loss": 0.4279, + "step": 22290 + }, + { + "epoch": 2.163788084610906, + "grad_norm": 1.8445913886566945, + "learning_rate": 1.3982088389902007e-05, + "loss": 0.4905, + "step": 22300 + }, + { + "epoch": 2.164758393169028, + "grad_norm": 1.8975990263034965, + "learning_rate": 1.3965864105392953e-05, + "loss": 0.4888, + "step": 22310 + }, + { + "epoch": 2.1657287017271494, + "grad_norm": 2.015470924856379, + "learning_rate": 1.39496398208839e-05, + "loss": 0.5232, + "step": 22320 + }, + { + "epoch": 2.166699010285271, + "grad_norm": 2.4943071641031698, + "learning_rate": 1.3933415536374848e-05, + "loss": 0.4827, + "step": 22330 + }, + { + "epoch": 2.167669318843392, + "grad_norm": 1.5900299707664105, + "learning_rate": 1.3917191251865794e-05, + "loss": 0.4963, + "step": 22340 + }, + { + "epoch": 2.1686396274015136, + "grad_norm": 1.8705765102756569, + "learning_rate": 1.390096696735674e-05, + "loss": 0.4389, + "step": 22350 + }, + { + "epoch": 2.169609935959635, + "grad_norm": 1.9703821664953534, + "learning_rate": 1.3884742682847687e-05, + "loss": 0.3945, + "step": 22360 + }, + { + "epoch": 2.1705802445177564, + "grad_norm": 1.6647445519639283, + "learning_rate": 1.3868518398338635e-05, + "loss": 0.4163, + "step": 22370 + }, + { + "epoch": 2.1715505530758783, + "grad_norm": 1.617333797152806, + "learning_rate": 1.3852294113829581e-05, + "loss": 0.4943, + "step": 22380 + }, + { + "epoch": 2.1725208616339997, + "grad_norm": 2.1150619610025796, + "learning_rate": 1.3836069829320528e-05, + "loss": 0.4941, + "step": 22390 + }, + { + "epoch": 2.173491170192121, + "grad_norm": 2.70908196165092, + "learning_rate": 1.3819845544811474e-05, + "loss": 0.448, + "step": 22400 + }, + { + "epoch": 2.1744614787502425, + "grad_norm": 2.195631441394506, + "learning_rate": 1.3803621260302422e-05, + "loss": 0.4611, + "step": 22410 + }, + { + "epoch": 2.175431787308364, + "grad_norm": 1.602301760768748, + "learning_rate": 1.3787396975793368e-05, + "loss": 0.4837, + "step": 22420 + }, + { + "epoch": 2.1764020958664854, + "grad_norm": 1.7676765995955546, + "learning_rate": 1.3771172691284315e-05, + "loss": 0.4798, + "step": 22430 + }, + { + "epoch": 2.177372404424607, + "grad_norm": 2.211793623579749, + "learning_rate": 1.3754948406775261e-05, + "loss": 0.4607, + "step": 22440 + }, + { + "epoch": 2.1783427129827286, + "grad_norm": 1.9526273269630037, + "learning_rate": 1.3738724122266208e-05, + "loss": 0.4638, + "step": 22450 + }, + { + "epoch": 2.17931302154085, + "grad_norm": 2.063947829078263, + "learning_rate": 1.3722499837757156e-05, + "loss": 0.4548, + "step": 22460 + }, + { + "epoch": 2.1802833300989715, + "grad_norm": 2.127208435257803, + "learning_rate": 1.3706275553248102e-05, + "loss": 0.4332, + "step": 22470 + }, + { + "epoch": 2.181253638657093, + "grad_norm": 2.098766619473584, + "learning_rate": 1.3690051268739048e-05, + "loss": 0.4822, + "step": 22480 + }, + { + "epoch": 2.1822239472152143, + "grad_norm": 1.5960382905615234, + "learning_rate": 1.3673826984229995e-05, + "loss": 0.4394, + "step": 22490 + }, + { + "epoch": 2.183194255773336, + "grad_norm": 2.5521137154858087, + "learning_rate": 1.3657602699720943e-05, + "loss": 0.4568, + "step": 22500 + }, + { + "epoch": 2.183194255773336, + "eval_loss": 0.6301902532577515, + "eval_runtime": 3074.8498, + "eval_samples_per_second": 0.583, + "eval_steps_per_second": 0.291, + "step": 22500 + }, + { + "epoch": 2.1841645643314576, + "grad_norm": 1.8964546731319356, + "learning_rate": 1.364137841521189e-05, + "loss": 0.4569, + "step": 22510 + }, + { + "epoch": 2.185134872889579, + "grad_norm": 2.1836501057996918, + "learning_rate": 1.3625154130702836e-05, + "loss": 0.4497, + "step": 22520 + }, + { + "epoch": 2.1861051814477004, + "grad_norm": 1.6786624803632986, + "learning_rate": 1.3608929846193782e-05, + "loss": 0.4229, + "step": 22530 + }, + { + "epoch": 2.187075490005822, + "grad_norm": 2.1475126662304675, + "learning_rate": 1.359270556168473e-05, + "loss": 0.4412, + "step": 22540 + }, + { + "epoch": 2.188045798563943, + "grad_norm": 2.322023881303393, + "learning_rate": 1.3576481277175676e-05, + "loss": 0.4235, + "step": 22550 + }, + { + "epoch": 2.1890161071220646, + "grad_norm": 2.208490633488266, + "learning_rate": 1.3560256992666623e-05, + "loss": 0.4997, + "step": 22560 + }, + { + "epoch": 2.1899864156801865, + "grad_norm": 1.869675214539207, + "learning_rate": 1.354403270815757e-05, + "loss": 0.4635, + "step": 22570 + }, + { + "epoch": 2.190956724238308, + "grad_norm": 1.7398189038479703, + "learning_rate": 1.3527808423648516e-05, + "loss": 0.4128, + "step": 22580 + }, + { + "epoch": 2.1919270327964293, + "grad_norm": 2.266171424803214, + "learning_rate": 1.3511584139139465e-05, + "loss": 0.4651, + "step": 22590 + }, + { + "epoch": 2.1928973413545507, + "grad_norm": 2.3614429446369063, + "learning_rate": 1.3495359854630413e-05, + "loss": 0.4123, + "step": 22600 + }, + { + "epoch": 2.193867649912672, + "grad_norm": 1.724551648002317, + "learning_rate": 1.347913557012136e-05, + "loss": 0.4481, + "step": 22610 + }, + { + "epoch": 2.1948379584707935, + "grad_norm": 2.0558570962807945, + "learning_rate": 1.3462911285612306e-05, + "loss": 0.4431, + "step": 22620 + }, + { + "epoch": 2.195808267028915, + "grad_norm": 1.943776934136987, + "learning_rate": 1.3446687001103253e-05, + "loss": 0.5128, + "step": 22630 + }, + { + "epoch": 2.196778575587037, + "grad_norm": 1.958173689925549, + "learning_rate": 1.34304627165942e-05, + "loss": 0.4628, + "step": 22640 + }, + { + "epoch": 2.1977488841451582, + "grad_norm": 1.990366090746634, + "learning_rate": 1.3414238432085147e-05, + "loss": 0.459, + "step": 22650 + }, + { + "epoch": 2.1987191927032796, + "grad_norm": 1.8373680505548435, + "learning_rate": 1.3398014147576093e-05, + "loss": 0.4397, + "step": 22660 + }, + { + "epoch": 2.199689501261401, + "grad_norm": 1.8078209399386682, + "learning_rate": 1.338178986306704e-05, + "loss": 0.5215, + "step": 22670 + }, + { + "epoch": 2.2006598098195225, + "grad_norm": 2.400832940310904, + "learning_rate": 1.3365565578557986e-05, + "loss": 0.4926, + "step": 22680 + }, + { + "epoch": 2.201630118377644, + "grad_norm": 2.055045167286068, + "learning_rate": 1.3349341294048934e-05, + "loss": 0.4461, + "step": 22690 + }, + { + "epoch": 2.2026004269357657, + "grad_norm": 1.8144420639570567, + "learning_rate": 1.333311700953988e-05, + "loss": 0.4112, + "step": 22700 + }, + { + "epoch": 2.203570735493887, + "grad_norm": 2.391519552042341, + "learning_rate": 1.3316892725030827e-05, + "loss": 0.439, + "step": 22710 + }, + { + "epoch": 2.2045410440520086, + "grad_norm": 1.563384599944884, + "learning_rate": 1.3300668440521773e-05, + "loss": 0.4694, + "step": 22720 + }, + { + "epoch": 2.20551135261013, + "grad_norm": 2.3546727224779715, + "learning_rate": 1.3284444156012721e-05, + "loss": 0.4355, + "step": 22730 + }, + { + "epoch": 2.2064816611682514, + "grad_norm": 1.832975462434467, + "learning_rate": 1.3268219871503668e-05, + "loss": 0.4984, + "step": 22740 + }, + { + "epoch": 2.207451969726373, + "grad_norm": 1.7975073035604554, + "learning_rate": 1.3251995586994614e-05, + "loss": 0.4584, + "step": 22750 + }, + { + "epoch": 2.2084222782844947, + "grad_norm": 2.243306278714619, + "learning_rate": 1.323577130248556e-05, + "loss": 0.4365, + "step": 22760 + }, + { + "epoch": 2.209392586842616, + "grad_norm": 2.0297904158946043, + "learning_rate": 1.3219547017976509e-05, + "loss": 0.5148, + "step": 22770 + }, + { + "epoch": 2.2103628954007375, + "grad_norm": 1.6322562865594754, + "learning_rate": 1.3203322733467455e-05, + "loss": 0.4782, + "step": 22780 + }, + { + "epoch": 2.211333203958859, + "grad_norm": 2.277603717413092, + "learning_rate": 1.3187098448958401e-05, + "loss": 0.4899, + "step": 22790 + }, + { + "epoch": 2.2123035125169803, + "grad_norm": 1.2452738117030233, + "learning_rate": 1.3170874164449348e-05, + "loss": 0.4459, + "step": 22800 + }, + { + "epoch": 2.2132738210751017, + "grad_norm": 2.3624119652920177, + "learning_rate": 1.3154649879940294e-05, + "loss": 0.5033, + "step": 22810 + }, + { + "epoch": 2.214244129633223, + "grad_norm": 1.6861367814447958, + "learning_rate": 1.3138425595431242e-05, + "loss": 0.4434, + "step": 22820 + }, + { + "epoch": 2.215214438191345, + "grad_norm": 2.5489904753238166, + "learning_rate": 1.3122201310922189e-05, + "loss": 0.4465, + "step": 22830 + }, + { + "epoch": 2.2161847467494664, + "grad_norm": 1.5652883952842678, + "learning_rate": 1.3105977026413135e-05, + "loss": 0.4648, + "step": 22840 + }, + { + "epoch": 2.217155055307588, + "grad_norm": 2.106609892171019, + "learning_rate": 1.3089752741904081e-05, + "loss": 0.4809, + "step": 22850 + }, + { + "epoch": 2.2181253638657092, + "grad_norm": 2.7417498454164653, + "learning_rate": 1.307352845739503e-05, + "loss": 0.4652, + "step": 22860 + }, + { + "epoch": 2.2190956724238307, + "grad_norm": 2.0375355388485645, + "learning_rate": 1.3057304172885976e-05, + "loss": 0.499, + "step": 22870 + }, + { + "epoch": 2.220065980981952, + "grad_norm": 1.6341471214683112, + "learning_rate": 1.3041079888376922e-05, + "loss": 0.4935, + "step": 22880 + }, + { + "epoch": 2.221036289540074, + "grad_norm": 2.3570884682492346, + "learning_rate": 1.3024855603867869e-05, + "loss": 0.4884, + "step": 22890 + }, + { + "epoch": 2.2220065980981953, + "grad_norm": 1.9493074304368556, + "learning_rate": 1.3008631319358817e-05, + "loss": 0.4838, + "step": 22900 + }, + { + "epoch": 2.2229769066563168, + "grad_norm": 2.1315651431097224, + "learning_rate": 1.2992407034849763e-05, + "loss": 0.4978, + "step": 22910 + }, + { + "epoch": 2.223947215214438, + "grad_norm": 1.8843986392844572, + "learning_rate": 1.297618275034071e-05, + "loss": 0.4922, + "step": 22920 + }, + { + "epoch": 2.2249175237725596, + "grad_norm": 2.0055582988037313, + "learning_rate": 1.2959958465831656e-05, + "loss": 0.4702, + "step": 22930 + }, + { + "epoch": 2.225887832330681, + "grad_norm": 1.8513569200366902, + "learning_rate": 1.2943734181322604e-05, + "loss": 0.467, + "step": 22940 + }, + { + "epoch": 2.226858140888803, + "grad_norm": 2.3018937212184682, + "learning_rate": 1.292750989681355e-05, + "loss": 0.4632, + "step": 22950 + }, + { + "epoch": 2.2278284494469243, + "grad_norm": 2.1028486436310905, + "learning_rate": 1.2911285612304497e-05, + "loss": 0.4329, + "step": 22960 + }, + { + "epoch": 2.2287987580050457, + "grad_norm": 2.3891744137342994, + "learning_rate": 1.2895061327795443e-05, + "loss": 0.4549, + "step": 22970 + }, + { + "epoch": 2.229769066563167, + "grad_norm": 1.8338746725123478, + "learning_rate": 1.2878837043286393e-05, + "loss": 0.4518, + "step": 22980 + }, + { + "epoch": 2.2307393751212885, + "grad_norm": 2.5499388804291483, + "learning_rate": 1.286261275877734e-05, + "loss": 0.4852, + "step": 22990 + }, + { + "epoch": 2.23170968367941, + "grad_norm": 1.6648341489250056, + "learning_rate": 1.2846388474268287e-05, + "loss": 0.4266, + "step": 23000 + }, + { + "epoch": 2.23170968367941, + "eval_loss": 0.6283465623855591, + "eval_runtime": 3074.8923, + "eval_samples_per_second": 0.583, + "eval_steps_per_second": 0.291, + "step": 23000 + }, + { + "epoch": 2.2326799922375313, + "grad_norm": 2.0022361963538584, + "learning_rate": 1.2830164189759234e-05, + "loss": 0.5047, + "step": 23010 + }, + { + "epoch": 2.233650300795653, + "grad_norm": 1.5655335338455796, + "learning_rate": 1.281393990525018e-05, + "loss": 0.4699, + "step": 23020 + }, + { + "epoch": 2.2346206093537746, + "grad_norm": 2.1600874630795452, + "learning_rate": 1.2797715620741127e-05, + "loss": 0.4684, + "step": 23030 + }, + { + "epoch": 2.235590917911896, + "grad_norm": 1.6958961004239297, + "learning_rate": 1.2781491336232073e-05, + "loss": 0.4769, + "step": 23040 + }, + { + "epoch": 2.2365612264700174, + "grad_norm": 2.290659253729356, + "learning_rate": 1.2765267051723021e-05, + "loss": 0.5084, + "step": 23050 + }, + { + "epoch": 2.237531535028139, + "grad_norm": 2.250294105691862, + "learning_rate": 1.2749042767213967e-05, + "loss": 0.4562, + "step": 23060 + }, + { + "epoch": 2.2385018435862603, + "grad_norm": 1.93879453393384, + "learning_rate": 1.2732818482704914e-05, + "loss": 0.4687, + "step": 23070 + }, + { + "epoch": 2.239472152144382, + "grad_norm": 2.3403124810757805, + "learning_rate": 1.271659419819586e-05, + "loss": 0.493, + "step": 23080 + }, + { + "epoch": 2.2404424607025035, + "grad_norm": 1.800829852558906, + "learning_rate": 1.2700369913686808e-05, + "loss": 0.4786, + "step": 23090 + }, + { + "epoch": 2.241412769260625, + "grad_norm": 2.0849042725619165, + "learning_rate": 1.2684145629177755e-05, + "loss": 0.4682, + "step": 23100 + }, + { + "epoch": 2.2423830778187464, + "grad_norm": 2.209642518744148, + "learning_rate": 1.2667921344668701e-05, + "loss": 0.4705, + "step": 23110 + }, + { + "epoch": 2.2433533863768678, + "grad_norm": 2.04809833807039, + "learning_rate": 1.2651697060159647e-05, + "loss": 0.4652, + "step": 23120 + }, + { + "epoch": 2.244323694934989, + "grad_norm": 2.134292380445789, + "learning_rate": 1.2635472775650595e-05, + "loss": 0.4375, + "step": 23130 + }, + { + "epoch": 2.245294003493111, + "grad_norm": 2.0308679578505466, + "learning_rate": 1.2619248491141542e-05, + "loss": 0.4256, + "step": 23140 + }, + { + "epoch": 2.2462643120512324, + "grad_norm": 1.7740882276229983, + "learning_rate": 1.2603024206632488e-05, + "loss": 0.4754, + "step": 23150 + }, + { + "epoch": 2.247234620609354, + "grad_norm": 2.845899626015525, + "learning_rate": 1.2586799922123435e-05, + "loss": 0.4465, + "step": 23160 + }, + { + "epoch": 2.2482049291674753, + "grad_norm": 1.6656047995660936, + "learning_rate": 1.2570575637614383e-05, + "loss": 0.4645, + "step": 23170 + }, + { + "epoch": 2.2491752377255967, + "grad_norm": 2.1317439051006675, + "learning_rate": 1.2554351353105329e-05, + "loss": 0.4215, + "step": 23180 + }, + { + "epoch": 2.250145546283718, + "grad_norm": 1.8180573164715403, + "learning_rate": 1.2538127068596275e-05, + "loss": 0.448, + "step": 23190 + }, + { + "epoch": 2.2511158548418395, + "grad_norm": 1.7997832378510108, + "learning_rate": 1.2521902784087222e-05, + "loss": 0.5032, + "step": 23200 + }, + { + "epoch": 2.2520861633999614, + "grad_norm": 1.97843541131739, + "learning_rate": 1.2505678499578168e-05, + "loss": 0.4577, + "step": 23210 + }, + { + "epoch": 2.253056471958083, + "grad_norm": 2.1980311567337045, + "learning_rate": 1.2489454215069116e-05, + "loss": 0.4492, + "step": 23220 + }, + { + "epoch": 2.254026780516204, + "grad_norm": 1.888365535194607, + "learning_rate": 1.2473229930560063e-05, + "loss": 0.4416, + "step": 23230 + }, + { + "epoch": 2.2549970890743256, + "grad_norm": 1.7544850776435907, + "learning_rate": 1.2457005646051009e-05, + "loss": 0.4803, + "step": 23240 + }, + { + "epoch": 2.255967397632447, + "grad_norm": 1.8829778599554805, + "learning_rate": 1.2440781361541955e-05, + "loss": 0.4227, + "step": 23250 + }, + { + "epoch": 2.2569377061905684, + "grad_norm": 2.1628337950810956, + "learning_rate": 1.2424557077032903e-05, + "loss": 0.4467, + "step": 23260 + }, + { + "epoch": 2.25790801474869, + "grad_norm": 2.1336139136695254, + "learning_rate": 1.240833279252385e-05, + "loss": 0.4798, + "step": 23270 + }, + { + "epoch": 2.2588783233068117, + "grad_norm": 2.3462272747330277, + "learning_rate": 1.2392108508014798e-05, + "loss": 0.4894, + "step": 23280 + }, + { + "epoch": 2.259848631864933, + "grad_norm": 1.9473113253648615, + "learning_rate": 1.2375884223505744e-05, + "loss": 0.4353, + "step": 23290 + }, + { + "epoch": 2.2608189404230545, + "grad_norm": 2.357093728494574, + "learning_rate": 1.235965993899669e-05, + "loss": 0.4508, + "step": 23300 + }, + { + "epoch": 2.261789248981176, + "grad_norm": 2.0651529569693468, + "learning_rate": 1.2343435654487639e-05, + "loss": 0.5106, + "step": 23310 + }, + { + "epoch": 2.2627595575392974, + "grad_norm": 1.8536850282087043, + "learning_rate": 1.2327211369978585e-05, + "loss": 0.446, + "step": 23320 + }, + { + "epoch": 2.263729866097419, + "grad_norm": 2.129566666742622, + "learning_rate": 1.2310987085469532e-05, + "loss": 0.4467, + "step": 23330 + }, + { + "epoch": 2.2647001746555406, + "grad_norm": 1.73183433586886, + "learning_rate": 1.2294762800960478e-05, + "loss": 0.4712, + "step": 23340 + }, + { + "epoch": 2.265670483213662, + "grad_norm": 1.5251850887891814, + "learning_rate": 1.2278538516451426e-05, + "loss": 0.5013, + "step": 23350 + }, + { + "epoch": 2.2666407917717835, + "grad_norm": 2.184835227449666, + "learning_rate": 1.2262314231942372e-05, + "loss": 0.4714, + "step": 23360 + }, + { + "epoch": 2.267611100329905, + "grad_norm": 2.278133559610058, + "learning_rate": 1.2246089947433319e-05, + "loss": 0.4707, + "step": 23370 + }, + { + "epoch": 2.2685814088880263, + "grad_norm": 2.043400099097976, + "learning_rate": 1.2229865662924265e-05, + "loss": 0.4398, + "step": 23380 + }, + { + "epoch": 2.2695517174461477, + "grad_norm": 2.1473483931604145, + "learning_rate": 1.2213641378415212e-05, + "loss": 0.4229, + "step": 23390 + }, + { + "epoch": 2.2705220260042696, + "grad_norm": 1.9230359161534956, + "learning_rate": 1.219741709390616e-05, + "loss": 0.425, + "step": 23400 + }, + { + "epoch": 2.271492334562391, + "grad_norm": 2.0484743523955773, + "learning_rate": 1.2181192809397106e-05, + "loss": 0.4034, + "step": 23410 + }, + { + "epoch": 2.2724626431205124, + "grad_norm": 2.018457750509176, + "learning_rate": 1.2164968524888052e-05, + "loss": 0.5061, + "step": 23420 + }, + { + "epoch": 2.273432951678634, + "grad_norm": 2.0919643594364494, + "learning_rate": 1.2148744240378999e-05, + "loss": 0.432, + "step": 23430 + }, + { + "epoch": 2.274403260236755, + "grad_norm": 2.4420554524762395, + "learning_rate": 1.2132519955869947e-05, + "loss": 0.4734, + "step": 23440 + }, + { + "epoch": 2.2753735687948766, + "grad_norm": 2.2048878285052034, + "learning_rate": 1.2116295671360893e-05, + "loss": 0.5, + "step": 23450 + }, + { + "epoch": 2.276343877352998, + "grad_norm": 2.385315906124704, + "learning_rate": 1.210007138685184e-05, + "loss": 0.3946, + "step": 23460 + }, + { + "epoch": 2.27731418591112, + "grad_norm": 2.1039427863509936, + "learning_rate": 1.2083847102342788e-05, + "loss": 0.4509, + "step": 23470 + }, + { + "epoch": 2.2782844944692413, + "grad_norm": 2.1731393161499977, + "learning_rate": 1.2067622817833734e-05, + "loss": 0.5216, + "step": 23480 + }, + { + "epoch": 2.2792548030273627, + "grad_norm": 2.133407830915671, + "learning_rate": 1.2051398533324682e-05, + "loss": 0.4652, + "step": 23490 + }, + { + "epoch": 2.280225111585484, + "grad_norm": 1.8143906080604477, + "learning_rate": 1.2035174248815629e-05, + "loss": 0.4608, + "step": 23500 + }, + { + "epoch": 2.280225111585484, + "eval_loss": 0.6306902766227722, + "eval_runtime": 3119.399, + "eval_samples_per_second": 0.574, + "eval_steps_per_second": 0.287, + "step": 23500 + }, + { + "epoch": 2.2811954201436055, + "grad_norm": 2.2848754365231545, + "learning_rate": 1.2018949964306575e-05, + "loss": 0.5139, + "step": 23510 + }, + { + "epoch": 2.282165728701727, + "grad_norm": 2.354718632997715, + "learning_rate": 1.2002725679797521e-05, + "loss": 0.4255, + "step": 23520 + }, + { + "epoch": 2.283136037259849, + "grad_norm": 2.256014847978423, + "learning_rate": 1.198650139528847e-05, + "loss": 0.4393, + "step": 23530 + }, + { + "epoch": 2.2841063458179702, + "grad_norm": 2.016243274514756, + "learning_rate": 1.1970277110779416e-05, + "loss": 0.4194, + "step": 23540 + }, + { + "epoch": 2.2850766543760916, + "grad_norm": 2.4759209758692426, + "learning_rate": 1.1954052826270362e-05, + "loss": 0.4757, + "step": 23550 + }, + { + "epoch": 2.286046962934213, + "grad_norm": 2.2547867685912486, + "learning_rate": 1.1937828541761309e-05, + "loss": 0.4511, + "step": 23560 + }, + { + "epoch": 2.2870172714923345, + "grad_norm": 1.9593024304910214, + "learning_rate": 1.1921604257252255e-05, + "loss": 0.5043, + "step": 23570 + }, + { + "epoch": 2.287987580050456, + "grad_norm": 2.2107379578337567, + "learning_rate": 1.1905379972743203e-05, + "loss": 0.4349, + "step": 23580 + }, + { + "epoch": 2.2889578886085777, + "grad_norm": 1.8779808167419767, + "learning_rate": 1.188915568823415e-05, + "loss": 0.4063, + "step": 23590 + }, + { + "epoch": 2.289928197166699, + "grad_norm": 2.1111765494708052, + "learning_rate": 1.1872931403725096e-05, + "loss": 0.4327, + "step": 23600 + }, + { + "epoch": 2.2908985057248206, + "grad_norm": 1.906738344577065, + "learning_rate": 1.1856707119216042e-05, + "loss": 0.4676, + "step": 23610 + }, + { + "epoch": 2.291868814282942, + "grad_norm": 2.6990569903001167, + "learning_rate": 1.184048283470699e-05, + "loss": 0.4759, + "step": 23620 + }, + { + "epoch": 2.2928391228410634, + "grad_norm": 2.39173938568666, + "learning_rate": 1.1824258550197937e-05, + "loss": 0.4565, + "step": 23630 + }, + { + "epoch": 2.293809431399185, + "grad_norm": 2.264327763939328, + "learning_rate": 1.1808034265688883e-05, + "loss": 0.5, + "step": 23640 + }, + { + "epoch": 2.294779739957306, + "grad_norm": 2.5489345132458494, + "learning_rate": 1.179180998117983e-05, + "loss": 0.4545, + "step": 23650 + }, + { + "epoch": 2.295750048515428, + "grad_norm": 1.5435526436096745, + "learning_rate": 1.1775585696670777e-05, + "loss": 0.444, + "step": 23660 + }, + { + "epoch": 2.2967203570735495, + "grad_norm": 2.096262775761873, + "learning_rate": 1.1759361412161725e-05, + "loss": 0.4651, + "step": 23670 + }, + { + "epoch": 2.297690665631671, + "grad_norm": 2.190613074347611, + "learning_rate": 1.1743137127652672e-05, + "loss": 0.5474, + "step": 23680 + }, + { + "epoch": 2.2986609741897923, + "grad_norm": 2.077801881915876, + "learning_rate": 1.1726912843143618e-05, + "loss": 0.4762, + "step": 23690 + }, + { + "epoch": 2.2996312827479137, + "grad_norm": 2.01599660916335, + "learning_rate": 1.1710688558634565e-05, + "loss": 0.4819, + "step": 23700 + }, + { + "epoch": 2.300601591306035, + "grad_norm": 1.7711529107558897, + "learning_rate": 1.1694464274125513e-05, + "loss": 0.5082, + "step": 23710 + }, + { + "epoch": 2.3015718998641566, + "grad_norm": 2.015443958364855, + "learning_rate": 1.1678239989616459e-05, + "loss": 0.4578, + "step": 23720 + }, + { + "epoch": 2.3025422084222784, + "grad_norm": 1.6478971592533067, + "learning_rate": 1.1662015705107405e-05, + "loss": 0.4285, + "step": 23730 + }, + { + "epoch": 2.3035125169804, + "grad_norm": 1.722295133645004, + "learning_rate": 1.1645791420598352e-05, + "loss": 0.4678, + "step": 23740 + }, + { + "epoch": 2.3044828255385212, + "grad_norm": 2.004493677850927, + "learning_rate": 1.16295671360893e-05, + "loss": 0.412, + "step": 23750 + }, + { + "epoch": 2.3054531340966427, + "grad_norm": 2.45778069486736, + "learning_rate": 1.1613342851580246e-05, + "loss": 0.5124, + "step": 23760 + }, + { + "epoch": 2.306423442654764, + "grad_norm": 2.040288471406675, + "learning_rate": 1.1597118567071193e-05, + "loss": 0.448, + "step": 23770 + }, + { + "epoch": 2.307393751212886, + "grad_norm": 1.995303973637335, + "learning_rate": 1.1580894282562139e-05, + "loss": 0.5022, + "step": 23780 + }, + { + "epoch": 2.3083640597710073, + "grad_norm": 2.080262776846013, + "learning_rate": 1.1564669998053085e-05, + "loss": 0.4962, + "step": 23790 + }, + { + "epoch": 2.3093343683291288, + "grad_norm": 2.2695683008172756, + "learning_rate": 1.1548445713544034e-05, + "loss": 0.5149, + "step": 23800 + }, + { + "epoch": 2.31030467688725, + "grad_norm": 2.2271193293614724, + "learning_rate": 1.153222142903498e-05, + "loss": 0.4527, + "step": 23810 + }, + { + "epoch": 2.3112749854453716, + "grad_norm": 1.8037542108398639, + "learning_rate": 1.1515997144525926e-05, + "loss": 0.4569, + "step": 23820 + }, + { + "epoch": 2.312245294003493, + "grad_norm": 1.8828460178890125, + "learning_rate": 1.1499772860016873e-05, + "loss": 0.446, + "step": 23830 + }, + { + "epoch": 2.3132156025616144, + "grad_norm": 2.148750511684889, + "learning_rate": 1.148354857550782e-05, + "loss": 0.4258, + "step": 23840 + }, + { + "epoch": 2.3141859111197363, + "grad_norm": 1.9299282888568785, + "learning_rate": 1.1467324290998767e-05, + "loss": 0.443, + "step": 23850 + }, + { + "epoch": 2.3151562196778577, + "grad_norm": 2.306570936387397, + "learning_rate": 1.1451100006489715e-05, + "loss": 0.4742, + "step": 23860 + }, + { + "epoch": 2.316126528235979, + "grad_norm": 1.652399835965133, + "learning_rate": 1.1434875721980662e-05, + "loss": 0.4434, + "step": 23870 + }, + { + "epoch": 2.3170968367941005, + "grad_norm": 1.8666656145624976, + "learning_rate": 1.1418651437471608e-05, + "loss": 0.4165, + "step": 23880 + }, + { + "epoch": 2.318067145352222, + "grad_norm": 1.799707347697279, + "learning_rate": 1.1402427152962556e-05, + "loss": 0.4525, + "step": 23890 + }, + { + "epoch": 2.3190374539103433, + "grad_norm": 1.8900931059214818, + "learning_rate": 1.1386202868453502e-05, + "loss": 0.4675, + "step": 23900 + }, + { + "epoch": 2.3200077624684647, + "grad_norm": 2.2479167053649896, + "learning_rate": 1.1369978583944449e-05, + "loss": 0.4624, + "step": 23910 + }, + { + "epoch": 2.3209780710265866, + "grad_norm": 2.2784052431273616, + "learning_rate": 1.1353754299435395e-05, + "loss": 0.4219, + "step": 23920 + }, + { + "epoch": 2.321948379584708, + "grad_norm": 2.1485033156725417, + "learning_rate": 1.1337530014926343e-05, + "loss": 0.4118, + "step": 23930 + }, + { + "epoch": 2.3229186881428294, + "grad_norm": 2.0610231375794075, + "learning_rate": 1.132130573041729e-05, + "loss": 0.4672, + "step": 23940 + }, + { + "epoch": 2.323888996700951, + "grad_norm": 1.8373253686424216, + "learning_rate": 1.1305081445908236e-05, + "loss": 0.4892, + "step": 23950 + }, + { + "epoch": 2.3248593052590723, + "grad_norm": 2.0784968854146375, + "learning_rate": 1.1288857161399182e-05, + "loss": 0.426, + "step": 23960 + }, + { + "epoch": 2.325829613817194, + "grad_norm": 1.8246399051495223, + "learning_rate": 1.1272632876890129e-05, + "loss": 0.4707, + "step": 23970 + }, + { + "epoch": 2.3267999223753155, + "grad_norm": 1.8607594933847929, + "learning_rate": 1.1256408592381077e-05, + "loss": 0.4787, + "step": 23980 + }, + { + "epoch": 2.327770230933437, + "grad_norm": 2.1338034995885673, + "learning_rate": 1.1240184307872023e-05, + "loss": 0.4188, + "step": 23990 + }, + { + "epoch": 2.3287405394915583, + "grad_norm": 1.691316437747956, + "learning_rate": 1.122396002336297e-05, + "loss": 0.4683, + "step": 24000 + }, + { + "epoch": 2.3287405394915583, + "eval_loss": 0.6269034147262573, + "eval_runtime": 3077.756, + "eval_samples_per_second": 0.582, + "eval_steps_per_second": 0.291, + "step": 24000 + }, + { + "epoch": 2.3297108480496798, + "grad_norm": 2.040492940843433, + "learning_rate": 1.1207735738853916e-05, + "loss": 0.4569, + "step": 24010 + }, + { + "epoch": 2.330681156607801, + "grad_norm": 1.7971740640523082, + "learning_rate": 1.1191511454344864e-05, + "loss": 0.4495, + "step": 24020 + }, + { + "epoch": 2.3316514651659226, + "grad_norm": 1.939848570121438, + "learning_rate": 1.117528716983581e-05, + "loss": 0.4794, + "step": 24030 + }, + { + "epoch": 2.3326217737240444, + "grad_norm": 2.032618108671819, + "learning_rate": 1.1159062885326757e-05, + "loss": 0.4446, + "step": 24040 + }, + { + "epoch": 2.333592082282166, + "grad_norm": 1.9156260447966438, + "learning_rate": 1.1142838600817703e-05, + "loss": 0.4687, + "step": 24050 + }, + { + "epoch": 2.3345623908402873, + "grad_norm": 1.9351404363288518, + "learning_rate": 1.1126614316308651e-05, + "loss": 0.5079, + "step": 24060 + }, + { + "epoch": 2.3355326993984087, + "grad_norm": 1.9150652539150566, + "learning_rate": 1.11103900317996e-05, + "loss": 0.4673, + "step": 24070 + }, + { + "epoch": 2.33650300795653, + "grad_norm": 1.8679864315268337, + "learning_rate": 1.1094165747290546e-05, + "loss": 0.4291, + "step": 24080 + }, + { + "epoch": 2.3374733165146515, + "grad_norm": 2.414373392352731, + "learning_rate": 1.1077941462781492e-05, + "loss": 0.4738, + "step": 24090 + }, + { + "epoch": 2.338443625072773, + "grad_norm": 2.5480843998856466, + "learning_rate": 1.1061717178272439e-05, + "loss": 0.4281, + "step": 24100 + }, + { + "epoch": 2.339413933630895, + "grad_norm": 1.7367372410706, + "learning_rate": 1.1045492893763387e-05, + "loss": 0.483, + "step": 24110 + }, + { + "epoch": 2.340384242189016, + "grad_norm": 1.8020643362909123, + "learning_rate": 1.1029268609254333e-05, + "loss": 0.4534, + "step": 24120 + }, + { + "epoch": 2.3413545507471376, + "grad_norm": 2.0532213004634117, + "learning_rate": 1.101304432474528e-05, + "loss": 0.4691, + "step": 24130 + }, + { + "epoch": 2.342324859305259, + "grad_norm": 1.825257767760241, + "learning_rate": 1.0996820040236226e-05, + "loss": 0.488, + "step": 24140 + }, + { + "epoch": 2.3432951678633804, + "grad_norm": 2.0243831093236513, + "learning_rate": 1.0980595755727172e-05, + "loss": 0.4846, + "step": 24150 + }, + { + "epoch": 2.344265476421502, + "grad_norm": 2.249331730728062, + "learning_rate": 1.096437147121812e-05, + "loss": 0.4578, + "step": 24160 + }, + { + "epoch": 2.3452357849796237, + "grad_norm": 2.147870816517021, + "learning_rate": 1.0948147186709067e-05, + "loss": 0.4453, + "step": 24170 + }, + { + "epoch": 2.346206093537745, + "grad_norm": 1.8861077026920583, + "learning_rate": 1.0931922902200013e-05, + "loss": 0.4214, + "step": 24180 + }, + { + "epoch": 2.3471764020958665, + "grad_norm": 1.9335635244208784, + "learning_rate": 1.091569861769096e-05, + "loss": 0.4053, + "step": 24190 + }, + { + "epoch": 2.348146710653988, + "grad_norm": 2.1792571618590117, + "learning_rate": 1.0899474333181907e-05, + "loss": 0.4502, + "step": 24200 + }, + { + "epoch": 2.3491170192121094, + "grad_norm": 1.8035101955175616, + "learning_rate": 1.0883250048672854e-05, + "loss": 0.4777, + "step": 24210 + }, + { + "epoch": 2.3500873277702308, + "grad_norm": 2.3748193703673506, + "learning_rate": 1.08670257641638e-05, + "loss": 0.4348, + "step": 24220 + }, + { + "epoch": 2.3510576363283526, + "grad_norm": 2.093876681748244, + "learning_rate": 1.0850801479654747e-05, + "loss": 0.4801, + "step": 24230 + }, + { + "epoch": 2.352027944886474, + "grad_norm": 2.0917507142883025, + "learning_rate": 1.0834577195145695e-05, + "loss": 0.4622, + "step": 24240 + }, + { + "epoch": 2.3529982534445955, + "grad_norm": 2.4449607859132816, + "learning_rate": 1.0818352910636643e-05, + "loss": 0.462, + "step": 24250 + }, + { + "epoch": 2.353968562002717, + "grad_norm": 2.1889912797955877, + "learning_rate": 1.080212862612759e-05, + "loss": 0.4614, + "step": 24260 + }, + { + "epoch": 2.3549388705608383, + "grad_norm": 1.7339445133001758, + "learning_rate": 1.0785904341618536e-05, + "loss": 0.4477, + "step": 24270 + }, + { + "epoch": 2.3559091791189597, + "grad_norm": 2.2974387900444064, + "learning_rate": 1.0769680057109482e-05, + "loss": 0.488, + "step": 24280 + }, + { + "epoch": 2.356879487677081, + "grad_norm": 1.9216704451577764, + "learning_rate": 1.075345577260043e-05, + "loss": 0.4602, + "step": 24290 + }, + { + "epoch": 2.357849796235203, + "grad_norm": 1.595623484078005, + "learning_rate": 1.0737231488091376e-05, + "loss": 0.4618, + "step": 24300 + }, + { + "epoch": 2.3588201047933244, + "grad_norm": 2.1389381123821973, + "learning_rate": 1.0721007203582323e-05, + "loss": 0.4109, + "step": 24310 + }, + { + "epoch": 2.359790413351446, + "grad_norm": 2.1703895980886294, + "learning_rate": 1.070478291907327e-05, + "loss": 0.4755, + "step": 24320 + }, + { + "epoch": 2.360760721909567, + "grad_norm": 2.101496536635865, + "learning_rate": 1.0688558634564216e-05, + "loss": 0.4371, + "step": 24330 + }, + { + "epoch": 2.3617310304676886, + "grad_norm": 2.201243439271482, + "learning_rate": 1.0672334350055164e-05, + "loss": 0.4252, + "step": 24340 + }, + { + "epoch": 2.36270133902581, + "grad_norm": 2.067190247810472, + "learning_rate": 1.065611006554611e-05, + "loss": 0.4684, + "step": 24350 + }, + { + "epoch": 2.3636716475839314, + "grad_norm": 1.593808072912525, + "learning_rate": 1.0639885781037056e-05, + "loss": 0.4474, + "step": 24360 + }, + { + "epoch": 2.3646419561420533, + "grad_norm": 2.3672169952188344, + "learning_rate": 1.0623661496528003e-05, + "loss": 0.4543, + "step": 24370 + }, + { + "epoch": 2.3656122647001747, + "grad_norm": 2.173568364373994, + "learning_rate": 1.060743721201895e-05, + "loss": 0.4172, + "step": 24380 + }, + { + "epoch": 2.366582573258296, + "grad_norm": 2.0059231401496005, + "learning_rate": 1.0591212927509897e-05, + "loss": 0.4735, + "step": 24390 + }, + { + "epoch": 2.3675528818164175, + "grad_norm": 2.0851213756560187, + "learning_rate": 1.0574988643000844e-05, + "loss": 0.4363, + "step": 24400 + }, + { + "epoch": 2.368523190374539, + "grad_norm": 2.05634160028851, + "learning_rate": 1.055876435849179e-05, + "loss": 0.5026, + "step": 24410 + }, + { + "epoch": 2.369493498932661, + "grad_norm": 2.2318992281649583, + "learning_rate": 1.0542540073982738e-05, + "loss": 0.4272, + "step": 24420 + }, + { + "epoch": 2.3704638074907822, + "grad_norm": 2.3837912348792862, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.4873, + "step": 24430 + }, + { + "epoch": 2.3714341160489036, + "grad_norm": 1.9839906718631033, + "learning_rate": 1.051009150496463e-05, + "loss": 0.4689, + "step": 24440 + }, + { + "epoch": 2.372404424607025, + "grad_norm": 2.9092572102641374, + "learning_rate": 1.0493867220455579e-05, + "loss": 0.455, + "step": 24450 + }, + { + "epoch": 2.3733747331651465, + "grad_norm": 1.783405495522693, + "learning_rate": 1.0477642935946525e-05, + "loss": 0.47, + "step": 24460 + }, + { + "epoch": 2.374345041723268, + "grad_norm": 2.4188709581674, + "learning_rate": 1.0461418651437473e-05, + "loss": 0.4488, + "step": 24470 + }, + { + "epoch": 2.3753153502813893, + "grad_norm": 2.014782666224647, + "learning_rate": 1.044519436692842e-05, + "loss": 0.4533, + "step": 24480 + }, + { + "epoch": 2.376285658839511, + "grad_norm": 2.174178898015034, + "learning_rate": 1.0428970082419366e-05, + "loss": 0.444, + "step": 24490 + }, + { + "epoch": 2.3772559673976326, + "grad_norm": 2.485611502054494, + "learning_rate": 1.0412745797910313e-05, + "loss": 0.5027, + "step": 24500 + }, + { + "epoch": 2.3772559673976326, + "eval_loss": 0.6276779174804688, + "eval_runtime": 3079.5235, + "eval_samples_per_second": 0.582, + "eval_steps_per_second": 0.291, + "step": 24500 + }, + { + "epoch": 2.378226275955754, + "grad_norm": 2.5871995720796184, + "learning_rate": 1.0396521513401259e-05, + "loss": 0.4382, + "step": 24510 + }, + { + "epoch": 2.3791965845138754, + "grad_norm": 1.536190288984018, + "learning_rate": 1.0380297228892207e-05, + "loss": 0.4473, + "step": 24520 + }, + { + "epoch": 2.380166893071997, + "grad_norm": 1.6459160347807742, + "learning_rate": 1.0364072944383153e-05, + "loss": 0.4078, + "step": 24530 + }, + { + "epoch": 2.381137201630118, + "grad_norm": 2.152035121079775, + "learning_rate": 1.03478486598741e-05, + "loss": 0.4447, + "step": 24540 + }, + { + "epoch": 2.3821075101882396, + "grad_norm": 2.369193332564519, + "learning_rate": 1.0331624375365046e-05, + "loss": 0.3994, + "step": 24550 + }, + { + "epoch": 2.3830778187463615, + "grad_norm": 2.1514780345815256, + "learning_rate": 1.0315400090855994e-05, + "loss": 0.496, + "step": 24560 + }, + { + "epoch": 2.384048127304483, + "grad_norm": 1.7911554347140786, + "learning_rate": 1.029917580634694e-05, + "loss": 0.4701, + "step": 24570 + }, + { + "epoch": 2.3850184358626043, + "grad_norm": 2.0153664778745854, + "learning_rate": 1.0282951521837887e-05, + "loss": 0.4219, + "step": 24580 + }, + { + "epoch": 2.3859887444207257, + "grad_norm": 1.80318954794296, + "learning_rate": 1.0266727237328833e-05, + "loss": 0.4683, + "step": 24590 + }, + { + "epoch": 2.386959052978847, + "grad_norm": 2.588414199711988, + "learning_rate": 1.0250502952819781e-05, + "loss": 0.5015, + "step": 24600 + }, + { + "epoch": 2.387929361536969, + "grad_norm": 2.4217752650788613, + "learning_rate": 1.0234278668310728e-05, + "loss": 0.4975, + "step": 24610 + }, + { + "epoch": 2.3888996700950904, + "grad_norm": 1.794705542002023, + "learning_rate": 1.0218054383801674e-05, + "loss": 0.4836, + "step": 24620 + }, + { + "epoch": 2.389869978653212, + "grad_norm": 1.7631883574466254, + "learning_rate": 1.020183009929262e-05, + "loss": 0.5004, + "step": 24630 + }, + { + "epoch": 2.3908402872113332, + "grad_norm": 2.510901529741251, + "learning_rate": 1.0185605814783569e-05, + "loss": 0.4626, + "step": 24640 + }, + { + "epoch": 2.3918105957694547, + "grad_norm": 2.0262233819009157, + "learning_rate": 1.0169381530274517e-05, + "loss": 0.4289, + "step": 24650 + }, + { + "epoch": 2.392780904327576, + "grad_norm": 2.039691163120118, + "learning_rate": 1.0153157245765463e-05, + "loss": 0.4712, + "step": 24660 + }, + { + "epoch": 2.3937512128856975, + "grad_norm": 2.3530360716419665, + "learning_rate": 1.013693296125641e-05, + "loss": 0.4661, + "step": 24670 + }, + { + "epoch": 2.3947215214438193, + "grad_norm": 1.8731826227321207, + "learning_rate": 1.0120708676747356e-05, + "loss": 0.4602, + "step": 24680 + }, + { + "epoch": 2.3956918300019407, + "grad_norm": 2.1068678736668067, + "learning_rate": 1.0104484392238304e-05, + "loss": 0.3962, + "step": 24690 + }, + { + "epoch": 2.396662138560062, + "grad_norm": 2.0442268653871443, + "learning_rate": 1.008826010772925e-05, + "loss": 0.4241, + "step": 24700 + }, + { + "epoch": 2.3976324471181836, + "grad_norm": 1.823987854926844, + "learning_rate": 1.0072035823220197e-05, + "loss": 0.4332, + "step": 24710 + }, + { + "epoch": 2.398602755676305, + "grad_norm": 2.0466476195429433, + "learning_rate": 1.0055811538711143e-05, + "loss": 0.4973, + "step": 24720 + }, + { + "epoch": 2.3995730642344264, + "grad_norm": 2.2091928863349706, + "learning_rate": 1.003958725420209e-05, + "loss": 0.4561, + "step": 24730 + }, + { + "epoch": 2.400543372792548, + "grad_norm": 2.0728809538988417, + "learning_rate": 1.0023362969693038e-05, + "loss": 0.4311, + "step": 24740 + }, + { + "epoch": 2.4015136813506697, + "grad_norm": 2.5482352204418612, + "learning_rate": 1.0007138685183984e-05, + "loss": 0.5071, + "step": 24750 + }, + { + "epoch": 2.402483989908791, + "grad_norm": 1.9610940178733647, + "learning_rate": 9.99091440067493e-06, + "loss": 0.4401, + "step": 24760 + }, + { + "epoch": 2.4034542984669125, + "grad_norm": 2.3445676431912448, + "learning_rate": 9.974690116165877e-06, + "loss": 0.4482, + "step": 24770 + }, + { + "epoch": 2.404424607025034, + "grad_norm": 1.6828772581664517, + "learning_rate": 9.958465831656825e-06, + "loss": 0.4489, + "step": 24780 + }, + { + "epoch": 2.4053949155831553, + "grad_norm": 2.126865830634638, + "learning_rate": 9.942241547147771e-06, + "loss": 0.4136, + "step": 24790 + }, + { + "epoch": 2.4063652241412767, + "grad_norm": 1.6333253702974164, + "learning_rate": 9.926017262638718e-06, + "loss": 0.4479, + "step": 24800 + }, + { + "epoch": 2.4073355326993986, + "grad_norm": 2.05443116143666, + "learning_rate": 9.909792978129664e-06, + "loss": 0.4806, + "step": 24810 + }, + { + "epoch": 2.40830584125752, + "grad_norm": 2.1173326880866234, + "learning_rate": 9.893568693620612e-06, + "loss": 0.496, + "step": 24820 + }, + { + "epoch": 2.4092761498156414, + "grad_norm": 2.6271672766867185, + "learning_rate": 9.877344409111558e-06, + "loss": 0.4953, + "step": 24830 + }, + { + "epoch": 2.410246458373763, + "grad_norm": 1.7740927010946907, + "learning_rate": 9.861120124602506e-06, + "loss": 0.445, + "step": 24840 + }, + { + "epoch": 2.4112167669318842, + "grad_norm": 1.7938164516772293, + "learning_rate": 9.844895840093453e-06, + "loss": 0.4651, + "step": 24850 + }, + { + "epoch": 2.4121870754900057, + "grad_norm": 1.876181751989744, + "learning_rate": 9.8286715555844e-06, + "loss": 0.4742, + "step": 24860 + }, + { + "epoch": 2.4131573840481275, + "grad_norm": 2.268711698650067, + "learning_rate": 9.812447271075347e-06, + "loss": 0.4858, + "step": 24870 + }, + { + "epoch": 2.414127692606249, + "grad_norm": 2.002023556609326, + "learning_rate": 9.796222986566294e-06, + "loss": 0.437, + "step": 24880 + }, + { + "epoch": 2.4150980011643703, + "grad_norm": 2.2377584218251787, + "learning_rate": 9.77999870205724e-06, + "loss": 0.4631, + "step": 24890 + }, + { + "epoch": 2.4160683097224918, + "grad_norm": 2.048732665820942, + "learning_rate": 9.763774417548186e-06, + "loss": 0.4828, + "step": 24900 + }, + { + "epoch": 2.417038618280613, + "grad_norm": 1.6448348280754015, + "learning_rate": 9.747550133039133e-06, + "loss": 0.5178, + "step": 24910 + }, + { + "epoch": 2.4180089268387346, + "grad_norm": 1.5590200283867917, + "learning_rate": 9.731325848530081e-06, + "loss": 0.4582, + "step": 24920 + }, + { + "epoch": 2.418979235396856, + "grad_norm": 1.9618039375978873, + "learning_rate": 9.715101564021027e-06, + "loss": 0.4207, + "step": 24930 + }, + { + "epoch": 2.419949543954978, + "grad_norm": 2.067363699561904, + "learning_rate": 9.698877279511974e-06, + "loss": 0.4825, + "step": 24940 + }, + { + "epoch": 2.4209198525130993, + "grad_norm": 2.1824993479688635, + "learning_rate": 9.68265299500292e-06, + "loss": 0.4507, + "step": 24950 + }, + { + "epoch": 2.4218901610712207, + "grad_norm": 1.9138947437636256, + "learning_rate": 9.666428710493868e-06, + "loss": 0.4576, + "step": 24960 + }, + { + "epoch": 2.422860469629342, + "grad_norm": 1.6115918485232108, + "learning_rate": 9.650204425984815e-06, + "loss": 0.4705, + "step": 24970 + }, + { + "epoch": 2.4238307781874635, + "grad_norm": 2.250793958728259, + "learning_rate": 9.633980141475761e-06, + "loss": 0.4521, + "step": 24980 + }, + { + "epoch": 2.424801086745585, + "grad_norm": 1.8523875454672862, + "learning_rate": 9.617755856966707e-06, + "loss": 0.4397, + "step": 24990 + }, + { + "epoch": 2.4257713953037063, + "grad_norm": 1.904730107613101, + "learning_rate": 9.601531572457655e-06, + "loss": 0.4564, + "step": 25000 + }, + { + "epoch": 2.4257713953037063, + "eval_loss": 0.6268747448921204, + "eval_runtime": 3077.7529, + "eval_samples_per_second": 0.582, + "eval_steps_per_second": 0.291, + "step": 25000 + }, + { + "epoch": 2.426741703861828, + "grad_norm": 2.161806358765534, + "learning_rate": 9.585307287948602e-06, + "loss": 0.4854, + "step": 25010 + }, + { + "epoch": 2.4277120124199496, + "grad_norm": 2.3367238235277483, + "learning_rate": 9.569083003439548e-06, + "loss": 0.4657, + "step": 25020 + }, + { + "epoch": 2.428682320978071, + "grad_norm": 1.7620434830381735, + "learning_rate": 9.552858718930495e-06, + "loss": 0.4716, + "step": 25030 + }, + { + "epoch": 2.4296526295361924, + "grad_norm": 1.9332408663776153, + "learning_rate": 9.536634434421443e-06, + "loss": 0.4321, + "step": 25040 + }, + { + "epoch": 2.430622938094314, + "grad_norm": 2.0611278551192465, + "learning_rate": 9.52041014991239e-06, + "loss": 0.4677, + "step": 25050 + }, + { + "epoch": 2.4315932466524357, + "grad_norm": 1.6768258233531348, + "learning_rate": 9.504185865403337e-06, + "loss": 0.462, + "step": 25060 + }, + { + "epoch": 2.432563555210557, + "grad_norm": 2.3845648710174285, + "learning_rate": 9.487961580894283e-06, + "loss": 0.4557, + "step": 25070 + }, + { + "epoch": 2.4335338637686785, + "grad_norm": 2.1104954575429047, + "learning_rate": 9.47173729638523e-06, + "loss": 0.4731, + "step": 25080 + }, + { + "epoch": 2.4345041723268, + "grad_norm": 2.0800318180116446, + "learning_rate": 9.455513011876176e-06, + "loss": 0.5289, + "step": 25090 + }, + { + "epoch": 2.4354744808849214, + "grad_norm": 2.237535485463924, + "learning_rate": 9.439288727367124e-06, + "loss": 0.456, + "step": 25100 + }, + { + "epoch": 2.4364447894430428, + "grad_norm": 2.160116067888104, + "learning_rate": 9.42306444285807e-06, + "loss": 0.4984, + "step": 25110 + }, + { + "epoch": 2.437415098001164, + "grad_norm": 2.0888945069063336, + "learning_rate": 9.406840158349017e-06, + "loss": 0.4893, + "step": 25120 + }, + { + "epoch": 2.438385406559286, + "grad_norm": 1.997111795963389, + "learning_rate": 9.390615873839963e-06, + "loss": 0.4434, + "step": 25130 + }, + { + "epoch": 2.4393557151174075, + "grad_norm": 1.3102614971883442, + "learning_rate": 9.374391589330911e-06, + "loss": 0.4015, + "step": 25140 + }, + { + "epoch": 2.440326023675529, + "grad_norm": 1.972554181589825, + "learning_rate": 9.358167304821858e-06, + "loss": 0.4408, + "step": 25150 + }, + { + "epoch": 2.4412963322336503, + "grad_norm": 1.6989988042772415, + "learning_rate": 9.341943020312804e-06, + "loss": 0.435, + "step": 25160 + }, + { + "epoch": 2.4422666407917717, + "grad_norm": 2.124313923821145, + "learning_rate": 9.32571873580375e-06, + "loss": 0.4387, + "step": 25170 + }, + { + "epoch": 2.443236949349893, + "grad_norm": 2.19131174810045, + "learning_rate": 9.309494451294699e-06, + "loss": 0.4672, + "step": 25180 + }, + { + "epoch": 2.4442072579080145, + "grad_norm": 1.962932141350282, + "learning_rate": 9.293270166785645e-06, + "loss": 0.4983, + "step": 25190 + }, + { + "epoch": 2.4451775664661364, + "grad_norm": 1.9122705512506952, + "learning_rate": 9.277045882276591e-06, + "loss": 0.4272, + "step": 25200 + }, + { + "epoch": 2.446147875024258, + "grad_norm": 2.3053425047807736, + "learning_rate": 9.260821597767538e-06, + "loss": 0.4199, + "step": 25210 + }, + { + "epoch": 2.447118183582379, + "grad_norm": 1.7446501480241867, + "learning_rate": 9.244597313258486e-06, + "loss": 0.4457, + "step": 25220 + }, + { + "epoch": 2.4480884921405006, + "grad_norm": 2.1569409270025774, + "learning_rate": 9.228373028749434e-06, + "loss": 0.4419, + "step": 25230 + }, + { + "epoch": 2.449058800698622, + "grad_norm": 2.0436766020436394, + "learning_rate": 9.21214874424038e-06, + "loss": 0.494, + "step": 25240 + }, + { + "epoch": 2.450029109256744, + "grad_norm": 1.7972560660936232, + "learning_rate": 9.195924459731327e-06, + "loss": 0.45, + "step": 25250 + }, + { + "epoch": 2.4509994178148653, + "grad_norm": 2.323226898620802, + "learning_rate": 9.179700175222273e-06, + "loss": 0.5016, + "step": 25260 + }, + { + "epoch": 2.4519697263729867, + "grad_norm": 2.0725138170956408, + "learning_rate": 9.16347589071322e-06, + "loss": 0.4894, + "step": 25270 + }, + { + "epoch": 2.452940034931108, + "grad_norm": 1.9222900586962415, + "learning_rate": 9.147251606204168e-06, + "loss": 0.5101, + "step": 25280 + }, + { + "epoch": 2.4539103434892295, + "grad_norm": 2.0067752335911684, + "learning_rate": 9.131027321695114e-06, + "loss": 0.4418, + "step": 25290 + }, + { + "epoch": 2.454880652047351, + "grad_norm": 2.1844091982849587, + "learning_rate": 9.11480303718606e-06, + "loss": 0.4812, + "step": 25300 + }, + { + "epoch": 2.4558509606054724, + "grad_norm": 2.1484177813886487, + "learning_rate": 9.098578752677007e-06, + "loss": 0.4516, + "step": 25310 + }, + { + "epoch": 2.4568212691635942, + "grad_norm": 2.070642163273252, + "learning_rate": 9.082354468167955e-06, + "loss": 0.402, + "step": 25320 + }, + { + "epoch": 2.4577915777217156, + "grad_norm": 1.8576636206526207, + "learning_rate": 9.066130183658901e-06, + "loss": 0.4864, + "step": 25330 + }, + { + "epoch": 2.458761886279837, + "grad_norm": 1.9269579474919305, + "learning_rate": 9.049905899149848e-06, + "loss": 0.4647, + "step": 25340 + }, + { + "epoch": 2.4597321948379585, + "grad_norm": 1.79111642345789, + "learning_rate": 9.033681614640794e-06, + "loss": 0.4454, + "step": 25350 + }, + { + "epoch": 2.46070250339608, + "grad_norm": 1.788304393616984, + "learning_rate": 9.017457330131742e-06, + "loss": 0.4422, + "step": 25360 + }, + { + "epoch": 2.4616728119542013, + "grad_norm": 2.2543923009140734, + "learning_rate": 9.001233045622688e-06, + "loss": 0.4413, + "step": 25370 + }, + { + "epoch": 2.4626431205123227, + "grad_norm": 1.1384131071115926, + "learning_rate": 8.985008761113635e-06, + "loss": 0.3985, + "step": 25380 + }, + { + "epoch": 2.4636134290704446, + "grad_norm": 2.105028387028187, + "learning_rate": 8.968784476604581e-06, + "loss": 0.4467, + "step": 25390 + }, + { + "epoch": 2.464583737628566, + "grad_norm": 1.925526835319373, + "learning_rate": 8.95256019209553e-06, + "loss": 0.4567, + "step": 25400 + }, + { + "epoch": 2.4655540461866874, + "grad_norm": 2.229525401490896, + "learning_rate": 8.936335907586476e-06, + "loss": 0.4889, + "step": 25410 + }, + { + "epoch": 2.466524354744809, + "grad_norm": 1.9389731054706656, + "learning_rate": 8.920111623077422e-06, + "loss": 0.3961, + "step": 25420 + }, + { + "epoch": 2.46749466330293, + "grad_norm": 2.0732327430253985, + "learning_rate": 8.90388733856837e-06, + "loss": 0.4688, + "step": 25430 + }, + { + "epoch": 2.4684649718610516, + "grad_norm": 2.0697499575240665, + "learning_rate": 8.887663054059317e-06, + "loss": 0.4383, + "step": 25440 + }, + { + "epoch": 2.4694352804191735, + "grad_norm": 1.7921240641401464, + "learning_rate": 8.871438769550265e-06, + "loss": 0.4513, + "step": 25450 + }, + { + "epoch": 2.470405588977295, + "grad_norm": 1.3473761062363179, + "learning_rate": 8.855214485041211e-06, + "loss": 0.4384, + "step": 25460 + }, + { + "epoch": 2.4713758975354163, + "grad_norm": 1.7010157532863919, + "learning_rate": 8.838990200532157e-06, + "loss": 0.4791, + "step": 25470 + }, + { + "epoch": 2.4723462060935377, + "grad_norm": 1.3896979150167772, + "learning_rate": 8.822765916023104e-06, + "loss": 0.4421, + "step": 25480 + }, + { + "epoch": 2.473316514651659, + "grad_norm": 1.8009616913979432, + "learning_rate": 8.80654163151405e-06, + "loss": 0.4873, + "step": 25490 + }, + { + "epoch": 2.4742868232097806, + "grad_norm": 2.2299861589049885, + "learning_rate": 8.790317347004998e-06, + "loss": 0.4032, + "step": 25500 + }, + { + "epoch": 2.4742868232097806, + "eval_loss": 0.6272784471511841, + "eval_runtime": 3078.0792, + "eval_samples_per_second": 0.582, + "eval_steps_per_second": 0.291, + "step": 25500 + } + ], + "logging_steps": 10, + "max_steps": 30918, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.02536313307136e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}