{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.23170968367941, "eval_steps": 500, "global_step": 23000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009703085581214826, "grad_norm": 0.5093896998931756, "learning_rate": 5e-06, "loss": 1.7158, "step": 10 }, { "epoch": 0.0019406171162429653, "grad_norm": 0.5499013987843694, "learning_rate": 1e-05, "loss": 1.6625, "step": 20 }, { "epoch": 0.002910925674364448, "grad_norm": 0.6551534985295564, "learning_rate": 1.5e-05, "loss": 1.6378, "step": 30 }, { "epoch": 0.0038812342324859306, "grad_norm": 0.7544385328936806, "learning_rate": 2e-05, "loss": 1.5712, "step": 40 }, { "epoch": 0.004851542790607413, "grad_norm": 0.6539371672140126, "learning_rate": 2.5e-05, "loss": 1.5217, "step": 50 }, { "epoch": 0.005821851348728896, "grad_norm": 0.6806376549433046, "learning_rate": 3e-05, "loss": 1.4412, "step": 60 }, { "epoch": 0.0067921599068503785, "grad_norm": 0.9109066482466388, "learning_rate": 3.5e-05, "loss": 1.2303, "step": 70 }, { "epoch": 0.007762468464971861, "grad_norm": 0.8941485474193254, "learning_rate": 4e-05, "loss": 1.1185, "step": 80 }, { "epoch": 0.008732777023093344, "grad_norm": 0.6564948122239989, "learning_rate": 4.5e-05, "loss": 1.0481, "step": 90 }, { "epoch": 0.009703085581214826, "grad_norm": 0.7749322437088195, "learning_rate": 5e-05, "loss": 0.9965, "step": 100 }, { "epoch": 0.010673394139336309, "grad_norm": 0.7305812400337627, "learning_rate": 4.998377571549095e-05, "loss": 1.0689, "step": 110 }, { "epoch": 0.011643702697457792, "grad_norm": 0.7320600025098228, "learning_rate": 4.9967551430981895e-05, "loss": 0.9554, "step": 120 }, { "epoch": 0.012614011255579274, "grad_norm": 0.7997476828820285, "learning_rate": 4.9951327146472845e-05, "loss": 0.9048, "step": 130 }, { "epoch": 0.013584319813700757, "grad_norm": 0.9942301355244718, "learning_rate": 4.993510286196379e-05, "loss": 0.9736, "step": 140 }, { "epoch": 0.01455462837182224, "grad_norm": 0.8990043753777502, "learning_rate": 4.991887857745474e-05, "loss": 0.9447, "step": 150 }, { "epoch": 0.015524936929943722, "grad_norm": 0.6931013922842434, "learning_rate": 4.990265429294568e-05, "loss": 0.8748, "step": 160 }, { "epoch": 0.016495245488065205, "grad_norm": 0.8305007689150241, "learning_rate": 4.988643000843663e-05, "loss": 0.9467, "step": 170 }, { "epoch": 0.017465554046186688, "grad_norm": 0.8993058386814992, "learning_rate": 4.9870205723927573e-05, "loss": 0.9394, "step": 180 }, { "epoch": 0.01843586260430817, "grad_norm": 0.7486900307747556, "learning_rate": 4.985398143941852e-05, "loss": 0.931, "step": 190 }, { "epoch": 0.019406171162429653, "grad_norm": 0.8879788388070488, "learning_rate": 4.983775715490947e-05, "loss": 0.9217, "step": 200 }, { "epoch": 0.020376479720551136, "grad_norm": 0.7351527409279133, "learning_rate": 4.9821532870400416e-05, "loss": 0.8438, "step": 210 }, { "epoch": 0.021346788278672618, "grad_norm": 0.9651267349255982, "learning_rate": 4.9805308585891366e-05, "loss": 0.8806, "step": 220 }, { "epoch": 0.0223170968367941, "grad_norm": 0.8358141761944308, "learning_rate": 4.978908430138231e-05, "loss": 0.9075, "step": 230 }, { "epoch": 0.023287405394915583, "grad_norm": 0.9068073418448239, "learning_rate": 4.977286001687326e-05, "loss": 0.9126, "step": 240 }, { "epoch": 0.024257713953037066, "grad_norm": 0.8510875679665427, "learning_rate": 4.97566357323642e-05, "loss": 0.8509, "step": 250 }, { "epoch": 0.02522802251115855, "grad_norm": 0.8371392324883873, "learning_rate": 4.974041144785515e-05, "loss": 0.9018, "step": 260 }, { "epoch": 0.02619833106928003, "grad_norm": 0.7343155911324605, "learning_rate": 4.9724187163346094e-05, "loss": 0.9251, "step": 270 }, { "epoch": 0.027168639627401514, "grad_norm": 1.012390858760769, "learning_rate": 4.9707962878837044e-05, "loss": 0.8881, "step": 280 }, { "epoch": 0.028138948185522997, "grad_norm": 0.8092967868905977, "learning_rate": 4.9691738594327994e-05, "loss": 0.8831, "step": 290 }, { "epoch": 0.02910925674364448, "grad_norm": 0.9545292755123069, "learning_rate": 4.967551430981894e-05, "loss": 0.8871, "step": 300 }, { "epoch": 0.030079565301765962, "grad_norm": 0.8451115258584492, "learning_rate": 4.9659290025309887e-05, "loss": 0.8913, "step": 310 }, { "epoch": 0.031049873859887445, "grad_norm": 0.984138459636415, "learning_rate": 4.964306574080083e-05, "loss": 0.9046, "step": 320 }, { "epoch": 0.03202018241800893, "grad_norm": 0.9275621393016203, "learning_rate": 4.962684145629178e-05, "loss": 0.88, "step": 330 }, { "epoch": 0.03299049097613041, "grad_norm": 1.0192818586577115, "learning_rate": 4.961061717178272e-05, "loss": 0.8923, "step": 340 }, { "epoch": 0.03396079953425189, "grad_norm": 0.9779533854801943, "learning_rate": 4.959439288727367e-05, "loss": 0.868, "step": 350 }, { "epoch": 0.034931108092373375, "grad_norm": 0.8554421373526296, "learning_rate": 4.957816860276462e-05, "loss": 0.8661, "step": 360 }, { "epoch": 0.03590141665049486, "grad_norm": 0.7404878078324254, "learning_rate": 4.9561944318255565e-05, "loss": 0.8883, "step": 370 }, { "epoch": 0.03687172520861634, "grad_norm": 0.9879847832445883, "learning_rate": 4.9545720033746515e-05, "loss": 0.8156, "step": 380 }, { "epoch": 0.03784203376673782, "grad_norm": 1.0209320862188216, "learning_rate": 4.952949574923746e-05, "loss": 0.8212, "step": 390 }, { "epoch": 0.038812342324859306, "grad_norm": 1.0823734215432599, "learning_rate": 4.951327146472841e-05, "loss": 0.8619, "step": 400 }, { "epoch": 0.03978265088298079, "grad_norm": 1.033686091190584, "learning_rate": 4.949704718021935e-05, "loss": 0.818, "step": 410 }, { "epoch": 0.04075295944110227, "grad_norm": 0.9587642487246072, "learning_rate": 4.94808228957103e-05, "loss": 0.8377, "step": 420 }, { "epoch": 0.041723267999223754, "grad_norm": 0.8072742668289667, "learning_rate": 4.946459861120124e-05, "loss": 0.8043, "step": 430 }, { "epoch": 0.042693576557345236, "grad_norm": 1.0090353092499478, "learning_rate": 4.944837432669219e-05, "loss": 0.8829, "step": 440 }, { "epoch": 0.04366388511546672, "grad_norm": 1.177538600328968, "learning_rate": 4.943215004218314e-05, "loss": 0.859, "step": 450 }, { "epoch": 0.0446341936735882, "grad_norm": 1.225063127801835, "learning_rate": 4.9415925757674086e-05, "loss": 0.8231, "step": 460 }, { "epoch": 0.045604502231709684, "grad_norm": 0.8224632705866025, "learning_rate": 4.9399701473165035e-05, "loss": 0.8335, "step": 470 }, { "epoch": 0.04657481078983117, "grad_norm": 1.153934818941284, "learning_rate": 4.938347718865598e-05, "loss": 0.8046, "step": 480 }, { "epoch": 0.04754511934795265, "grad_norm": 1.0954567838682858, "learning_rate": 4.936725290414693e-05, "loss": 0.8975, "step": 490 }, { "epoch": 0.04851542790607413, "grad_norm": 1.2466392758405571, "learning_rate": 4.935102861963788e-05, "loss": 0.8361, "step": 500 }, { "epoch": 0.049485736464195615, "grad_norm": 0.902356430744448, "learning_rate": 4.933480433512883e-05, "loss": 0.8048, "step": 510 }, { "epoch": 0.0504560450223171, "grad_norm": 1.01027561407788, "learning_rate": 4.931858005061977e-05, "loss": 0.8094, "step": 520 }, { "epoch": 0.05142635358043858, "grad_norm": 1.1330239054084805, "learning_rate": 4.930235576611072e-05, "loss": 0.8288, "step": 530 }, { "epoch": 0.05239666213856006, "grad_norm": 1.19235957479236, "learning_rate": 4.9286131481601664e-05, "loss": 0.8358, "step": 540 }, { "epoch": 0.053366970696681545, "grad_norm": 1.070882879754836, "learning_rate": 4.926990719709261e-05, "loss": 0.783, "step": 550 }, { "epoch": 0.05433727925480303, "grad_norm": 0.772829083325615, "learning_rate": 4.925368291258356e-05, "loss": 0.7888, "step": 560 }, { "epoch": 0.05530758781292451, "grad_norm": 1.0485603174488358, "learning_rate": 4.9237458628074506e-05, "loss": 0.8893, "step": 570 }, { "epoch": 0.05627789637104599, "grad_norm": 1.1365438528959075, "learning_rate": 4.9221234343565456e-05, "loss": 0.8533, "step": 580 }, { "epoch": 0.057248204929167476, "grad_norm": 1.1791401079390134, "learning_rate": 4.92050100590564e-05, "loss": 0.8208, "step": 590 }, { "epoch": 0.05821851348728896, "grad_norm": 1.165054586063552, "learning_rate": 4.918878577454735e-05, "loss": 0.7945, "step": 600 }, { "epoch": 0.05918882204541044, "grad_norm": 1.0001362775159148, "learning_rate": 4.917256149003829e-05, "loss": 0.8426, "step": 610 }, { "epoch": 0.060159130603531924, "grad_norm": 1.1955538077863535, "learning_rate": 4.915633720552924e-05, "loss": 0.8346, "step": 620 }, { "epoch": 0.06112943916165341, "grad_norm": 1.016839980469479, "learning_rate": 4.9140112921020184e-05, "loss": 0.8224, "step": 630 }, { "epoch": 0.06209974771977489, "grad_norm": 0.9782695489269584, "learning_rate": 4.9123888636511134e-05, "loss": 0.849, "step": 640 }, { "epoch": 0.06307005627789637, "grad_norm": 1.0768176346339298, "learning_rate": 4.9107664352002084e-05, "loss": 0.8519, "step": 650 }, { "epoch": 0.06404036483601785, "grad_norm": 1.0636061081219466, "learning_rate": 4.909144006749303e-05, "loss": 0.7887, "step": 660 }, { "epoch": 0.06501067339413934, "grad_norm": 1.0734895912783853, "learning_rate": 4.907521578298398e-05, "loss": 0.8415, "step": 670 }, { "epoch": 0.06598098195226082, "grad_norm": 1.0796531470403106, "learning_rate": 4.905899149847492e-05, "loss": 0.7512, "step": 680 }, { "epoch": 0.0669512905103823, "grad_norm": 0.9361767973637918, "learning_rate": 4.904276721396587e-05, "loss": 0.8482, "step": 690 }, { "epoch": 0.06792159906850379, "grad_norm": 1.5287022172498188, "learning_rate": 4.902654292945681e-05, "loss": 0.8389, "step": 700 }, { "epoch": 0.06889190762662527, "grad_norm": 1.129894676066631, "learning_rate": 4.901031864494776e-05, "loss": 0.7629, "step": 710 }, { "epoch": 0.06986221618474675, "grad_norm": 1.060923634010241, "learning_rate": 4.8994094360438705e-05, "loss": 0.8134, "step": 720 }, { "epoch": 0.07083252474286823, "grad_norm": 1.121507522572716, "learning_rate": 4.8977870075929655e-05, "loss": 0.8295, "step": 730 }, { "epoch": 0.07180283330098972, "grad_norm": 1.2375518463265478, "learning_rate": 4.8961645791420605e-05, "loss": 0.8006, "step": 740 }, { "epoch": 0.0727731418591112, "grad_norm": 1.1957590053535605, "learning_rate": 4.894542150691155e-05, "loss": 0.8185, "step": 750 }, { "epoch": 0.07374345041723268, "grad_norm": 1.1525450079069435, "learning_rate": 4.89291972224025e-05, "loss": 0.8186, "step": 760 }, { "epoch": 0.07471375897535416, "grad_norm": 1.2637346291101144, "learning_rate": 4.891297293789344e-05, "loss": 0.8094, "step": 770 }, { "epoch": 0.07568406753347565, "grad_norm": 1.1118603684784645, "learning_rate": 4.889674865338439e-05, "loss": 0.8075, "step": 780 }, { "epoch": 0.07665437609159713, "grad_norm": 0.987279065085845, "learning_rate": 4.888052436887533e-05, "loss": 0.8088, "step": 790 }, { "epoch": 0.07762468464971861, "grad_norm": 1.050050445685608, "learning_rate": 4.886430008436628e-05, "loss": 0.8283, "step": 800 }, { "epoch": 0.0785949932078401, "grad_norm": 1.1064553828881234, "learning_rate": 4.8848075799857226e-05, "loss": 0.7906, "step": 810 }, { "epoch": 0.07956530176596158, "grad_norm": 1.0679283923210974, "learning_rate": 4.8831851515348176e-05, "loss": 0.7888, "step": 820 }, { "epoch": 0.08053561032408306, "grad_norm": 1.325000406776113, "learning_rate": 4.8815627230839126e-05, "loss": 0.8573, "step": 830 }, { "epoch": 0.08150591888220454, "grad_norm": 1.1430990025005974, "learning_rate": 4.879940294633007e-05, "loss": 0.823, "step": 840 }, { "epoch": 0.08247622744032602, "grad_norm": 0.9708573314011439, "learning_rate": 4.878317866182102e-05, "loss": 0.7812, "step": 850 }, { "epoch": 0.08344653599844751, "grad_norm": 1.040516439704035, "learning_rate": 4.876695437731196e-05, "loss": 0.8043, "step": 860 }, { "epoch": 0.08441684455656899, "grad_norm": 1.4624875951419556, "learning_rate": 4.875073009280291e-05, "loss": 0.8054, "step": 870 }, { "epoch": 0.08538715311469047, "grad_norm": 1.2973550173157966, "learning_rate": 4.8734505808293854e-05, "loss": 0.7693, "step": 880 }, { "epoch": 0.08635746167281196, "grad_norm": 1.21645485433679, "learning_rate": 4.8718281523784804e-05, "loss": 0.7402, "step": 890 }, { "epoch": 0.08732777023093344, "grad_norm": 1.2989905455220712, "learning_rate": 4.870205723927575e-05, "loss": 0.7603, "step": 900 }, { "epoch": 0.08829807878905492, "grad_norm": 1.1647056182819275, "learning_rate": 4.86858329547667e-05, "loss": 0.7576, "step": 910 }, { "epoch": 0.0892683873471764, "grad_norm": 1.32821951027441, "learning_rate": 4.8669608670257646e-05, "loss": 0.7593, "step": 920 }, { "epoch": 0.09023869590529789, "grad_norm": 0.9792995911846096, "learning_rate": 4.865338438574859e-05, "loss": 0.7726, "step": 930 }, { "epoch": 0.09120900446341937, "grad_norm": 1.1716259071546666, "learning_rate": 4.863716010123954e-05, "loss": 0.7585, "step": 940 }, { "epoch": 0.09217931302154085, "grad_norm": 1.4206264005961533, "learning_rate": 4.862093581673048e-05, "loss": 0.7922, "step": 950 }, { "epoch": 0.09314962157966233, "grad_norm": 0.9969780924285648, "learning_rate": 4.860471153222143e-05, "loss": 0.8338, "step": 960 }, { "epoch": 0.09411993013778382, "grad_norm": 1.2259249945683814, "learning_rate": 4.8588487247712375e-05, "loss": 0.7848, "step": 970 }, { "epoch": 0.0950902386959053, "grad_norm": 1.304284918297249, "learning_rate": 4.8572262963203325e-05, "loss": 0.799, "step": 980 }, { "epoch": 0.09606054725402678, "grad_norm": 1.382632512351389, "learning_rate": 4.8556038678694274e-05, "loss": 0.7358, "step": 990 }, { "epoch": 0.09703085581214826, "grad_norm": 1.1047797334502243, "learning_rate": 4.853981439418522e-05, "loss": 0.7175, "step": 1000 }, { "epoch": 0.09800116437026975, "grad_norm": 1.298146739590951, "learning_rate": 4.852359010967617e-05, "loss": 0.7694, "step": 1010 }, { "epoch": 0.09897147292839123, "grad_norm": 1.3449159574026, "learning_rate": 4.850736582516711e-05, "loss": 0.7549, "step": 1020 }, { "epoch": 0.09994178148651271, "grad_norm": 1.0510958795717098, "learning_rate": 4.849114154065806e-05, "loss": 0.7851, "step": 1030 }, { "epoch": 0.1009120900446342, "grad_norm": 1.2932499353997113, "learning_rate": 4.8474917256149e-05, "loss": 0.7948, "step": 1040 }, { "epoch": 0.10188239860275568, "grad_norm": 1.024772482994231, "learning_rate": 4.845869297163995e-05, "loss": 0.7551, "step": 1050 }, { "epoch": 0.10285270716087716, "grad_norm": 0.9151226850367016, "learning_rate": 4.8442468687130896e-05, "loss": 0.8212, "step": 1060 }, { "epoch": 0.10382301571899864, "grad_norm": 1.0356064137085648, "learning_rate": 4.8426244402621846e-05, "loss": 0.7815, "step": 1070 }, { "epoch": 0.10479332427712013, "grad_norm": 1.0454220890712578, "learning_rate": 4.8410020118112795e-05, "loss": 0.7813, "step": 1080 }, { "epoch": 0.10576363283524161, "grad_norm": 1.0245673186100301, "learning_rate": 4.839379583360374e-05, "loss": 0.7759, "step": 1090 }, { "epoch": 0.10673394139336309, "grad_norm": 1.1805883290044246, "learning_rate": 4.837757154909469e-05, "loss": 0.8016, "step": 1100 }, { "epoch": 0.10770424995148457, "grad_norm": 1.305171444801399, "learning_rate": 4.836134726458563e-05, "loss": 0.7603, "step": 1110 }, { "epoch": 0.10867455850960606, "grad_norm": 1.1334982322569604, "learning_rate": 4.834512298007658e-05, "loss": 0.7818, "step": 1120 }, { "epoch": 0.10964486706772754, "grad_norm": 1.4897469886835581, "learning_rate": 4.8328898695567524e-05, "loss": 0.7391, "step": 1130 }, { "epoch": 0.11061517562584902, "grad_norm": 1.12299562877673, "learning_rate": 4.8312674411058474e-05, "loss": 0.8129, "step": 1140 }, { "epoch": 0.1115854841839705, "grad_norm": 1.2581064725802191, "learning_rate": 4.8296450126549417e-05, "loss": 0.7132, "step": 1150 }, { "epoch": 0.11255579274209199, "grad_norm": 1.1117033056933057, "learning_rate": 4.8280225842040366e-05, "loss": 0.7247, "step": 1160 }, { "epoch": 0.11352610130021347, "grad_norm": 1.5072697151276053, "learning_rate": 4.8264001557531316e-05, "loss": 0.7832, "step": 1170 }, { "epoch": 0.11449640985833495, "grad_norm": 1.3629201153146466, "learning_rate": 4.824777727302226e-05, "loss": 0.7894, "step": 1180 }, { "epoch": 0.11546671841645643, "grad_norm": 1.3456355263382838, "learning_rate": 4.823155298851321e-05, "loss": 0.8145, "step": 1190 }, { "epoch": 0.11643702697457792, "grad_norm": 1.3206439343547949, "learning_rate": 4.821532870400415e-05, "loss": 0.7802, "step": 1200 }, { "epoch": 0.1174073355326994, "grad_norm": 1.0980976013487813, "learning_rate": 4.81991044194951e-05, "loss": 0.801, "step": 1210 }, { "epoch": 0.11837764409082088, "grad_norm": 1.1370443655089153, "learning_rate": 4.8182880134986045e-05, "loss": 0.8006, "step": 1220 }, { "epoch": 0.11934795264894237, "grad_norm": 1.4354316240483984, "learning_rate": 4.8166655850476994e-05, "loss": 0.8131, "step": 1230 }, { "epoch": 0.12031826120706385, "grad_norm": 0.9112243461310537, "learning_rate": 4.815043156596794e-05, "loss": 0.8145, "step": 1240 }, { "epoch": 0.12128856976518533, "grad_norm": 0.8213091259360801, "learning_rate": 4.813420728145889e-05, "loss": 0.7114, "step": 1250 }, { "epoch": 0.12225887832330681, "grad_norm": 1.3405078182982422, "learning_rate": 4.811798299694984e-05, "loss": 0.7986, "step": 1260 }, { "epoch": 0.1232291868814283, "grad_norm": 1.4540711324279514, "learning_rate": 4.810175871244078e-05, "loss": 0.7281, "step": 1270 }, { "epoch": 0.12419949543954978, "grad_norm": 1.0040222075537582, "learning_rate": 4.8085534427931737e-05, "loss": 0.8048, "step": 1280 }, { "epoch": 0.12516980399767125, "grad_norm": 1.9264161003194282, "learning_rate": 4.806931014342268e-05, "loss": 0.7512, "step": 1290 }, { "epoch": 0.12614011255579274, "grad_norm": 1.155430871835739, "learning_rate": 4.805308585891363e-05, "loss": 0.7909, "step": 1300 }, { "epoch": 0.1271104211139142, "grad_norm": 1.382153914724162, "learning_rate": 4.803686157440457e-05, "loss": 0.7613, "step": 1310 }, { "epoch": 0.1280807296720357, "grad_norm": 1.1079273142955168, "learning_rate": 4.802063728989552e-05, "loss": 0.7747, "step": 1320 }, { "epoch": 0.12905103823015718, "grad_norm": 1.4797892743755068, "learning_rate": 4.8004413005386465e-05, "loss": 0.7448, "step": 1330 }, { "epoch": 0.13002134678827867, "grad_norm": 1.2840858833683126, "learning_rate": 4.7988188720877415e-05, "loss": 0.7567, "step": 1340 }, { "epoch": 0.13099165534640014, "grad_norm": 1.7212676971427285, "learning_rate": 4.797196443636836e-05, "loss": 0.7743, "step": 1350 }, { "epoch": 0.13196196390452164, "grad_norm": 1.7283159526025742, "learning_rate": 4.795574015185931e-05, "loss": 0.7696, "step": 1360 }, { "epoch": 0.1329322724626431, "grad_norm": 1.0739621496825589, "learning_rate": 4.793951586735026e-05, "loss": 0.7408, "step": 1370 }, { "epoch": 0.1339025810207646, "grad_norm": 1.183199109006057, "learning_rate": 4.79232915828412e-05, "loss": 0.7935, "step": 1380 }, { "epoch": 0.13487288957888607, "grad_norm": 1.3981562096537532, "learning_rate": 4.790706729833215e-05, "loss": 0.7113, "step": 1390 }, { "epoch": 0.13584319813700757, "grad_norm": 1.192516841319882, "learning_rate": 4.789084301382309e-05, "loss": 0.7786, "step": 1400 }, { "epoch": 0.13681350669512904, "grad_norm": 1.9269898255758637, "learning_rate": 4.787461872931404e-05, "loss": 0.7262, "step": 1410 }, { "epoch": 0.13778381525325054, "grad_norm": 1.1218056549529871, "learning_rate": 4.7858394444804986e-05, "loss": 0.6762, "step": 1420 }, { "epoch": 0.138754123811372, "grad_norm": 1.3635607352483248, "learning_rate": 4.7842170160295936e-05, "loss": 0.7733, "step": 1430 }, { "epoch": 0.1397244323694935, "grad_norm": 1.2380674478503626, "learning_rate": 4.782594587578688e-05, "loss": 0.7599, "step": 1440 }, { "epoch": 0.14069474092761497, "grad_norm": 1.4869366640536255, "learning_rate": 4.780972159127783e-05, "loss": 0.7408, "step": 1450 }, { "epoch": 0.14166504948573647, "grad_norm": 1.413887084722376, "learning_rate": 4.779349730676878e-05, "loss": 0.7792, "step": 1460 }, { "epoch": 0.14263535804385793, "grad_norm": 1.1522475316568597, "learning_rate": 4.777727302225972e-05, "loss": 0.7494, "step": 1470 }, { "epoch": 0.14360566660197943, "grad_norm": 1.5458892686752617, "learning_rate": 4.776104873775067e-05, "loss": 0.7151, "step": 1480 }, { "epoch": 0.1445759751601009, "grad_norm": 1.8892150029961168, "learning_rate": 4.7744824453241614e-05, "loss": 0.7025, "step": 1490 }, { "epoch": 0.1455462837182224, "grad_norm": 1.5206314987834826, "learning_rate": 4.7728600168732564e-05, "loss": 0.7748, "step": 1500 }, { "epoch": 0.14651659227634387, "grad_norm": 1.2234736643633124, "learning_rate": 4.771237588422351e-05, "loss": 0.7321, "step": 1510 }, { "epoch": 0.14748690083446536, "grad_norm": 1.3582944219822406, "learning_rate": 4.7696151599714456e-05, "loss": 0.7524, "step": 1520 }, { "epoch": 0.14845720939258683, "grad_norm": 1.3505468356868415, "learning_rate": 4.76799273152054e-05, "loss": 0.8032, "step": 1530 }, { "epoch": 0.14942751795070833, "grad_norm": 1.161565772908295, "learning_rate": 4.766370303069635e-05, "loss": 0.7449, "step": 1540 }, { "epoch": 0.1503978265088298, "grad_norm": 1.5018575379494306, "learning_rate": 4.76474787461873e-05, "loss": 0.779, "step": 1550 }, { "epoch": 0.1513681350669513, "grad_norm": 1.2744144878497465, "learning_rate": 4.763125446167824e-05, "loss": 0.7396, "step": 1560 }, { "epoch": 0.15233844362507276, "grad_norm": 1.2326350258332728, "learning_rate": 4.761503017716919e-05, "loss": 0.7249, "step": 1570 }, { "epoch": 0.15330875218319426, "grad_norm": 1.2982918445496714, "learning_rate": 4.7598805892660135e-05, "loss": 0.7453, "step": 1580 }, { "epoch": 0.15427906074131573, "grad_norm": 1.4792372825641715, "learning_rate": 4.7582581608151085e-05, "loss": 0.7573, "step": 1590 }, { "epoch": 0.15524936929943722, "grad_norm": 1.2935035055138504, "learning_rate": 4.756635732364203e-05, "loss": 0.7125, "step": 1600 }, { "epoch": 0.1562196778575587, "grad_norm": 1.2527694639896991, "learning_rate": 4.755013303913298e-05, "loss": 0.7406, "step": 1610 }, { "epoch": 0.1571899864156802, "grad_norm": 1.0724672098454868, "learning_rate": 4.753390875462392e-05, "loss": 0.7108, "step": 1620 }, { "epoch": 0.15816029497380166, "grad_norm": 1.1696947872465324, "learning_rate": 4.751768447011487e-05, "loss": 0.7636, "step": 1630 }, { "epoch": 0.15913060353192315, "grad_norm": 1.216857763890884, "learning_rate": 4.750146018560582e-05, "loss": 0.7916, "step": 1640 }, { "epoch": 0.16010091209004462, "grad_norm": 1.1371281502973842, "learning_rate": 4.748523590109676e-05, "loss": 0.7663, "step": 1650 }, { "epoch": 0.16107122064816612, "grad_norm": 1.2599433230373354, "learning_rate": 4.746901161658771e-05, "loss": 0.7279, "step": 1660 }, { "epoch": 0.1620415292062876, "grad_norm": 1.4580521921521419, "learning_rate": 4.7452787332078656e-05, "loss": 0.7241, "step": 1670 }, { "epoch": 0.16301183776440908, "grad_norm": 1.2227652156436173, "learning_rate": 4.7436563047569605e-05, "loss": 0.7082, "step": 1680 }, { "epoch": 0.16398214632253055, "grad_norm": 1.3578429437561153, "learning_rate": 4.742033876306055e-05, "loss": 0.6972, "step": 1690 }, { "epoch": 0.16495245488065205, "grad_norm": 1.2809095001474842, "learning_rate": 4.74041144785515e-05, "loss": 0.7267, "step": 1700 }, { "epoch": 0.16592276343877352, "grad_norm": 1.1017366555347645, "learning_rate": 4.738789019404245e-05, "loss": 0.7243, "step": 1710 }, { "epoch": 0.16689307199689501, "grad_norm": 1.2852463688715783, "learning_rate": 4.737166590953339e-05, "loss": 0.7276, "step": 1720 }, { "epoch": 0.16786338055501648, "grad_norm": 1.3099425645156408, "learning_rate": 4.735544162502434e-05, "loss": 0.7583, "step": 1730 }, { "epoch": 0.16883368911313798, "grad_norm": 1.5220953005112245, "learning_rate": 4.7339217340515284e-05, "loss": 0.7651, "step": 1740 }, { "epoch": 0.16980399767125945, "grad_norm": 1.3251019142596, "learning_rate": 4.7322993056006233e-05, "loss": 0.7428, "step": 1750 }, { "epoch": 0.17077430622938095, "grad_norm": 1.3275994643711895, "learning_rate": 4.7306768771497176e-05, "loss": 0.7552, "step": 1760 }, { "epoch": 0.17174461478750241, "grad_norm": 1.1363294732621385, "learning_rate": 4.7290544486988126e-05, "loss": 0.7441, "step": 1770 }, { "epoch": 0.1727149233456239, "grad_norm": 1.6604556501118164, "learning_rate": 4.727432020247907e-05, "loss": 0.7404, "step": 1780 }, { "epoch": 0.17368523190374538, "grad_norm": 1.4971063178125654, "learning_rate": 4.725809591797002e-05, "loss": 0.707, "step": 1790 }, { "epoch": 0.17465554046186688, "grad_norm": 1.296038406932857, "learning_rate": 4.724187163346097e-05, "loss": 0.7226, "step": 1800 }, { "epoch": 0.17562584901998834, "grad_norm": 0.9900558600646284, "learning_rate": 4.722564734895191e-05, "loss": 0.7107, "step": 1810 }, { "epoch": 0.17659615757810984, "grad_norm": 0.9589095684989648, "learning_rate": 4.720942306444286e-05, "loss": 0.6789, "step": 1820 }, { "epoch": 0.1775664661362313, "grad_norm": 1.300487413115222, "learning_rate": 4.7193198779933805e-05, "loss": 0.7656, "step": 1830 }, { "epoch": 0.1785367746943528, "grad_norm": 1.4678054839136885, "learning_rate": 4.7176974495424754e-05, "loss": 0.7559, "step": 1840 }, { "epoch": 0.17950708325247428, "grad_norm": 1.1487384302132937, "learning_rate": 4.71607502109157e-05, "loss": 0.7218, "step": 1850 }, { "epoch": 0.18047739181059577, "grad_norm": 1.5013237178369594, "learning_rate": 4.714452592640665e-05, "loss": 0.6618, "step": 1860 }, { "epoch": 0.18144770036871724, "grad_norm": 1.3544109774626758, "learning_rate": 4.712830164189759e-05, "loss": 0.7348, "step": 1870 }, { "epoch": 0.18241800892683874, "grad_norm": 1.7098997535129123, "learning_rate": 4.711207735738854e-05, "loss": 0.6721, "step": 1880 }, { "epoch": 0.1833883174849602, "grad_norm": 1.439742442692076, "learning_rate": 4.709585307287949e-05, "loss": 0.6823, "step": 1890 }, { "epoch": 0.1843586260430817, "grad_norm": 1.1240799236385792, "learning_rate": 4.707962878837043e-05, "loss": 0.7337, "step": 1900 }, { "epoch": 0.18532893460120317, "grad_norm": 1.611378134043144, "learning_rate": 4.706340450386138e-05, "loss": 0.7348, "step": 1910 }, { "epoch": 0.18629924315932467, "grad_norm": 1.15451740477432, "learning_rate": 4.7047180219352325e-05, "loss": 0.6945, "step": 1920 }, { "epoch": 0.18726955171744614, "grad_norm": 1.5743486470192276, "learning_rate": 4.7030955934843275e-05, "loss": 0.8062, "step": 1930 }, { "epoch": 0.18823986027556763, "grad_norm": 1.0248040084556684, "learning_rate": 4.701473165033422e-05, "loss": 0.7155, "step": 1940 }, { "epoch": 0.1892101688336891, "grad_norm": 1.0310414805904136, "learning_rate": 4.699850736582517e-05, "loss": 0.6776, "step": 1950 }, { "epoch": 0.1901804773918106, "grad_norm": 1.0785156193356158, "learning_rate": 4.698228308131611e-05, "loss": 0.7092, "step": 1960 }, { "epoch": 0.19115078594993207, "grad_norm": 1.4378941602091937, "learning_rate": 4.696605879680706e-05, "loss": 0.7286, "step": 1970 }, { "epoch": 0.19212109450805356, "grad_norm": 1.1595135486156671, "learning_rate": 4.694983451229801e-05, "loss": 0.7479, "step": 1980 }, { "epoch": 0.19309140306617503, "grad_norm": 1.69348514333408, "learning_rate": 4.6933610227788953e-05, "loss": 0.7263, "step": 1990 }, { "epoch": 0.19406171162429653, "grad_norm": 1.502953634133657, "learning_rate": 4.69173859432799e-05, "loss": 0.7655, "step": 2000 }, { "epoch": 0.19406171162429653, "eval_loss": 0.7587813138961792, "eval_runtime": 2477.8973, "eval_samples_per_second": 0.723, "eval_steps_per_second": 0.362, "step": 2000 }, { "epoch": 0.195032020182418, "grad_norm": 1.4070334559445785, "learning_rate": 4.6901161658770846e-05, "loss": 0.7475, "step": 2010 }, { "epoch": 0.1960023287405395, "grad_norm": 0.8673128363877267, "learning_rate": 4.6884937374261796e-05, "loss": 0.6747, "step": 2020 }, { "epoch": 0.19697263729866096, "grad_norm": 1.428849127809278, "learning_rate": 4.686871308975274e-05, "loss": 0.7666, "step": 2030 }, { "epoch": 0.19794294585678246, "grad_norm": 1.2749540514563555, "learning_rate": 4.685248880524369e-05, "loss": 0.7346, "step": 2040 }, { "epoch": 0.19891325441490393, "grad_norm": 1.2400883720583105, "learning_rate": 4.683626452073464e-05, "loss": 0.7012, "step": 2050 }, { "epoch": 0.19988356297302542, "grad_norm": 1.4035260703119048, "learning_rate": 4.682004023622559e-05, "loss": 0.6697, "step": 2060 }, { "epoch": 0.2008538715311469, "grad_norm": 1.9046063428505313, "learning_rate": 4.680381595171653e-05, "loss": 0.7126, "step": 2070 }, { "epoch": 0.2018241800892684, "grad_norm": 1.3281602578294986, "learning_rate": 4.678759166720748e-05, "loss": 0.6895, "step": 2080 }, { "epoch": 0.20279448864738986, "grad_norm": 1.271644663424638, "learning_rate": 4.677136738269843e-05, "loss": 0.7354, "step": 2090 }, { "epoch": 0.20376479720551136, "grad_norm": 1.2206696245686643, "learning_rate": 4.6755143098189374e-05, "loss": 0.7252, "step": 2100 }, { "epoch": 0.20473510576363282, "grad_norm": 1.3032653564716208, "learning_rate": 4.6738918813680324e-05, "loss": 0.683, "step": 2110 }, { "epoch": 0.20570541432175432, "grad_norm": 1.3212954807490243, "learning_rate": 4.6722694529171267e-05, "loss": 0.737, "step": 2120 }, { "epoch": 0.2066757228798758, "grad_norm": 1.2586128849417775, "learning_rate": 4.6706470244662216e-05, "loss": 0.6841, "step": 2130 }, { "epoch": 0.20764603143799729, "grad_norm": 1.3491929764497603, "learning_rate": 4.669024596015316e-05, "loss": 0.7637, "step": 2140 }, { "epoch": 0.20861633999611875, "grad_norm": 1.5081978528458062, "learning_rate": 4.667402167564411e-05, "loss": 0.713, "step": 2150 }, { "epoch": 0.20958664855424025, "grad_norm": 1.5788893940918114, "learning_rate": 4.665779739113505e-05, "loss": 0.6337, "step": 2160 }, { "epoch": 0.21055695711236172, "grad_norm": 1.422646038947752, "learning_rate": 4.6641573106626e-05, "loss": 0.6653, "step": 2170 }, { "epoch": 0.21152726567048322, "grad_norm": 1.3417475771584986, "learning_rate": 4.662534882211695e-05, "loss": 0.7246, "step": 2180 }, { "epoch": 0.21249757422860469, "grad_norm": 1.0925391942237144, "learning_rate": 4.6609124537607895e-05, "loss": 0.7228, "step": 2190 }, { "epoch": 0.21346788278672618, "grad_norm": 1.8433218311262853, "learning_rate": 4.6592900253098844e-05, "loss": 0.7507, "step": 2200 }, { "epoch": 0.21443819134484765, "grad_norm": 1.792431274692439, "learning_rate": 4.657667596858979e-05, "loss": 0.6949, "step": 2210 }, { "epoch": 0.21540849990296915, "grad_norm": 1.3809251069016177, "learning_rate": 4.656045168408074e-05, "loss": 0.7252, "step": 2220 }, { "epoch": 0.21637880846109062, "grad_norm": 1.3641658734062512, "learning_rate": 4.654422739957168e-05, "loss": 0.7518, "step": 2230 }, { "epoch": 0.2173491170192121, "grad_norm": 1.452315608698053, "learning_rate": 4.652800311506263e-05, "loss": 0.6684, "step": 2240 }, { "epoch": 0.21831942557733358, "grad_norm": 1.1444356316783801, "learning_rate": 4.651177883055357e-05, "loss": 0.6714, "step": 2250 }, { "epoch": 0.21928973413545508, "grad_norm": 1.2977358748240138, "learning_rate": 4.649555454604452e-05, "loss": 0.7129, "step": 2260 }, { "epoch": 0.22026004269357655, "grad_norm": 1.328329389241565, "learning_rate": 4.647933026153547e-05, "loss": 0.6292, "step": 2270 }, { "epoch": 0.22123035125169804, "grad_norm": 1.2461104595186587, "learning_rate": 4.6463105977026415e-05, "loss": 0.7372, "step": 2280 }, { "epoch": 0.2222006598098195, "grad_norm": 1.3008437677725404, "learning_rate": 4.6446881692517365e-05, "loss": 0.6503, "step": 2290 }, { "epoch": 0.223170968367941, "grad_norm": 1.3630765232741517, "learning_rate": 4.643065740800831e-05, "loss": 0.657, "step": 2300 }, { "epoch": 0.22414127692606248, "grad_norm": 0.9600325037717949, "learning_rate": 4.641443312349926e-05, "loss": 0.7117, "step": 2310 }, { "epoch": 0.22511158548418397, "grad_norm": 1.733755330685857, "learning_rate": 4.63982088389902e-05, "loss": 0.7594, "step": 2320 }, { "epoch": 0.22608189404230544, "grad_norm": 1.722193618002385, "learning_rate": 4.638198455448115e-05, "loss": 0.6555, "step": 2330 }, { "epoch": 0.22705220260042694, "grad_norm": 1.2647254702280388, "learning_rate": 4.63657602699721e-05, "loss": 0.6996, "step": 2340 }, { "epoch": 0.2280225111585484, "grad_norm": 1.2801225623311008, "learning_rate": 4.6349535985463043e-05, "loss": 0.7038, "step": 2350 }, { "epoch": 0.2289928197166699, "grad_norm": 1.214420395864136, "learning_rate": 4.633331170095399e-05, "loss": 0.6617, "step": 2360 }, { "epoch": 0.22996312827479137, "grad_norm": 1.1970250608654163, "learning_rate": 4.6317087416444936e-05, "loss": 0.7505, "step": 2370 }, { "epoch": 0.23093343683291287, "grad_norm": 1.318541272085749, "learning_rate": 4.6300863131935886e-05, "loss": 0.703, "step": 2380 }, { "epoch": 0.23190374539103434, "grad_norm": 1.0733620278703964, "learning_rate": 4.628463884742683e-05, "loss": 0.7076, "step": 2390 }, { "epoch": 0.23287405394915583, "grad_norm": 1.320066160024292, "learning_rate": 4.626841456291778e-05, "loss": 0.6845, "step": 2400 }, { "epoch": 0.2338443625072773, "grad_norm": 1.3916335472579557, "learning_rate": 4.625219027840872e-05, "loss": 0.7213, "step": 2410 }, { "epoch": 0.2348146710653988, "grad_norm": 1.6506684734262902, "learning_rate": 4.623596599389967e-05, "loss": 0.6995, "step": 2420 }, { "epoch": 0.23578497962352027, "grad_norm": 0.8976988021024955, "learning_rate": 4.621974170939062e-05, "loss": 0.6994, "step": 2430 }, { "epoch": 0.23675528818164177, "grad_norm": 1.7783475330175254, "learning_rate": 4.6203517424881564e-05, "loss": 0.7475, "step": 2440 }, { "epoch": 0.23772559673976323, "grad_norm": 1.3329956259541478, "learning_rate": 4.6187293140372514e-05, "loss": 0.6991, "step": 2450 }, { "epoch": 0.23869590529788473, "grad_norm": 1.3282617232306233, "learning_rate": 4.617106885586346e-05, "loss": 0.7328, "step": 2460 }, { "epoch": 0.2396662138560062, "grad_norm": 1.1394548446385124, "learning_rate": 4.615484457135441e-05, "loss": 0.7092, "step": 2470 }, { "epoch": 0.2406365224141277, "grad_norm": 1.3304671581253036, "learning_rate": 4.613862028684535e-05, "loss": 0.6765, "step": 2480 }, { "epoch": 0.24160683097224916, "grad_norm": 1.5514738877871737, "learning_rate": 4.61223960023363e-05, "loss": 0.6558, "step": 2490 }, { "epoch": 0.24257713953037066, "grad_norm": 1.3668799500092241, "learning_rate": 4.610617171782724e-05, "loss": 0.7106, "step": 2500 }, { "epoch": 0.24257713953037066, "eval_loss": 0.7454198598861694, "eval_runtime": 2468.3109, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 2500 }, { "epoch": 0.24354744808849213, "grad_norm": 1.27051542074557, "learning_rate": 4.608994743331819e-05, "loss": 0.7037, "step": 2510 }, { "epoch": 0.24451775664661363, "grad_norm": 1.4333699367072212, "learning_rate": 4.607372314880914e-05, "loss": 0.7054, "step": 2520 }, { "epoch": 0.2454880652047351, "grad_norm": 1.3383562796673616, "learning_rate": 4.6057498864300085e-05, "loss": 0.7674, "step": 2530 }, { "epoch": 0.2464583737628566, "grad_norm": 1.5405408532446832, "learning_rate": 4.6041274579791035e-05, "loss": 0.7541, "step": 2540 }, { "epoch": 0.24742868232097806, "grad_norm": 1.4592658544748531, "learning_rate": 4.602505029528198e-05, "loss": 0.7286, "step": 2550 }, { "epoch": 0.24839899087909956, "grad_norm": 1.4060719508768202, "learning_rate": 4.600882601077293e-05, "loss": 0.7722, "step": 2560 }, { "epoch": 0.24936929943722103, "grad_norm": 1.2688352607364943, "learning_rate": 4.599260172626387e-05, "loss": 0.6851, "step": 2570 }, { "epoch": 0.2503396079953425, "grad_norm": 1.1798367463314652, "learning_rate": 4.597637744175482e-05, "loss": 0.6897, "step": 2580 }, { "epoch": 0.251309916553464, "grad_norm": 1.4210256244438406, "learning_rate": 4.5960153157245763e-05, "loss": 0.7005, "step": 2590 }, { "epoch": 0.2522802251115855, "grad_norm": 1.3304574917634664, "learning_rate": 4.594392887273671e-05, "loss": 0.6878, "step": 2600 }, { "epoch": 0.25325053366970696, "grad_norm": 1.4281359203156077, "learning_rate": 4.592770458822766e-05, "loss": 0.7519, "step": 2610 }, { "epoch": 0.2542208422278284, "grad_norm": 1.351420571440429, "learning_rate": 4.5911480303718606e-05, "loss": 0.6939, "step": 2620 }, { "epoch": 0.25519115078594995, "grad_norm": 1.3618970623647955, "learning_rate": 4.5895256019209556e-05, "loss": 0.669, "step": 2630 }, { "epoch": 0.2561614593440714, "grad_norm": 1.1008259683717303, "learning_rate": 4.58790317347005e-05, "loss": 0.6331, "step": 2640 }, { "epoch": 0.2571317679021929, "grad_norm": 1.690823489066079, "learning_rate": 4.586280745019145e-05, "loss": 0.6571, "step": 2650 }, { "epoch": 0.25810207646031436, "grad_norm": 1.2328191346656623, "learning_rate": 4.584658316568239e-05, "loss": 0.6712, "step": 2660 }, { "epoch": 0.2590723850184359, "grad_norm": 1.1997509506925832, "learning_rate": 4.583035888117334e-05, "loss": 0.6998, "step": 2670 }, { "epoch": 0.26004269357655735, "grad_norm": 1.3726212075390893, "learning_rate": 4.5814134596664284e-05, "loss": 0.6577, "step": 2680 }, { "epoch": 0.2610130021346788, "grad_norm": 1.4778299478224584, "learning_rate": 4.5797910312155234e-05, "loss": 0.7213, "step": 2690 }, { "epoch": 0.2619833106928003, "grad_norm": 1.2065482843241282, "learning_rate": 4.5781686027646184e-05, "loss": 0.6504, "step": 2700 }, { "epoch": 0.2629536192509218, "grad_norm": 1.6950271620933635, "learning_rate": 4.576546174313713e-05, "loss": 0.6938, "step": 2710 }, { "epoch": 0.2639239278090433, "grad_norm": 1.049429219350124, "learning_rate": 4.5749237458628077e-05, "loss": 0.7235, "step": 2720 }, { "epoch": 0.26489423636716475, "grad_norm": 1.2856525880832654, "learning_rate": 4.573301317411902e-05, "loss": 0.769, "step": 2730 }, { "epoch": 0.2658645449252862, "grad_norm": 1.1413973985217811, "learning_rate": 4.571678888960997e-05, "loss": 0.6542, "step": 2740 }, { "epoch": 0.26683485348340774, "grad_norm": 1.2963652895204112, "learning_rate": 4.570056460510091e-05, "loss": 0.6988, "step": 2750 }, { "epoch": 0.2678051620415292, "grad_norm": 1.200192916715494, "learning_rate": 4.568434032059186e-05, "loss": 0.662, "step": 2760 }, { "epoch": 0.2687754705996507, "grad_norm": 1.2418786623251215, "learning_rate": 4.566811603608281e-05, "loss": 0.6615, "step": 2770 }, { "epoch": 0.26974577915777215, "grad_norm": 1.2920018198618357, "learning_rate": 4.5651891751573755e-05, "loss": 0.683, "step": 2780 }, { "epoch": 0.27071608771589367, "grad_norm": 1.721771869410589, "learning_rate": 4.5635667467064705e-05, "loss": 0.6742, "step": 2790 }, { "epoch": 0.27168639627401514, "grad_norm": 1.595428896858632, "learning_rate": 4.561944318255565e-05, "loss": 0.6961, "step": 2800 }, { "epoch": 0.2726567048321366, "grad_norm": 1.5674989639835977, "learning_rate": 4.56032188980466e-05, "loss": 0.6246, "step": 2810 }, { "epoch": 0.2736270133902581, "grad_norm": 1.333761320008167, "learning_rate": 4.558699461353754e-05, "loss": 0.7142, "step": 2820 }, { "epoch": 0.2745973219483796, "grad_norm": 1.175942959739195, "learning_rate": 4.557077032902849e-05, "loss": 0.7335, "step": 2830 }, { "epoch": 0.27556763050650107, "grad_norm": 1.0433896859463463, "learning_rate": 4.555454604451944e-05, "loss": 0.7523, "step": 2840 }, { "epoch": 0.27653793906462254, "grad_norm": 1.4484346047275096, "learning_rate": 4.553832176001039e-05, "loss": 0.6576, "step": 2850 }, { "epoch": 0.277508247622744, "grad_norm": 1.922226387784083, "learning_rate": 4.552209747550133e-05, "loss": 0.6797, "step": 2860 }, { "epoch": 0.27847855618086553, "grad_norm": 1.4433287579337053, "learning_rate": 4.550587319099228e-05, "loss": 0.6787, "step": 2870 }, { "epoch": 0.279448864738987, "grad_norm": 1.4997180997318538, "learning_rate": 4.5489648906483225e-05, "loss": 0.7091, "step": 2880 }, { "epoch": 0.28041917329710847, "grad_norm": 1.347204680869145, "learning_rate": 4.5473424621974175e-05, "loss": 0.725, "step": 2890 }, { "epoch": 0.28138948185522994, "grad_norm": 1.3046706448190106, "learning_rate": 4.5457200337465125e-05, "loss": 0.6669, "step": 2900 }, { "epoch": 0.28235979041335146, "grad_norm": 1.1467739204887912, "learning_rate": 4.544097605295607e-05, "loss": 0.756, "step": 2910 }, { "epoch": 0.28333009897147293, "grad_norm": 1.162771827035537, "learning_rate": 4.542475176844702e-05, "loss": 0.7147, "step": 2920 }, { "epoch": 0.2843004075295944, "grad_norm": 1.6563795609214405, "learning_rate": 4.540852748393796e-05, "loss": 0.6845, "step": 2930 }, { "epoch": 0.28527071608771587, "grad_norm": 1.6728193237645246, "learning_rate": 4.539230319942891e-05, "loss": 0.673, "step": 2940 }, { "epoch": 0.2862410246458374, "grad_norm": 1.194612705131068, "learning_rate": 4.5376078914919854e-05, "loss": 0.6942, "step": 2950 }, { "epoch": 0.28721133320395886, "grad_norm": 1.439958876835649, "learning_rate": 4.53598546304108e-05, "loss": 0.6809, "step": 2960 }, { "epoch": 0.28818164176208033, "grad_norm": 1.5873335965261735, "learning_rate": 4.534363034590175e-05, "loss": 0.6877, "step": 2970 }, { "epoch": 0.2891519503202018, "grad_norm": 1.0997513483559662, "learning_rate": 4.5327406061392696e-05, "loss": 0.665, "step": 2980 }, { "epoch": 0.2901222588783233, "grad_norm": 1.3952744081716912, "learning_rate": 4.5311181776883646e-05, "loss": 0.7524, "step": 2990 }, { "epoch": 0.2910925674364448, "grad_norm": 1.1613111987177211, "learning_rate": 4.529495749237459e-05, "loss": 0.6659, "step": 3000 }, { "epoch": 0.2910925674364448, "eval_loss": 0.7351760268211365, "eval_runtime": 2466.7648, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 3000 }, { "epoch": 0.29206287599456626, "grad_norm": 1.404418707672653, "learning_rate": 4.527873320786554e-05, "loss": 0.6635, "step": 3010 }, { "epoch": 0.29303318455268773, "grad_norm": 1.7333713648105467, "learning_rate": 4.526250892335648e-05, "loss": 0.6918, "step": 3020 }, { "epoch": 0.29400349311080926, "grad_norm": 1.325947337933012, "learning_rate": 4.524628463884743e-05, "loss": 0.708, "step": 3030 }, { "epoch": 0.2949738016689307, "grad_norm": 1.420774851884038, "learning_rate": 4.5230060354338374e-05, "loss": 0.7173, "step": 3040 }, { "epoch": 0.2959441102270522, "grad_norm": 1.1753762610267322, "learning_rate": 4.5213836069829324e-05, "loss": 0.6654, "step": 3050 }, { "epoch": 0.29691441878517366, "grad_norm": 1.526767780243654, "learning_rate": 4.5197611785320274e-05, "loss": 0.6321, "step": 3060 }, { "epoch": 0.2978847273432952, "grad_norm": 1.8280158534376985, "learning_rate": 4.518138750081122e-05, "loss": 0.6647, "step": 3070 }, { "epoch": 0.29885503590141665, "grad_norm": 1.4181228966831005, "learning_rate": 4.516516321630217e-05, "loss": 0.7338, "step": 3080 }, { "epoch": 0.2998253444595381, "grad_norm": 1.7077549961994072, "learning_rate": 4.514893893179311e-05, "loss": 0.6272, "step": 3090 }, { "epoch": 0.3007956530176596, "grad_norm": 1.518857601222952, "learning_rate": 4.513271464728406e-05, "loss": 0.6184, "step": 3100 }, { "epoch": 0.3017659615757811, "grad_norm": 1.5688964933772704, "learning_rate": 4.5116490362775e-05, "loss": 0.6904, "step": 3110 }, { "epoch": 0.3027362701339026, "grad_norm": 1.346013000056015, "learning_rate": 4.510026607826595e-05, "loss": 0.6591, "step": 3120 }, { "epoch": 0.30370657869202405, "grad_norm": 1.7289926750048026, "learning_rate": 4.5084041793756895e-05, "loss": 0.602, "step": 3130 }, { "epoch": 0.3046768872501455, "grad_norm": 1.634884511910698, "learning_rate": 4.5067817509247845e-05, "loss": 0.6344, "step": 3140 }, { "epoch": 0.30564719580826705, "grad_norm": 1.2072013322253554, "learning_rate": 4.5051593224738795e-05, "loss": 0.6441, "step": 3150 }, { "epoch": 0.3066175043663885, "grad_norm": 1.4877523069726029, "learning_rate": 4.503536894022974e-05, "loss": 0.6711, "step": 3160 }, { "epoch": 0.30758781292451, "grad_norm": 1.3820572558751547, "learning_rate": 4.501914465572069e-05, "loss": 0.7005, "step": 3170 }, { "epoch": 0.30855812148263145, "grad_norm": 1.4545570501775118, "learning_rate": 4.500292037121163e-05, "loss": 0.6796, "step": 3180 }, { "epoch": 0.309528430040753, "grad_norm": 1.4415846087886384, "learning_rate": 4.498669608670258e-05, "loss": 0.6753, "step": 3190 }, { "epoch": 0.31049873859887445, "grad_norm": 1.253477394104618, "learning_rate": 4.497047180219352e-05, "loss": 0.745, "step": 3200 }, { "epoch": 0.3114690471569959, "grad_norm": 1.6938495814472803, "learning_rate": 4.495424751768447e-05, "loss": 0.6656, "step": 3210 }, { "epoch": 0.3124393557151174, "grad_norm": 1.7041386856543572, "learning_rate": 4.4938023233175416e-05, "loss": 0.678, "step": 3220 }, { "epoch": 0.3134096642732389, "grad_norm": 1.945535890749437, "learning_rate": 4.4921798948666366e-05, "loss": 0.6741, "step": 3230 }, { "epoch": 0.3143799728313604, "grad_norm": 1.504845552046309, "learning_rate": 4.4905574664157316e-05, "loss": 0.7055, "step": 3240 }, { "epoch": 0.31535028138948185, "grad_norm": 1.5218102037928898, "learning_rate": 4.488935037964826e-05, "loss": 0.6664, "step": 3250 }, { "epoch": 0.3163205899476033, "grad_norm": 1.281958434229701, "learning_rate": 4.487312609513921e-05, "loss": 0.689, "step": 3260 }, { "epoch": 0.31729089850572484, "grad_norm": 1.4017870385811553, "learning_rate": 4.485690181063015e-05, "loss": 0.656, "step": 3270 }, { "epoch": 0.3182612070638463, "grad_norm": 1.1312164184452325, "learning_rate": 4.48406775261211e-05, "loss": 0.6956, "step": 3280 }, { "epoch": 0.3192315156219678, "grad_norm": 1.5553483087810633, "learning_rate": 4.4824453241612044e-05, "loss": 0.627, "step": 3290 }, { "epoch": 0.32020182418008925, "grad_norm": 1.5414421667820604, "learning_rate": 4.4808228957102994e-05, "loss": 0.6567, "step": 3300 }, { "epoch": 0.32117213273821077, "grad_norm": 1.3882171728699897, "learning_rate": 4.479200467259394e-05, "loss": 0.6478, "step": 3310 }, { "epoch": 0.32214244129633224, "grad_norm": 1.2870563704453035, "learning_rate": 4.477578038808489e-05, "loss": 0.6258, "step": 3320 }, { "epoch": 0.3231127498544537, "grad_norm": 1.2176939882656843, "learning_rate": 4.4759556103575836e-05, "loss": 0.6831, "step": 3330 }, { "epoch": 0.3240830584125752, "grad_norm": 1.5859704946797195, "learning_rate": 4.474333181906678e-05, "loss": 0.6875, "step": 3340 }, { "epoch": 0.3250533669706967, "grad_norm": 1.3048860685179964, "learning_rate": 4.472710753455773e-05, "loss": 0.6797, "step": 3350 }, { "epoch": 0.32602367552881817, "grad_norm": 1.4244729025826581, "learning_rate": 4.471088325004867e-05, "loss": 0.6671, "step": 3360 }, { "epoch": 0.32699398408693964, "grad_norm": 1.664993378309651, "learning_rate": 4.469465896553962e-05, "loss": 0.6946, "step": 3370 }, { "epoch": 0.3279642926450611, "grad_norm": 1.4374586076882605, "learning_rate": 4.4678434681030565e-05, "loss": 0.7152, "step": 3380 }, { "epoch": 0.32893460120318263, "grad_norm": 1.183729655148727, "learning_rate": 4.4662210396521515e-05, "loss": 0.6695, "step": 3390 }, { "epoch": 0.3299049097613041, "grad_norm": 1.3106066103858482, "learning_rate": 4.4645986112012464e-05, "loss": 0.681, "step": 3400 }, { "epoch": 0.33087521831942557, "grad_norm": 1.9834642707734547, "learning_rate": 4.462976182750341e-05, "loss": 0.6544, "step": 3410 }, { "epoch": 0.33184552687754704, "grad_norm": 1.2898353030549436, "learning_rate": 4.461353754299436e-05, "loss": 0.6516, "step": 3420 }, { "epoch": 0.33281583543566856, "grad_norm": 1.3210516235415775, "learning_rate": 4.45973132584853e-05, "loss": 0.6684, "step": 3430 }, { "epoch": 0.33378614399379003, "grad_norm": 1.4442553376708276, "learning_rate": 4.458108897397625e-05, "loss": 0.6673, "step": 3440 }, { "epoch": 0.3347564525519115, "grad_norm": 1.1500209299804536, "learning_rate": 4.456486468946719e-05, "loss": 0.7158, "step": 3450 }, { "epoch": 0.33572676111003297, "grad_norm": 1.31538694061074, "learning_rate": 4.454864040495814e-05, "loss": 0.647, "step": 3460 }, { "epoch": 0.3366970696681545, "grad_norm": 1.4081971729681526, "learning_rate": 4.4532416120449086e-05, "loss": 0.6964, "step": 3470 }, { "epoch": 0.33766737822627596, "grad_norm": 1.4794888005976117, "learning_rate": 4.4516191835940036e-05, "loss": 0.6644, "step": 3480 }, { "epoch": 0.33863768678439743, "grad_norm": 1.3524205715043236, "learning_rate": 4.4499967551430985e-05, "loss": 0.6421, "step": 3490 }, { "epoch": 0.3396079953425189, "grad_norm": 1.5105858350763188, "learning_rate": 4.448374326692193e-05, "loss": 0.7105, "step": 3500 }, { "epoch": 0.3396079953425189, "eval_loss": 0.7246462106704712, "eval_runtime": 2471.0932, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 3500 }, { "epoch": 0.3405783039006404, "grad_norm": 1.4583137688880468, "learning_rate": 4.446751898241288e-05, "loss": 0.6087, "step": 3510 }, { "epoch": 0.3415486124587619, "grad_norm": 1.4130179418528255, "learning_rate": 4.445129469790382e-05, "loss": 0.6628, "step": 3520 }, { "epoch": 0.34251892101688336, "grad_norm": 1.3225389415201874, "learning_rate": 4.443507041339477e-05, "loss": 0.625, "step": 3530 }, { "epoch": 0.34348922957500483, "grad_norm": 1.8587881703474012, "learning_rate": 4.4418846128885714e-05, "loss": 0.6416, "step": 3540 }, { "epoch": 0.34445953813312635, "grad_norm": 1.2742394540775415, "learning_rate": 4.4402621844376664e-05, "loss": 0.632, "step": 3550 }, { "epoch": 0.3454298466912478, "grad_norm": 1.64437983962982, "learning_rate": 4.4386397559867607e-05, "loss": 0.6383, "step": 3560 }, { "epoch": 0.3464001552493693, "grad_norm": 1.3231966805125737, "learning_rate": 4.4370173275358556e-05, "loss": 0.6841, "step": 3570 }, { "epoch": 0.34737046380749076, "grad_norm": 1.167387762307211, "learning_rate": 4.4353948990849506e-05, "loss": 0.616, "step": 3580 }, { "epoch": 0.3483407723656123, "grad_norm": 1.4336867980215322, "learning_rate": 4.433772470634045e-05, "loss": 0.6195, "step": 3590 }, { "epoch": 0.34931108092373375, "grad_norm": 1.403759520909343, "learning_rate": 4.43215004218314e-05, "loss": 0.6787, "step": 3600 }, { "epoch": 0.3502813894818552, "grad_norm": 1.4695150471417817, "learning_rate": 4.430527613732234e-05, "loss": 0.6433, "step": 3610 }, { "epoch": 0.3512516980399767, "grad_norm": 1.3322730792588582, "learning_rate": 4.42890518528133e-05, "loss": 0.6827, "step": 3620 }, { "epoch": 0.3522220065980982, "grad_norm": 1.2541946759332971, "learning_rate": 4.427282756830424e-05, "loss": 0.6767, "step": 3630 }, { "epoch": 0.3531923151562197, "grad_norm": 1.3220529304919946, "learning_rate": 4.425660328379519e-05, "loss": 0.6785, "step": 3640 }, { "epoch": 0.35416262371434115, "grad_norm": 1.8159199916167459, "learning_rate": 4.4240378999286134e-05, "loss": 0.6504, "step": 3650 }, { "epoch": 0.3551329322724626, "grad_norm": 1.5513977697025123, "learning_rate": 4.4224154714777084e-05, "loss": 0.668, "step": 3660 }, { "epoch": 0.35610324083058414, "grad_norm": 1.9383083203795937, "learning_rate": 4.420793043026803e-05, "loss": 0.6314, "step": 3670 }, { "epoch": 0.3570735493887056, "grad_norm": 1.5611559659864904, "learning_rate": 4.419170614575898e-05, "loss": 0.6617, "step": 3680 }, { "epoch": 0.3580438579468271, "grad_norm": 1.2579691148448051, "learning_rate": 4.4175481861249927e-05, "loss": 0.6641, "step": 3690 }, { "epoch": 0.35901416650494855, "grad_norm": 1.8250727183362423, "learning_rate": 4.415925757674087e-05, "loss": 0.628, "step": 3700 }, { "epoch": 0.3599844750630701, "grad_norm": 1.8663848290613063, "learning_rate": 4.414303329223182e-05, "loss": 0.6819, "step": 3710 }, { "epoch": 0.36095478362119154, "grad_norm": 0.998691604867035, "learning_rate": 4.412680900772276e-05, "loss": 0.6603, "step": 3720 }, { "epoch": 0.361925092179313, "grad_norm": 1.5186010030355703, "learning_rate": 4.411058472321371e-05, "loss": 0.6608, "step": 3730 }, { "epoch": 0.3628954007374345, "grad_norm": 1.3444105870233574, "learning_rate": 4.4094360438704655e-05, "loss": 0.6684, "step": 3740 }, { "epoch": 0.363865709295556, "grad_norm": 1.8433609494425311, "learning_rate": 4.4078136154195605e-05, "loss": 0.7007, "step": 3750 }, { "epoch": 0.3648360178536775, "grad_norm": 1.3736456316572478, "learning_rate": 4.406191186968655e-05, "loss": 0.6691, "step": 3760 }, { "epoch": 0.36580632641179894, "grad_norm": 1.3845972262001316, "learning_rate": 4.40456875851775e-05, "loss": 0.699, "step": 3770 }, { "epoch": 0.3667766349699204, "grad_norm": 1.3202439744871353, "learning_rate": 4.402946330066845e-05, "loss": 0.6463, "step": 3780 }, { "epoch": 0.36774694352804194, "grad_norm": 1.527206134833558, "learning_rate": 4.401323901615939e-05, "loss": 0.6261, "step": 3790 }, { "epoch": 0.3687172520861634, "grad_norm": 1.85165256113485, "learning_rate": 4.399701473165034e-05, "loss": 0.6268, "step": 3800 }, { "epoch": 0.3696875606442849, "grad_norm": 1.7774545505998887, "learning_rate": 4.398079044714128e-05, "loss": 0.674, "step": 3810 }, { "epoch": 0.37065786920240634, "grad_norm": 1.1302938960409563, "learning_rate": 4.396456616263223e-05, "loss": 0.6814, "step": 3820 }, { "epoch": 0.37162817776052787, "grad_norm": 1.5633030889510193, "learning_rate": 4.3948341878123176e-05, "loss": 0.6821, "step": 3830 }, { "epoch": 0.37259848631864934, "grad_norm": 1.4226623590722947, "learning_rate": 4.3932117593614126e-05, "loss": 0.6721, "step": 3840 }, { "epoch": 0.3735687948767708, "grad_norm": 1.3788997600836588, "learning_rate": 4.391589330910507e-05, "loss": 0.6484, "step": 3850 }, { "epoch": 0.3745391034348923, "grad_norm": 1.9387617060516118, "learning_rate": 4.389966902459602e-05, "loss": 0.7059, "step": 3860 }, { "epoch": 0.3755094119930138, "grad_norm": 1.4159508230128068, "learning_rate": 4.388344474008697e-05, "loss": 0.6763, "step": 3870 }, { "epoch": 0.37647972055113527, "grad_norm": 1.4395070409428377, "learning_rate": 4.386722045557791e-05, "loss": 0.6441, "step": 3880 }, { "epoch": 0.37745002910925674, "grad_norm": 1.5259821305785304, "learning_rate": 4.385099617106886e-05, "loss": 0.7342, "step": 3890 }, { "epoch": 0.3784203376673782, "grad_norm": 1.58232361711394, "learning_rate": 4.3834771886559804e-05, "loss": 0.7058, "step": 3900 }, { "epoch": 0.37939064622549973, "grad_norm": 1.2758386103028025, "learning_rate": 4.3818547602050754e-05, "loss": 0.6116, "step": 3910 }, { "epoch": 0.3803609547836212, "grad_norm": 1.354723087565567, "learning_rate": 4.38023233175417e-05, "loss": 0.6709, "step": 3920 }, { "epoch": 0.38133126334174267, "grad_norm": 1.5790529961766175, "learning_rate": 4.3786099033032646e-05, "loss": 0.7028, "step": 3930 }, { "epoch": 0.38230157189986413, "grad_norm": 1.9609521386475108, "learning_rate": 4.376987474852359e-05, "loss": 0.5906, "step": 3940 }, { "epoch": 0.38327188045798566, "grad_norm": 1.3349118770454718, "learning_rate": 4.375365046401454e-05, "loss": 0.6623, "step": 3950 }, { "epoch": 0.3842421890161071, "grad_norm": 1.2937167531350466, "learning_rate": 4.373742617950549e-05, "loss": 0.6369, "step": 3960 }, { "epoch": 0.3852124975742286, "grad_norm": 1.3419705920519438, "learning_rate": 4.372120189499643e-05, "loss": 0.7277, "step": 3970 }, { "epoch": 0.38618280613235006, "grad_norm": 1.7073814830995901, "learning_rate": 4.370497761048738e-05, "loss": 0.6241, "step": 3980 }, { "epoch": 0.3871531146904716, "grad_norm": 1.373271354057727, "learning_rate": 4.3688753325978325e-05, "loss": 0.6481, "step": 3990 }, { "epoch": 0.38812342324859306, "grad_norm": 1.7949929197599008, "learning_rate": 4.3672529041469275e-05, "loss": 0.6125, "step": 4000 }, { "epoch": 0.38812342324859306, "eval_loss": 0.7137264609336853, "eval_runtime": 2466.144, "eval_samples_per_second": 0.727, "eval_steps_per_second": 0.363, "step": 4000 }, { "epoch": 0.3890937318067145, "grad_norm": 0.8329579782452979, "learning_rate": 4.365630475696022e-05, "loss": 0.6341, "step": 4010 }, { "epoch": 0.390064040364836, "grad_norm": 1.2617089929475087, "learning_rate": 4.364008047245117e-05, "loss": 0.671, "step": 4020 }, { "epoch": 0.3910343489229575, "grad_norm": 1.5577127482655397, "learning_rate": 4.362385618794212e-05, "loss": 0.6793, "step": 4030 }, { "epoch": 0.392004657481079, "grad_norm": 1.6386238000694935, "learning_rate": 4.360763190343306e-05, "loss": 0.7014, "step": 4040 }, { "epoch": 0.39297496603920046, "grad_norm": 1.5648938337659175, "learning_rate": 4.359140761892401e-05, "loss": 0.6867, "step": 4050 }, { "epoch": 0.3939452745973219, "grad_norm": 1.6294675174253543, "learning_rate": 4.357518333441495e-05, "loss": 0.6706, "step": 4060 }, { "epoch": 0.39491558315544345, "grad_norm": 1.549167727126926, "learning_rate": 4.35589590499059e-05, "loss": 0.6536, "step": 4070 }, { "epoch": 0.3958858917135649, "grad_norm": 1.4182199112027882, "learning_rate": 4.3542734765396846e-05, "loss": 0.679, "step": 4080 }, { "epoch": 0.3968562002716864, "grad_norm": 1.2821446945633657, "learning_rate": 4.3526510480887795e-05, "loss": 0.6643, "step": 4090 }, { "epoch": 0.39782650882980786, "grad_norm": 1.6858376405816184, "learning_rate": 4.351028619637874e-05, "loss": 0.6394, "step": 4100 }, { "epoch": 0.3987968173879294, "grad_norm": 1.4610732414303427, "learning_rate": 4.349406191186969e-05, "loss": 0.7123, "step": 4110 }, { "epoch": 0.39976712594605085, "grad_norm": 1.504769761142886, "learning_rate": 4.347783762736064e-05, "loss": 0.6071, "step": 4120 }, { "epoch": 0.4007374345041723, "grad_norm": 1.6912171907525997, "learning_rate": 4.346161334285158e-05, "loss": 0.5923, "step": 4130 }, { "epoch": 0.4017077430622938, "grad_norm": 1.5783940240743402, "learning_rate": 4.344538905834253e-05, "loss": 0.5902, "step": 4140 }, { "epoch": 0.4026780516204153, "grad_norm": 1.300258602942144, "learning_rate": 4.3429164773833474e-05, "loss": 0.668, "step": 4150 }, { "epoch": 0.4036483601785368, "grad_norm": 1.3879301457682354, "learning_rate": 4.3412940489324423e-05, "loss": 0.6578, "step": 4160 }, { "epoch": 0.40461866873665825, "grad_norm": 1.0418247331643762, "learning_rate": 4.3396716204815366e-05, "loss": 0.6539, "step": 4170 }, { "epoch": 0.4055889772947797, "grad_norm": 1.817053798243336, "learning_rate": 4.3380491920306316e-05, "loss": 0.6296, "step": 4180 }, { "epoch": 0.40655928585290124, "grad_norm": 1.8784845030864437, "learning_rate": 4.336426763579726e-05, "loss": 0.6435, "step": 4190 }, { "epoch": 0.4075295944110227, "grad_norm": 1.3173177735824406, "learning_rate": 4.334804335128821e-05, "loss": 0.6467, "step": 4200 }, { "epoch": 0.4084999029691442, "grad_norm": 1.6609920860725593, "learning_rate": 4.333181906677916e-05, "loss": 0.6506, "step": 4210 }, { "epoch": 0.40947021152726565, "grad_norm": 1.3422418231507067, "learning_rate": 4.33155947822701e-05, "loss": 0.6794, "step": 4220 }, { "epoch": 0.4104405200853872, "grad_norm": 1.6530218294736676, "learning_rate": 4.329937049776105e-05, "loss": 0.6458, "step": 4230 }, { "epoch": 0.41141082864350864, "grad_norm": 1.7371351925998049, "learning_rate": 4.3283146213251995e-05, "loss": 0.6085, "step": 4240 }, { "epoch": 0.4123811372016301, "grad_norm": 1.1986220099049816, "learning_rate": 4.3266921928742944e-05, "loss": 0.6763, "step": 4250 }, { "epoch": 0.4133514457597516, "grad_norm": 1.4910668696454408, "learning_rate": 4.325069764423389e-05, "loss": 0.6943, "step": 4260 }, { "epoch": 0.4143217543178731, "grad_norm": 1.605222234480949, "learning_rate": 4.323447335972484e-05, "loss": 0.5683, "step": 4270 }, { "epoch": 0.41529206287599457, "grad_norm": 1.6517210814862113, "learning_rate": 4.321824907521578e-05, "loss": 0.6121, "step": 4280 }, { "epoch": 0.41626237143411604, "grad_norm": 1.5780093620130797, "learning_rate": 4.320202479070673e-05, "loss": 0.634, "step": 4290 }, { "epoch": 0.4172326799922375, "grad_norm": 1.4948523984666717, "learning_rate": 4.318580050619768e-05, "loss": 0.6736, "step": 4300 }, { "epoch": 0.41820298855035903, "grad_norm": 1.412186726690487, "learning_rate": 4.316957622168862e-05, "loss": 0.6645, "step": 4310 }, { "epoch": 0.4191732971084805, "grad_norm": 1.3014470286002153, "learning_rate": 4.315335193717957e-05, "loss": 0.611, "step": 4320 }, { "epoch": 0.42014360566660197, "grad_norm": 1.5892208566989257, "learning_rate": 4.3137127652670515e-05, "loss": 0.6545, "step": 4330 }, { "epoch": 0.42111391422472344, "grad_norm": 1.912773548887141, "learning_rate": 4.3120903368161465e-05, "loss": 0.6454, "step": 4340 }, { "epoch": 0.42208422278284496, "grad_norm": 1.7286922382345113, "learning_rate": 4.310467908365241e-05, "loss": 0.6591, "step": 4350 }, { "epoch": 0.42305453134096643, "grad_norm": 1.8839412216123284, "learning_rate": 4.308845479914336e-05, "loss": 0.5875, "step": 4360 }, { "epoch": 0.4240248398990879, "grad_norm": 1.433166987175659, "learning_rate": 4.30722305146343e-05, "loss": 0.6405, "step": 4370 }, { "epoch": 0.42499514845720937, "grad_norm": 1.493313820047122, "learning_rate": 4.305600623012525e-05, "loss": 0.6421, "step": 4380 }, { "epoch": 0.4259654570153309, "grad_norm": 1.470156638554737, "learning_rate": 4.30397819456162e-05, "loss": 0.6212, "step": 4390 }, { "epoch": 0.42693576557345236, "grad_norm": 1.801577389486602, "learning_rate": 4.3023557661107143e-05, "loss": 0.589, "step": 4400 }, { "epoch": 0.42790607413157383, "grad_norm": 1.6174671178388904, "learning_rate": 4.30073333765981e-05, "loss": 0.6612, "step": 4410 }, { "epoch": 0.4288763826896953, "grad_norm": 1.5169174455759806, "learning_rate": 4.299110909208904e-05, "loss": 0.6418, "step": 4420 }, { "epoch": 0.4298466912478168, "grad_norm": 1.7159303308076814, "learning_rate": 4.297488480757999e-05, "loss": 0.6363, "step": 4430 }, { "epoch": 0.4308169998059383, "grad_norm": 1.9158386115366701, "learning_rate": 4.2958660523070936e-05, "loss": 0.6639, "step": 4440 }, { "epoch": 0.43178730836405976, "grad_norm": 1.6614034382324376, "learning_rate": 4.2942436238561885e-05, "loss": 0.6546, "step": 4450 }, { "epoch": 0.43275761692218123, "grad_norm": 1.528595189219001, "learning_rate": 4.292621195405283e-05, "loss": 0.6151, "step": 4460 }, { "epoch": 0.43372792548030276, "grad_norm": 1.297393909600355, "learning_rate": 4.290998766954378e-05, "loss": 0.6142, "step": 4470 }, { "epoch": 0.4346982340384242, "grad_norm": 1.6025277242190177, "learning_rate": 4.289376338503472e-05, "loss": 0.6415, "step": 4480 }, { "epoch": 0.4356685425965457, "grad_norm": 1.550877285078, "learning_rate": 4.287753910052567e-05, "loss": 0.6896, "step": 4490 }, { "epoch": 0.43663885115466716, "grad_norm": 1.8134887382719538, "learning_rate": 4.286131481601662e-05, "loss": 0.674, "step": 4500 }, { "epoch": 0.43663885115466716, "eval_loss": 0.70569908618927, "eval_runtime": 2470.7726, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 4500 }, { "epoch": 0.4376091597127887, "grad_norm": 1.5715320462855416, "learning_rate": 4.2845090531507564e-05, "loss": 0.6107, "step": 4510 }, { "epoch": 0.43857946827091016, "grad_norm": 1.659219569116577, "learning_rate": 4.2828866246998514e-05, "loss": 0.554, "step": 4520 }, { "epoch": 0.4395497768290316, "grad_norm": 1.5014148593583845, "learning_rate": 4.2812641962489457e-05, "loss": 0.6541, "step": 4530 }, { "epoch": 0.4405200853871531, "grad_norm": 1.2080187388338057, "learning_rate": 4.2796417677980406e-05, "loss": 0.6704, "step": 4540 }, { "epoch": 0.4414903939452746, "grad_norm": 1.4344070923299042, "learning_rate": 4.278019339347135e-05, "loss": 0.637, "step": 4550 }, { "epoch": 0.4424607025033961, "grad_norm": 1.9816113300875973, "learning_rate": 4.27639691089623e-05, "loss": 0.6036, "step": 4560 }, { "epoch": 0.44343101106151755, "grad_norm": 1.270379951275499, "learning_rate": 4.274774482445324e-05, "loss": 0.6481, "step": 4570 }, { "epoch": 0.444401319619639, "grad_norm": 1.559280795643455, "learning_rate": 4.273152053994419e-05, "loss": 0.62, "step": 4580 }, { "epoch": 0.44537162817776055, "grad_norm": 1.8377481053949813, "learning_rate": 4.271529625543514e-05, "loss": 0.6662, "step": 4590 }, { "epoch": 0.446341936735882, "grad_norm": 1.3952819132659193, "learning_rate": 4.2699071970926085e-05, "loss": 0.6969, "step": 4600 }, { "epoch": 0.4473122452940035, "grad_norm": 1.5967451654603113, "learning_rate": 4.2682847686417034e-05, "loss": 0.6604, "step": 4610 }, { "epoch": 0.44828255385212495, "grad_norm": 1.5849329398639342, "learning_rate": 4.266662340190798e-05, "loss": 0.6589, "step": 4620 }, { "epoch": 0.4492528624102465, "grad_norm": 1.7238645892594557, "learning_rate": 4.265039911739893e-05, "loss": 0.6658, "step": 4630 }, { "epoch": 0.45022317096836795, "grad_norm": 1.7849098949361206, "learning_rate": 4.263417483288987e-05, "loss": 0.6507, "step": 4640 }, { "epoch": 0.4511934795264894, "grad_norm": 1.3539873989398281, "learning_rate": 4.261795054838082e-05, "loss": 0.631, "step": 4650 }, { "epoch": 0.4521637880846109, "grad_norm": 1.6177606598072791, "learning_rate": 4.260172626387176e-05, "loss": 0.7667, "step": 4660 }, { "epoch": 0.4531340966427324, "grad_norm": 1.7683798129102917, "learning_rate": 4.258550197936271e-05, "loss": 0.6182, "step": 4670 }, { "epoch": 0.4541044052008539, "grad_norm": 1.5536790285936453, "learning_rate": 4.256927769485366e-05, "loss": 0.6515, "step": 4680 }, { "epoch": 0.45507471375897535, "grad_norm": 1.4626963492189242, "learning_rate": 4.2553053410344605e-05, "loss": 0.5595, "step": 4690 }, { "epoch": 0.4560450223170968, "grad_norm": 1.8455989589758681, "learning_rate": 4.2536829125835555e-05, "loss": 0.6382, "step": 4700 }, { "epoch": 0.45701533087521834, "grad_norm": 1.8260482347716946, "learning_rate": 4.25206048413265e-05, "loss": 0.6708, "step": 4710 }, { "epoch": 0.4579856394333398, "grad_norm": 1.506588655076192, "learning_rate": 4.250438055681745e-05, "loss": 0.6138, "step": 4720 }, { "epoch": 0.4589559479914613, "grad_norm": 1.7415606928937182, "learning_rate": 4.248815627230839e-05, "loss": 0.6688, "step": 4730 }, { "epoch": 0.45992625654958275, "grad_norm": 1.2860177177533143, "learning_rate": 4.247193198779934e-05, "loss": 0.6496, "step": 4740 }, { "epoch": 0.46089656510770427, "grad_norm": 1.419953340190783, "learning_rate": 4.245570770329029e-05, "loss": 0.6399, "step": 4750 }, { "epoch": 0.46186687366582574, "grad_norm": 1.4197804283366926, "learning_rate": 4.2439483418781234e-05, "loss": 0.572, "step": 4760 }, { "epoch": 0.4628371822239472, "grad_norm": 1.1761100117238914, "learning_rate": 4.242325913427218e-05, "loss": 0.6099, "step": 4770 }, { "epoch": 0.4638074907820687, "grad_norm": 1.5767612487613212, "learning_rate": 4.2407034849763126e-05, "loss": 0.6223, "step": 4780 }, { "epoch": 0.4647777993401902, "grad_norm": 1.8050008733247063, "learning_rate": 4.2390810565254076e-05, "loss": 0.5695, "step": 4790 }, { "epoch": 0.46574810789831167, "grad_norm": 1.3088703288484584, "learning_rate": 4.237458628074502e-05, "loss": 0.5723, "step": 4800 }, { "epoch": 0.46671841645643314, "grad_norm": 1.7711140961973422, "learning_rate": 4.235836199623597e-05, "loss": 0.6173, "step": 4810 }, { "epoch": 0.4676887250145546, "grad_norm": 1.6431929005817145, "learning_rate": 4.234213771172691e-05, "loss": 0.5982, "step": 4820 }, { "epoch": 0.46865903357267613, "grad_norm": 1.557431489902951, "learning_rate": 4.232591342721786e-05, "loss": 0.6098, "step": 4830 }, { "epoch": 0.4696293421307976, "grad_norm": 1.4479656995240782, "learning_rate": 4.230968914270881e-05, "loss": 0.5699, "step": 4840 }, { "epoch": 0.47059965068891907, "grad_norm": 1.6046344258439647, "learning_rate": 4.2293464858199754e-05, "loss": 0.6243, "step": 4850 }, { "epoch": 0.47156995924704054, "grad_norm": 1.252218532607539, "learning_rate": 4.2277240573690704e-05, "loss": 0.5787, "step": 4860 }, { "epoch": 0.47254026780516206, "grad_norm": 1.4372595855894126, "learning_rate": 4.226101628918165e-05, "loss": 0.6483, "step": 4870 }, { "epoch": 0.47351057636328353, "grad_norm": 1.6112811676393963, "learning_rate": 4.22447920046726e-05, "loss": 0.6042, "step": 4880 }, { "epoch": 0.474480884921405, "grad_norm": 1.6387669351547591, "learning_rate": 4.222856772016354e-05, "loss": 0.6176, "step": 4890 }, { "epoch": 0.47545119347952647, "grad_norm": 1.4893296879883764, "learning_rate": 4.221234343565449e-05, "loss": 0.5976, "step": 4900 }, { "epoch": 0.476421502037648, "grad_norm": 1.663980815080282, "learning_rate": 4.219611915114543e-05, "loss": 0.6655, "step": 4910 }, { "epoch": 0.47739181059576946, "grad_norm": 1.3387695933594599, "learning_rate": 4.217989486663638e-05, "loss": 0.6166, "step": 4920 }, { "epoch": 0.47836211915389093, "grad_norm": 1.7319521884893254, "learning_rate": 4.216367058212733e-05, "loss": 0.6225, "step": 4930 }, { "epoch": 0.4793324277120124, "grad_norm": 1.4563922038982844, "learning_rate": 4.2147446297618275e-05, "loss": 0.6351, "step": 4940 }, { "epoch": 0.4803027362701339, "grad_norm": 1.7473358339447822, "learning_rate": 4.2131222013109225e-05, "loss": 0.6205, "step": 4950 }, { "epoch": 0.4812730448282554, "grad_norm": 1.4006136660964967, "learning_rate": 4.211499772860017e-05, "loss": 0.6992, "step": 4960 }, { "epoch": 0.48224335338637686, "grad_norm": 1.7446969484471915, "learning_rate": 4.209877344409112e-05, "loss": 0.6251, "step": 4970 }, { "epoch": 0.48321366194449833, "grad_norm": 1.3789648220512414, "learning_rate": 4.208254915958206e-05, "loss": 0.5856, "step": 4980 }, { "epoch": 0.48418397050261985, "grad_norm": 1.77919007957107, "learning_rate": 4.206632487507301e-05, "loss": 0.621, "step": 4990 }, { "epoch": 0.4851542790607413, "grad_norm": 1.5926084629766706, "learning_rate": 4.2050100590563953e-05, "loss": 0.6542, "step": 5000 }, { "epoch": 0.4851542790607413, "eval_loss": 0.6975318789482117, "eval_runtime": 2469.0533, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 5000 }, { "epoch": 0.4861245876188628, "grad_norm": 1.7733999980186248, "learning_rate": 4.20338763060549e-05, "loss": 0.6943, "step": 5010 }, { "epoch": 0.48709489617698426, "grad_norm": 1.7221032223753934, "learning_rate": 4.201765202154585e-05, "loss": 0.6338, "step": 5020 }, { "epoch": 0.4880652047351058, "grad_norm": 1.7180735514930228, "learning_rate": 4.2001427737036796e-05, "loss": 0.6698, "step": 5030 }, { "epoch": 0.48903551329322725, "grad_norm": 1.698242629280347, "learning_rate": 4.1985203452527746e-05, "loss": 0.6972, "step": 5040 }, { "epoch": 0.4900058218513487, "grad_norm": 1.8269714939747912, "learning_rate": 4.196897916801869e-05, "loss": 0.6212, "step": 5050 }, { "epoch": 0.4909761304094702, "grad_norm": 1.4693827904505679, "learning_rate": 4.195275488350964e-05, "loss": 0.5933, "step": 5060 }, { "epoch": 0.4919464389675917, "grad_norm": 1.49361517085645, "learning_rate": 4.193653059900058e-05, "loss": 0.5489, "step": 5070 }, { "epoch": 0.4929167475257132, "grad_norm": 1.5630723710374232, "learning_rate": 4.192030631449153e-05, "loss": 0.6578, "step": 5080 }, { "epoch": 0.49388705608383465, "grad_norm": 1.6495843346442778, "learning_rate": 4.1904082029982474e-05, "loss": 0.6906, "step": 5090 }, { "epoch": 0.4948573646419561, "grad_norm": 1.7983678208711196, "learning_rate": 4.1887857745473424e-05, "loss": 0.5486, "step": 5100 }, { "epoch": 0.49582767320007765, "grad_norm": 2.00722752034129, "learning_rate": 4.1871633460964374e-05, "loss": 0.648, "step": 5110 }, { "epoch": 0.4967979817581991, "grad_norm": 1.7918897206429327, "learning_rate": 4.185540917645532e-05, "loss": 0.6693, "step": 5120 }, { "epoch": 0.4977682903163206, "grad_norm": 1.3410097713324785, "learning_rate": 4.1839184891946267e-05, "loss": 0.6065, "step": 5130 }, { "epoch": 0.49873859887444205, "grad_norm": 1.8485275360772722, "learning_rate": 4.182296060743721e-05, "loss": 0.6802, "step": 5140 }, { "epoch": 0.4997089074325636, "grad_norm": 1.923253820990962, "learning_rate": 4.180673632292816e-05, "loss": 0.6014, "step": 5150 }, { "epoch": 0.500679215990685, "grad_norm": 1.325223633752785, "learning_rate": 4.17905120384191e-05, "loss": 0.5787, "step": 5160 }, { "epoch": 0.5016495245488065, "grad_norm": 1.7937826702440574, "learning_rate": 4.177428775391005e-05, "loss": 0.6197, "step": 5170 }, { "epoch": 0.502619833106928, "grad_norm": 1.4050667202069218, "learning_rate": 4.1758063469401e-05, "loss": 0.6, "step": 5180 }, { "epoch": 0.5035901416650495, "grad_norm": 1.5656335055581363, "learning_rate": 4.174183918489195e-05, "loss": 0.6924, "step": 5190 }, { "epoch": 0.504560450223171, "grad_norm": 1.3325372640233277, "learning_rate": 4.1725614900382895e-05, "loss": 0.6477, "step": 5200 }, { "epoch": 0.5055307587812925, "grad_norm": 1.5612035852842387, "learning_rate": 4.1709390615873844e-05, "loss": 0.6372, "step": 5210 }, { "epoch": 0.5065010673394139, "grad_norm": 1.6164770277307943, "learning_rate": 4.1693166331364794e-05, "loss": 0.5822, "step": 5220 }, { "epoch": 0.5074713758975354, "grad_norm": 1.8957196740567803, "learning_rate": 4.167694204685574e-05, "loss": 0.5695, "step": 5230 }, { "epoch": 0.5084416844556569, "grad_norm": 1.505057761308431, "learning_rate": 4.166071776234669e-05, "loss": 0.5893, "step": 5240 }, { "epoch": 0.5094119930137784, "grad_norm": 1.2392435413477463, "learning_rate": 4.164449347783763e-05, "loss": 0.6641, "step": 5250 }, { "epoch": 0.5103823015718999, "grad_norm": 1.8002682332596205, "learning_rate": 4.162826919332858e-05, "loss": 0.5892, "step": 5260 }, { "epoch": 0.5113526101300213, "grad_norm": 1.5161558066332184, "learning_rate": 4.161204490881952e-05, "loss": 0.5985, "step": 5270 }, { "epoch": 0.5123229186881428, "grad_norm": 1.3773663290971068, "learning_rate": 4.159582062431047e-05, "loss": 0.6244, "step": 5280 }, { "epoch": 0.5132932272462644, "grad_norm": 1.8649060352228473, "learning_rate": 4.1579596339801415e-05, "loss": 0.6144, "step": 5290 }, { "epoch": 0.5142635358043858, "grad_norm": 1.711468575333758, "learning_rate": 4.1563372055292365e-05, "loss": 0.5889, "step": 5300 }, { "epoch": 0.5152338443625073, "grad_norm": 1.4531536302097425, "learning_rate": 4.1547147770783315e-05, "loss": 0.6063, "step": 5310 }, { "epoch": 0.5162041529206287, "grad_norm": 1.6628035885794548, "learning_rate": 4.153092348627426e-05, "loss": 0.5872, "step": 5320 }, { "epoch": 0.5171744614787502, "grad_norm": 1.43386966167638, "learning_rate": 4.151469920176521e-05, "loss": 0.5836, "step": 5330 }, { "epoch": 0.5181447700368718, "grad_norm": 1.6338584720484304, "learning_rate": 4.149847491725615e-05, "loss": 0.583, "step": 5340 }, { "epoch": 0.5191150785949932, "grad_norm": 1.602663089927568, "learning_rate": 4.14822506327471e-05, "loss": 0.6466, "step": 5350 }, { "epoch": 0.5200853871531147, "grad_norm": 0.993656188560276, "learning_rate": 4.1466026348238044e-05, "loss": 0.5972, "step": 5360 }, { "epoch": 0.5210556957112362, "grad_norm": 1.5353879720655148, "learning_rate": 4.144980206372899e-05, "loss": 0.6159, "step": 5370 }, { "epoch": 0.5220260042693576, "grad_norm": 1.4710027502404226, "learning_rate": 4.143357777921994e-05, "loss": 0.6231, "step": 5380 }, { "epoch": 0.5229963128274792, "grad_norm": 1.3154974116618938, "learning_rate": 4.1417353494710886e-05, "loss": 0.6414, "step": 5390 }, { "epoch": 0.5239666213856006, "grad_norm": 1.5917256061619933, "learning_rate": 4.1401129210201836e-05, "loss": 0.5574, "step": 5400 }, { "epoch": 0.5249369299437221, "grad_norm": 1.999701613939348, "learning_rate": 4.138490492569278e-05, "loss": 0.6659, "step": 5410 }, { "epoch": 0.5259072385018436, "grad_norm": 1.158059375940914, "learning_rate": 4.136868064118373e-05, "loss": 0.5702, "step": 5420 }, { "epoch": 0.526877547059965, "grad_norm": 1.6503529559993917, "learning_rate": 4.135245635667467e-05, "loss": 0.5866, "step": 5430 }, { "epoch": 0.5278478556180866, "grad_norm": 1.778595452682844, "learning_rate": 4.133623207216562e-05, "loss": 0.6727, "step": 5440 }, { "epoch": 0.5288181641762081, "grad_norm": 2.153378806689067, "learning_rate": 4.1320007787656564e-05, "loss": 0.5849, "step": 5450 }, { "epoch": 0.5297884727343295, "grad_norm": 1.5480145123110607, "learning_rate": 4.1303783503147514e-05, "loss": 0.5985, "step": 5460 }, { "epoch": 0.530758781292451, "grad_norm": 1.3074515838089584, "learning_rate": 4.1287559218638464e-05, "loss": 0.6567, "step": 5470 }, { "epoch": 0.5317290898505724, "grad_norm": 1.2634236320868193, "learning_rate": 4.127133493412941e-05, "loss": 0.6744, "step": 5480 }, { "epoch": 0.532699398408694, "grad_norm": 1.4158920202942755, "learning_rate": 4.125511064962036e-05, "loss": 0.6089, "step": 5490 }, { "epoch": 0.5336697069668155, "grad_norm": 1.530835103672291, "learning_rate": 4.12388863651113e-05, "loss": 0.6637, "step": 5500 }, { "epoch": 0.5336697069668155, "eval_loss": 0.6922717094421387, "eval_runtime": 2471.913, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 5500 }, { "epoch": 0.5346400155249369, "grad_norm": 1.0979844633621818, "learning_rate": 4.122266208060225e-05, "loss": 0.6711, "step": 5510 }, { "epoch": 0.5356103240830584, "grad_norm": 1.9531703151807043, "learning_rate": 4.120643779609319e-05, "loss": 0.634, "step": 5520 }, { "epoch": 0.5365806326411799, "grad_norm": 1.4345990864009392, "learning_rate": 4.119021351158414e-05, "loss": 0.6289, "step": 5530 }, { "epoch": 0.5375509411993014, "grad_norm": 1.5002874747132873, "learning_rate": 4.1173989227075085e-05, "loss": 0.6655, "step": 5540 }, { "epoch": 0.5385212497574229, "grad_norm": 1.4114133378596565, "learning_rate": 4.1157764942566035e-05, "loss": 0.6314, "step": 5550 }, { "epoch": 0.5394915583155443, "grad_norm": 1.2792208401822267, "learning_rate": 4.1141540658056985e-05, "loss": 0.5622, "step": 5560 }, { "epoch": 0.5404618668736658, "grad_norm": 1.4436849148969537, "learning_rate": 4.112531637354793e-05, "loss": 0.6758, "step": 5570 }, { "epoch": 0.5414321754317873, "grad_norm": 1.495469226735889, "learning_rate": 4.110909208903888e-05, "loss": 0.6457, "step": 5580 }, { "epoch": 0.5424024839899088, "grad_norm": 1.5696729181281173, "learning_rate": 4.109286780452982e-05, "loss": 0.5434, "step": 5590 }, { "epoch": 0.5433727925480303, "grad_norm": 1.4884099597371836, "learning_rate": 4.107664352002077e-05, "loss": 0.6602, "step": 5600 }, { "epoch": 0.5443431011061518, "grad_norm": 1.364422329620206, "learning_rate": 4.106041923551171e-05, "loss": 0.6093, "step": 5610 }, { "epoch": 0.5453134096642732, "grad_norm": 1.710525492450016, "learning_rate": 4.104419495100266e-05, "loss": 0.5733, "step": 5620 }, { "epoch": 0.5462837182223947, "grad_norm": 1.8645615729328109, "learning_rate": 4.1027970666493606e-05, "loss": 0.6301, "step": 5630 }, { "epoch": 0.5472540267805162, "grad_norm": 2.1212649730065403, "learning_rate": 4.1011746381984556e-05, "loss": 0.6489, "step": 5640 }, { "epoch": 0.5482243353386377, "grad_norm": 1.3978835293983003, "learning_rate": 4.0995522097475506e-05, "loss": 0.6651, "step": 5650 }, { "epoch": 0.5491946438967592, "grad_norm": 1.968855731350402, "learning_rate": 4.097929781296645e-05, "loss": 0.5849, "step": 5660 }, { "epoch": 0.5501649524548806, "grad_norm": 1.3003479487793523, "learning_rate": 4.09630735284574e-05, "loss": 0.5981, "step": 5670 }, { "epoch": 0.5511352610130021, "grad_norm": 1.3093675797291962, "learning_rate": 4.094684924394834e-05, "loss": 0.6001, "step": 5680 }, { "epoch": 0.5521055695711237, "grad_norm": 1.386619668616355, "learning_rate": 4.093062495943929e-05, "loss": 0.587, "step": 5690 }, { "epoch": 0.5530758781292451, "grad_norm": 2.1925053494451108, "learning_rate": 4.0914400674930234e-05, "loss": 0.6748, "step": 5700 }, { "epoch": 0.5540461866873666, "grad_norm": 1.401533269192193, "learning_rate": 4.0898176390421184e-05, "loss": 0.5987, "step": 5710 }, { "epoch": 0.555016495245488, "grad_norm": 1.8417454170154262, "learning_rate": 4.088195210591213e-05, "loss": 0.6169, "step": 5720 }, { "epoch": 0.5559868038036095, "grad_norm": 1.70211748961807, "learning_rate": 4.086572782140308e-05, "loss": 0.6231, "step": 5730 }, { "epoch": 0.5569571123617311, "grad_norm": 1.449836249162655, "learning_rate": 4.0849503536894026e-05, "loss": 0.6636, "step": 5740 }, { "epoch": 0.5579274209198525, "grad_norm": 1.411704552119965, "learning_rate": 4.083327925238497e-05, "loss": 0.6334, "step": 5750 }, { "epoch": 0.558897729477974, "grad_norm": 1.3373247464479125, "learning_rate": 4.081705496787592e-05, "loss": 0.6127, "step": 5760 }, { "epoch": 0.5598680380360955, "grad_norm": 1.6669305203635734, "learning_rate": 4.080083068336686e-05, "loss": 0.6419, "step": 5770 }, { "epoch": 0.5608383465942169, "grad_norm": 1.3599639094085172, "learning_rate": 4.078460639885781e-05, "loss": 0.6838, "step": 5780 }, { "epoch": 0.5618086551523385, "grad_norm": 1.5754322431175416, "learning_rate": 4.0768382114348755e-05, "loss": 0.5997, "step": 5790 }, { "epoch": 0.5627789637104599, "grad_norm": 1.667207521590911, "learning_rate": 4.0752157829839705e-05, "loss": 0.6199, "step": 5800 }, { "epoch": 0.5637492722685814, "grad_norm": 1.5869372089733027, "learning_rate": 4.0735933545330654e-05, "loss": 0.5779, "step": 5810 }, { "epoch": 0.5647195808267029, "grad_norm": 1.7150097071735784, "learning_rate": 4.07197092608216e-05, "loss": 0.5978, "step": 5820 }, { "epoch": 0.5656898893848243, "grad_norm": 1.1835353092728575, "learning_rate": 4.070348497631255e-05, "loss": 0.6781, "step": 5830 }, { "epoch": 0.5666601979429459, "grad_norm": 1.7307613010314937, "learning_rate": 4.068726069180349e-05, "loss": 0.634, "step": 5840 }, { "epoch": 0.5676305065010674, "grad_norm": 1.6364191249486493, "learning_rate": 4.067103640729444e-05, "loss": 0.6377, "step": 5850 }, { "epoch": 0.5686008150591888, "grad_norm": 1.4754833764988036, "learning_rate": 4.065481212278538e-05, "loss": 0.6148, "step": 5860 }, { "epoch": 0.5695711236173103, "grad_norm": 1.8389583610281375, "learning_rate": 4.063858783827633e-05, "loss": 0.6739, "step": 5870 }, { "epoch": 0.5705414321754317, "grad_norm": 1.4669099461048227, "learning_rate": 4.0622363553767276e-05, "loss": 0.598, "step": 5880 }, { "epoch": 0.5715117407335533, "grad_norm": 1.609973267529918, "learning_rate": 4.0606139269258226e-05, "loss": 0.5476, "step": 5890 }, { "epoch": 0.5724820492916748, "grad_norm": 1.6951117185016165, "learning_rate": 4.0589914984749175e-05, "loss": 0.595, "step": 5900 }, { "epoch": 0.5734523578497962, "grad_norm": 1.494158253886906, "learning_rate": 4.057369070024012e-05, "loss": 0.6886, "step": 5910 }, { "epoch": 0.5744226664079177, "grad_norm": 1.3036559330975726, "learning_rate": 4.055746641573107e-05, "loss": 0.5986, "step": 5920 }, { "epoch": 0.5753929749660393, "grad_norm": 1.4737709906961922, "learning_rate": 4.054124213122201e-05, "loss": 0.615, "step": 5930 }, { "epoch": 0.5763632835241607, "grad_norm": 1.4022175413897375, "learning_rate": 4.052501784671296e-05, "loss": 0.6398, "step": 5940 }, { "epoch": 0.5773335920822822, "grad_norm": 1.433959705605443, "learning_rate": 4.0508793562203904e-05, "loss": 0.6415, "step": 5950 }, { "epoch": 0.5783039006404036, "grad_norm": 1.7123129741732308, "learning_rate": 4.0492569277694854e-05, "loss": 0.6216, "step": 5960 }, { "epoch": 0.5792742091985251, "grad_norm": 2.131052124619936, "learning_rate": 4.04763449931858e-05, "loss": 0.5596, "step": 5970 }, { "epoch": 0.5802445177566466, "grad_norm": 1.2233223645362516, "learning_rate": 4.046012070867675e-05, "loss": 0.6023, "step": 5980 }, { "epoch": 0.5812148263147681, "grad_norm": 1.734811390420522, "learning_rate": 4.0443896424167696e-05, "loss": 0.6347, "step": 5990 }, { "epoch": 0.5821851348728896, "grad_norm": 1.626024609025115, "learning_rate": 4.0427672139658646e-05, "loss": 0.5688, "step": 6000 }, { "epoch": 0.5821851348728896, "eval_loss": 0.6903010606765747, "eval_runtime": 2473.5711, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 6000 }, { "epoch": 0.5831554434310111, "grad_norm": 1.3892850877811105, "learning_rate": 4.0411447855149596e-05, "loss": 0.6131, "step": 6010 }, { "epoch": 0.5841257519891325, "grad_norm": 1.6290044308163973, "learning_rate": 4.039522357064054e-05, "loss": 0.6319, "step": 6020 }, { "epoch": 0.585096060547254, "grad_norm": 1.7770012777694764, "learning_rate": 4.037899928613149e-05, "loss": 0.5756, "step": 6030 }, { "epoch": 0.5860663691053755, "grad_norm": 0.9954028008191703, "learning_rate": 4.036277500162243e-05, "loss": 0.6161, "step": 6040 }, { "epoch": 0.587036677663497, "grad_norm": 1.7893541919562175, "learning_rate": 4.034655071711338e-05, "loss": 0.62, "step": 6050 }, { "epoch": 0.5880069862216185, "grad_norm": 1.6500512739247042, "learning_rate": 4.0330326432604324e-05, "loss": 0.5928, "step": 6060 }, { "epoch": 0.5889772947797399, "grad_norm": 1.6568317215206447, "learning_rate": 4.0314102148095274e-05, "loss": 0.6043, "step": 6070 }, { "epoch": 0.5899476033378614, "grad_norm": 2.006336465212855, "learning_rate": 4.029787786358622e-05, "loss": 0.6283, "step": 6080 }, { "epoch": 0.590917911895983, "grad_norm": 1.481658511307882, "learning_rate": 4.028165357907717e-05, "loss": 0.6229, "step": 6090 }, { "epoch": 0.5918882204541044, "grad_norm": 1.7467752898199094, "learning_rate": 4.0265429294568117e-05, "loss": 0.5997, "step": 6100 }, { "epoch": 0.5928585290122259, "grad_norm": 1.747997366501937, "learning_rate": 4.024920501005906e-05, "loss": 0.6441, "step": 6110 }, { "epoch": 0.5938288375703473, "grad_norm": 1.6963969244893895, "learning_rate": 4.023298072555001e-05, "loss": 0.6059, "step": 6120 }, { "epoch": 0.5947991461284688, "grad_norm": 1.335184560881826, "learning_rate": 4.021675644104095e-05, "loss": 0.5627, "step": 6130 }, { "epoch": 0.5957694546865904, "grad_norm": 1.5256295408049876, "learning_rate": 4.02005321565319e-05, "loss": 0.578, "step": 6140 }, { "epoch": 0.5967397632447118, "grad_norm": 1.7615333752357474, "learning_rate": 4.0184307872022845e-05, "loss": 0.6046, "step": 6150 }, { "epoch": 0.5977100718028333, "grad_norm": 1.9321393707932597, "learning_rate": 4.0168083587513795e-05, "loss": 0.615, "step": 6160 }, { "epoch": 0.5986803803609548, "grad_norm": 1.2267052175289888, "learning_rate": 4.015185930300474e-05, "loss": 0.6238, "step": 6170 }, { "epoch": 0.5996506889190762, "grad_norm": 1.8092624768121868, "learning_rate": 4.013563501849569e-05, "loss": 0.6514, "step": 6180 }, { "epoch": 0.6006209974771978, "grad_norm": 1.546108807816704, "learning_rate": 4.011941073398664e-05, "loss": 0.6071, "step": 6190 }, { "epoch": 0.6015913060353192, "grad_norm": 1.8186789877657976, "learning_rate": 4.010318644947758e-05, "loss": 0.6624, "step": 6200 }, { "epoch": 0.6025616145934407, "grad_norm": 1.6349446430924426, "learning_rate": 4.008696216496853e-05, "loss": 0.6208, "step": 6210 }, { "epoch": 0.6035319231515622, "grad_norm": 1.6574782074117176, "learning_rate": 4.007073788045947e-05, "loss": 0.6067, "step": 6220 }, { "epoch": 0.6045022317096836, "grad_norm": 1.372441926371392, "learning_rate": 4.005451359595042e-05, "loss": 0.5805, "step": 6230 }, { "epoch": 0.6054725402678052, "grad_norm": 2.1366961584133164, "learning_rate": 4.0038289311441366e-05, "loss": 0.6797, "step": 6240 }, { "epoch": 0.6064428488259267, "grad_norm": 1.8553936516223448, "learning_rate": 4.0022065026932316e-05, "loss": 0.582, "step": 6250 }, { "epoch": 0.6074131573840481, "grad_norm": 1.95303083914795, "learning_rate": 4.000584074242326e-05, "loss": 0.6474, "step": 6260 }, { "epoch": 0.6083834659421696, "grad_norm": 1.6884162400220937, "learning_rate": 3.998961645791421e-05, "loss": 0.6339, "step": 6270 }, { "epoch": 0.609353774500291, "grad_norm": 1.7825690304753616, "learning_rate": 3.997339217340516e-05, "loss": 0.5568, "step": 6280 }, { "epoch": 0.6103240830584126, "grad_norm": 1.5564493386693086, "learning_rate": 3.99571678888961e-05, "loss": 0.602, "step": 6290 }, { "epoch": 0.6112943916165341, "grad_norm": 1.667415447715766, "learning_rate": 3.994094360438705e-05, "loss": 0.5877, "step": 6300 }, { "epoch": 0.6122647001746555, "grad_norm": 1.6232084554425263, "learning_rate": 3.9924719319877994e-05, "loss": 0.6247, "step": 6310 }, { "epoch": 0.613235008732777, "grad_norm": 1.7806635141428606, "learning_rate": 3.9908495035368944e-05, "loss": 0.5651, "step": 6320 }, { "epoch": 0.6142053172908986, "grad_norm": 1.2898375089967073, "learning_rate": 3.989227075085989e-05, "loss": 0.6206, "step": 6330 }, { "epoch": 0.61517562584902, "grad_norm": 1.762789516395008, "learning_rate": 3.9876046466350836e-05, "loss": 0.5538, "step": 6340 }, { "epoch": 0.6161459344071415, "grad_norm": 1.8092369800491652, "learning_rate": 3.985982218184178e-05, "loss": 0.6236, "step": 6350 }, { "epoch": 0.6171162429652629, "grad_norm": 1.8060820825881239, "learning_rate": 3.984359789733273e-05, "loss": 0.5875, "step": 6360 }, { "epoch": 0.6180865515233844, "grad_norm": 1.7317977897542403, "learning_rate": 3.982737361282368e-05, "loss": 0.594, "step": 6370 }, { "epoch": 0.619056860081506, "grad_norm": 1.5504516693013288, "learning_rate": 3.981114932831462e-05, "loss": 0.5833, "step": 6380 }, { "epoch": 0.6200271686396274, "grad_norm": 1.3714460442712506, "learning_rate": 3.979492504380557e-05, "loss": 0.6048, "step": 6390 }, { "epoch": 0.6209974771977489, "grad_norm": 1.741408754147969, "learning_rate": 3.9778700759296515e-05, "loss": 0.6729, "step": 6400 }, { "epoch": 0.6219677857558704, "grad_norm": 1.0600787702033703, "learning_rate": 3.9762476474787465e-05, "loss": 0.6934, "step": 6410 }, { "epoch": 0.6229380943139918, "grad_norm": 1.5722449067246123, "learning_rate": 3.974625219027841e-05, "loss": 0.6043, "step": 6420 }, { "epoch": 0.6239084028721134, "grad_norm": 1.3176596782485965, "learning_rate": 3.973002790576936e-05, "loss": 0.5898, "step": 6430 }, { "epoch": 0.6248787114302348, "grad_norm": 1.4177190750303141, "learning_rate": 3.971380362126031e-05, "loss": 0.6146, "step": 6440 }, { "epoch": 0.6258490199883563, "grad_norm": 1.3993450966845369, "learning_rate": 3.969757933675125e-05, "loss": 0.608, "step": 6450 }, { "epoch": 0.6268193285464778, "grad_norm": 1.6552784809097676, "learning_rate": 3.96813550522422e-05, "loss": 0.6078, "step": 6460 }, { "epoch": 0.6277896371045992, "grad_norm": 1.315768136368434, "learning_rate": 3.966513076773314e-05, "loss": 0.6021, "step": 6470 }, { "epoch": 0.6287599456627208, "grad_norm": 1.431816586824017, "learning_rate": 3.964890648322409e-05, "loss": 0.5571, "step": 6480 }, { "epoch": 0.6297302542208423, "grad_norm": 1.4950226300857892, "learning_rate": 3.9632682198715036e-05, "loss": 0.6297, "step": 6490 }, { "epoch": 0.6307005627789637, "grad_norm": 2.0781333722978284, "learning_rate": 3.9616457914205985e-05, "loss": 0.596, "step": 6500 }, { "epoch": 0.6307005627789637, "eval_loss": 0.6844401955604553, "eval_runtime": 2468.3658, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 6500 }, { "epoch": 0.6316708713370852, "grad_norm": 1.5337482284796442, "learning_rate": 3.960023362969693e-05, "loss": 0.6634, "step": 6510 }, { "epoch": 0.6326411798952066, "grad_norm": 1.7040234243162709, "learning_rate": 3.958400934518788e-05, "loss": 0.5936, "step": 6520 }, { "epoch": 0.6336114884533282, "grad_norm": 1.7943789476429368, "learning_rate": 3.956778506067883e-05, "loss": 0.6361, "step": 6530 }, { "epoch": 0.6345817970114497, "grad_norm": 1.5737933588874193, "learning_rate": 3.955156077616977e-05, "loss": 0.6192, "step": 6540 }, { "epoch": 0.6355521055695711, "grad_norm": 1.532774889369556, "learning_rate": 3.953533649166072e-05, "loss": 0.5694, "step": 6550 }, { "epoch": 0.6365224141276926, "grad_norm": 1.6854751291833254, "learning_rate": 3.9519112207151664e-05, "loss": 0.6365, "step": 6560 }, { "epoch": 0.6374927226858141, "grad_norm": 1.4850693595634958, "learning_rate": 3.9502887922642613e-05, "loss": 0.5966, "step": 6570 }, { "epoch": 0.6384630312439356, "grad_norm": 1.5032167913978984, "learning_rate": 3.9486663638133556e-05, "loss": 0.6413, "step": 6580 }, { "epoch": 0.6394333398020571, "grad_norm": 1.4058296063426399, "learning_rate": 3.9470439353624506e-05, "loss": 0.5845, "step": 6590 }, { "epoch": 0.6404036483601785, "grad_norm": 1.563521067932564, "learning_rate": 3.945421506911545e-05, "loss": 0.6179, "step": 6600 }, { "epoch": 0.6413739569183, "grad_norm": 1.4036808350972751, "learning_rate": 3.94379907846064e-05, "loss": 0.5942, "step": 6610 }, { "epoch": 0.6423442654764215, "grad_norm": 1.4964922216668848, "learning_rate": 3.942176650009735e-05, "loss": 0.6516, "step": 6620 }, { "epoch": 0.643314574034543, "grad_norm": 1.6801427039047954, "learning_rate": 3.940554221558829e-05, "loss": 0.6192, "step": 6630 }, { "epoch": 0.6442848825926645, "grad_norm": 1.582294622357827, "learning_rate": 3.938931793107924e-05, "loss": 0.6051, "step": 6640 }, { "epoch": 0.645255191150786, "grad_norm": 1.531859995817891, "learning_rate": 3.9373093646570185e-05, "loss": 0.6407, "step": 6650 }, { "epoch": 0.6462254997089074, "grad_norm": 1.9088041020547684, "learning_rate": 3.9356869362061134e-05, "loss": 0.6078, "step": 6660 }, { "epoch": 0.6471958082670289, "grad_norm": 1.3066981837793257, "learning_rate": 3.934064507755208e-05, "loss": 0.5665, "step": 6670 }, { "epoch": 0.6481661168251504, "grad_norm": 1.313352722546981, "learning_rate": 3.932442079304303e-05, "loss": 0.5201, "step": 6680 }, { "epoch": 0.6491364253832719, "grad_norm": 1.8145770193834194, "learning_rate": 3.930819650853397e-05, "loss": 0.636, "step": 6690 }, { "epoch": 0.6501067339413934, "grad_norm": 1.6198734288858008, "learning_rate": 3.929197222402492e-05, "loss": 0.6529, "step": 6700 }, { "epoch": 0.6510770424995148, "grad_norm": 1.6029042771301745, "learning_rate": 3.927574793951587e-05, "loss": 0.5777, "step": 6710 }, { "epoch": 0.6520473510576363, "grad_norm": 1.7982640944263595, "learning_rate": 3.925952365500681e-05, "loss": 0.5833, "step": 6720 }, { "epoch": 0.6530176596157579, "grad_norm": 1.9034497019204402, "learning_rate": 3.924329937049776e-05, "loss": 0.5981, "step": 6730 }, { "epoch": 0.6539879681738793, "grad_norm": 1.3239435404528657, "learning_rate": 3.9227075085988705e-05, "loss": 0.5861, "step": 6740 }, { "epoch": 0.6549582767320008, "grad_norm": 1.7333381509447607, "learning_rate": 3.921085080147966e-05, "loss": 0.5696, "step": 6750 }, { "epoch": 0.6559285852901222, "grad_norm": 1.7408114949745195, "learning_rate": 3.9194626516970605e-05, "loss": 0.5868, "step": 6760 }, { "epoch": 0.6568988938482437, "grad_norm": 1.4140478484561587, "learning_rate": 3.9178402232461555e-05, "loss": 0.6126, "step": 6770 }, { "epoch": 0.6578692024063653, "grad_norm": 1.2359753742563322, "learning_rate": 3.91621779479525e-05, "loss": 0.595, "step": 6780 }, { "epoch": 0.6588395109644867, "grad_norm": 1.4986954153208918, "learning_rate": 3.914595366344345e-05, "loss": 0.6399, "step": 6790 }, { "epoch": 0.6598098195226082, "grad_norm": 1.60396773259511, "learning_rate": 3.912972937893439e-05, "loss": 0.573, "step": 6800 }, { "epoch": 0.6607801280807297, "grad_norm": 1.6446241731745532, "learning_rate": 3.911350509442534e-05, "loss": 0.5758, "step": 6810 }, { "epoch": 0.6617504366388511, "grad_norm": 1.6940327126152295, "learning_rate": 3.909728080991629e-05, "loss": 0.5646, "step": 6820 }, { "epoch": 0.6627207451969727, "grad_norm": 1.6725911752868794, "learning_rate": 3.908105652540723e-05, "loss": 0.6282, "step": 6830 }, { "epoch": 0.6636910537550941, "grad_norm": 1.724226678968073, "learning_rate": 3.906483224089818e-05, "loss": 0.6391, "step": 6840 }, { "epoch": 0.6646613623132156, "grad_norm": 1.5959383703623191, "learning_rate": 3.9048607956389126e-05, "loss": 0.6003, "step": 6850 }, { "epoch": 0.6656316708713371, "grad_norm": 1.7487148965006274, "learning_rate": 3.9032383671880075e-05, "loss": 0.6019, "step": 6860 }, { "epoch": 0.6666019794294585, "grad_norm": 2.0088682339975312, "learning_rate": 3.901615938737102e-05, "loss": 0.5846, "step": 6870 }, { "epoch": 0.6675722879875801, "grad_norm": 1.5646052877933723, "learning_rate": 3.899993510286197e-05, "loss": 0.6222, "step": 6880 }, { "epoch": 0.6685425965457016, "grad_norm": 1.5750290096746722, "learning_rate": 3.898371081835291e-05, "loss": 0.6018, "step": 6890 }, { "epoch": 0.669512905103823, "grad_norm": 1.2258053581934112, "learning_rate": 3.896748653384386e-05, "loss": 0.5697, "step": 6900 }, { "epoch": 0.6704832136619445, "grad_norm": 1.9376975148901512, "learning_rate": 3.895126224933481e-05, "loss": 0.6168, "step": 6910 }, { "epoch": 0.6714535222200659, "grad_norm": 1.4670771869188424, "learning_rate": 3.8935037964825754e-05, "loss": 0.5964, "step": 6920 }, { "epoch": 0.6724238307781875, "grad_norm": 1.324084539118919, "learning_rate": 3.8918813680316704e-05, "loss": 0.5346, "step": 6930 }, { "epoch": 0.673394139336309, "grad_norm": 1.717496207353836, "learning_rate": 3.8902589395807647e-05, "loss": 0.6112, "step": 6940 }, { "epoch": 0.6743644478944304, "grad_norm": 2.019875964877309, "learning_rate": 3.8886365111298596e-05, "loss": 0.6522, "step": 6950 }, { "epoch": 0.6753347564525519, "grad_norm": 1.3354957180640497, "learning_rate": 3.887014082678954e-05, "loss": 0.6256, "step": 6960 }, { "epoch": 0.6763050650106734, "grad_norm": 1.6937952057212555, "learning_rate": 3.885391654228049e-05, "loss": 0.6176, "step": 6970 }, { "epoch": 0.6772753735687949, "grad_norm": 1.5422166727550382, "learning_rate": 3.883769225777143e-05, "loss": 0.5546, "step": 6980 }, { "epoch": 0.6782456821269164, "grad_norm": 1.7417491098472007, "learning_rate": 3.882146797326238e-05, "loss": 0.6267, "step": 6990 }, { "epoch": 0.6792159906850378, "grad_norm": 1.5366921907221318, "learning_rate": 3.880524368875333e-05, "loss": 0.6218, "step": 7000 }, { "epoch": 0.6792159906850378, "eval_loss": 0.678726851940155, "eval_runtime": 2469.9254, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 7000 }, { "epoch": 0.6801862992431593, "grad_norm": 1.5097589222422163, "learning_rate": 3.8789019404244275e-05, "loss": 0.5925, "step": 7010 }, { "epoch": 0.6811566078012808, "grad_norm": 2.1865443717654256, "learning_rate": 3.8772795119735224e-05, "loss": 0.6125, "step": 7020 }, { "epoch": 0.6821269163594023, "grad_norm": 1.3913795509553841, "learning_rate": 3.875657083522617e-05, "loss": 0.6366, "step": 7030 }, { "epoch": 0.6830972249175238, "grad_norm": 1.314507220249943, "learning_rate": 3.874034655071712e-05, "loss": 0.5609, "step": 7040 }, { "epoch": 0.6840675334756453, "grad_norm": 1.5985439411272098, "learning_rate": 3.872412226620806e-05, "loss": 0.5946, "step": 7050 }, { "epoch": 0.6850378420337667, "grad_norm": 1.8453526914168807, "learning_rate": 3.870789798169901e-05, "loss": 0.5389, "step": 7060 }, { "epoch": 0.6860081505918882, "grad_norm": 1.746744853463753, "learning_rate": 3.869167369718995e-05, "loss": 0.5821, "step": 7070 }, { "epoch": 0.6869784591500097, "grad_norm": 1.351619112366914, "learning_rate": 3.86754494126809e-05, "loss": 0.6333, "step": 7080 }, { "epoch": 0.6879487677081312, "grad_norm": 1.6766693580749343, "learning_rate": 3.865922512817185e-05, "loss": 0.5905, "step": 7090 }, { "epoch": 0.6889190762662527, "grad_norm": 1.6907020127934513, "learning_rate": 3.8643000843662795e-05, "loss": 0.583, "step": 7100 }, { "epoch": 0.6898893848243741, "grad_norm": 1.8609743061215689, "learning_rate": 3.8626776559153745e-05, "loss": 0.6131, "step": 7110 }, { "epoch": 0.6908596933824956, "grad_norm": 1.3507375736553595, "learning_rate": 3.861055227464469e-05, "loss": 0.6399, "step": 7120 }, { "epoch": 0.6918300019406172, "grad_norm": 1.7786145489846785, "learning_rate": 3.859432799013564e-05, "loss": 0.5148, "step": 7130 }, { "epoch": 0.6928003104987386, "grad_norm": 1.7336491727001198, "learning_rate": 3.857810370562658e-05, "loss": 0.6154, "step": 7140 }, { "epoch": 0.6937706190568601, "grad_norm": 1.4164103938896966, "learning_rate": 3.856187942111753e-05, "loss": 0.5768, "step": 7150 }, { "epoch": 0.6947409276149815, "grad_norm": 1.5705135987803418, "learning_rate": 3.854565513660848e-05, "loss": 0.6102, "step": 7160 }, { "epoch": 0.695711236173103, "grad_norm": 1.4121262983361098, "learning_rate": 3.8529430852099424e-05, "loss": 0.61, "step": 7170 }, { "epoch": 0.6966815447312246, "grad_norm": 1.8472104212242206, "learning_rate": 3.851320656759037e-05, "loss": 0.5911, "step": 7180 }, { "epoch": 0.697651853289346, "grad_norm": 1.5344754452736826, "learning_rate": 3.8496982283081316e-05, "loss": 0.606, "step": 7190 }, { "epoch": 0.6986221618474675, "grad_norm": 1.6606427535357149, "learning_rate": 3.8480757998572266e-05, "loss": 0.6062, "step": 7200 }, { "epoch": 0.699592470405589, "grad_norm": 1.633183446097436, "learning_rate": 3.846453371406321e-05, "loss": 0.5828, "step": 7210 }, { "epoch": 0.7005627789637104, "grad_norm": 1.646557901144212, "learning_rate": 3.844830942955416e-05, "loss": 0.5354, "step": 7220 }, { "epoch": 0.701533087521832, "grad_norm": 1.64918421348089, "learning_rate": 3.84320851450451e-05, "loss": 0.551, "step": 7230 }, { "epoch": 0.7025033960799534, "grad_norm": 1.6382049443147468, "learning_rate": 3.841586086053605e-05, "loss": 0.5723, "step": 7240 }, { "epoch": 0.7034737046380749, "grad_norm": 2.1034428309614523, "learning_rate": 3.8399636576027e-05, "loss": 0.6282, "step": 7250 }, { "epoch": 0.7044440131961964, "grad_norm": 1.4993896668880777, "learning_rate": 3.8383412291517944e-05, "loss": 0.5221, "step": 7260 }, { "epoch": 0.7054143217543178, "grad_norm": 1.5357479402580956, "learning_rate": 3.8367188007008894e-05, "loss": 0.6176, "step": 7270 }, { "epoch": 0.7063846303124394, "grad_norm": 1.5289777074279678, "learning_rate": 3.835096372249984e-05, "loss": 0.6045, "step": 7280 }, { "epoch": 0.7073549388705609, "grad_norm": 1.6907343435757922, "learning_rate": 3.833473943799079e-05, "loss": 0.5778, "step": 7290 }, { "epoch": 0.7083252474286823, "grad_norm": 1.546188967929013, "learning_rate": 3.831851515348173e-05, "loss": 0.5966, "step": 7300 }, { "epoch": 0.7092955559868038, "grad_norm": 1.6668813308937025, "learning_rate": 3.830229086897268e-05, "loss": 0.5443, "step": 7310 }, { "epoch": 0.7102658645449252, "grad_norm": 2.0411822746490444, "learning_rate": 3.828606658446362e-05, "loss": 0.5663, "step": 7320 }, { "epoch": 0.7112361731030468, "grad_norm": 1.5825597761459882, "learning_rate": 3.826984229995457e-05, "loss": 0.6143, "step": 7330 }, { "epoch": 0.7122064816611683, "grad_norm": 1.7867885913406227, "learning_rate": 3.825361801544552e-05, "loss": 0.6094, "step": 7340 }, { "epoch": 0.7131767902192897, "grad_norm": 1.4415937345324663, "learning_rate": 3.8237393730936465e-05, "loss": 0.552, "step": 7350 }, { "epoch": 0.7141470987774112, "grad_norm": 1.8623916924146842, "learning_rate": 3.8221169446427415e-05, "loss": 0.6035, "step": 7360 }, { "epoch": 0.7151174073355328, "grad_norm": 1.6112921767840875, "learning_rate": 3.820494516191836e-05, "loss": 0.5957, "step": 7370 }, { "epoch": 0.7160877158936542, "grad_norm": 1.6295510822010462, "learning_rate": 3.818872087740931e-05, "loss": 0.5906, "step": 7380 }, { "epoch": 0.7170580244517757, "grad_norm": 1.3787962703695553, "learning_rate": 3.817249659290025e-05, "loss": 0.5884, "step": 7390 }, { "epoch": 0.7180283330098971, "grad_norm": 1.5293927693421874, "learning_rate": 3.81562723083912e-05, "loss": 0.6277, "step": 7400 }, { "epoch": 0.7189986415680186, "grad_norm": 1.4147252260121417, "learning_rate": 3.8140048023882143e-05, "loss": 0.5725, "step": 7410 }, { "epoch": 0.7199689501261402, "grad_norm": 1.7736403425897478, "learning_rate": 3.812382373937309e-05, "loss": 0.533, "step": 7420 }, { "epoch": 0.7209392586842616, "grad_norm": 1.642532067779172, "learning_rate": 3.810759945486404e-05, "loss": 0.6405, "step": 7430 }, { "epoch": 0.7219095672423831, "grad_norm": 1.8702737255141961, "learning_rate": 3.8091375170354986e-05, "loss": 0.523, "step": 7440 }, { "epoch": 0.7228798758005046, "grad_norm": 1.82343375200645, "learning_rate": 3.8075150885845936e-05, "loss": 0.54, "step": 7450 }, { "epoch": 0.723850184358626, "grad_norm": 1.6212132399265973, "learning_rate": 3.805892660133688e-05, "loss": 0.6016, "step": 7460 }, { "epoch": 0.7248204929167475, "grad_norm": 1.7707895342391216, "learning_rate": 3.804270231682783e-05, "loss": 0.5724, "step": 7470 }, { "epoch": 0.725790801474869, "grad_norm": 1.4832754109529855, "learning_rate": 3.802647803231877e-05, "loss": 0.5554, "step": 7480 }, { "epoch": 0.7267611100329905, "grad_norm": 1.4890517160303556, "learning_rate": 3.801025374780972e-05, "loss": 0.6065, "step": 7490 }, { "epoch": 0.727731418591112, "grad_norm": 2.047151127095432, "learning_rate": 3.799402946330067e-05, "loss": 0.6299, "step": 7500 }, { "epoch": 0.727731418591112, "eval_loss": 0.6740881204605103, "eval_runtime": 2470.6314, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 7500 }, { "epoch": 0.7287017271492334, "grad_norm": 1.6234544208509818, "learning_rate": 3.7977805178791614e-05, "loss": 0.6013, "step": 7510 }, { "epoch": 0.729672035707355, "grad_norm": 1.4452320512475898, "learning_rate": 3.7961580894282564e-05, "loss": 0.5479, "step": 7520 }, { "epoch": 0.7306423442654765, "grad_norm": 1.2290296761164277, "learning_rate": 3.794535660977351e-05, "loss": 0.5471, "step": 7530 }, { "epoch": 0.7316126528235979, "grad_norm": 1.567003648013224, "learning_rate": 3.792913232526446e-05, "loss": 0.6092, "step": 7540 }, { "epoch": 0.7325829613817194, "grad_norm": 1.6011455152210408, "learning_rate": 3.7912908040755406e-05, "loss": 0.629, "step": 7550 }, { "epoch": 0.7335532699398408, "grad_norm": 2.1218593111989192, "learning_rate": 3.7896683756246356e-05, "loss": 0.5773, "step": 7560 }, { "epoch": 0.7345235784979623, "grad_norm": 1.8323104555880425, "learning_rate": 3.78804594717373e-05, "loss": 0.618, "step": 7570 }, { "epoch": 0.7354938870560839, "grad_norm": 1.4152516707860787, "learning_rate": 3.786423518722825e-05, "loss": 0.6062, "step": 7580 }, { "epoch": 0.7364641956142053, "grad_norm": 1.6284692342846199, "learning_rate": 3.784801090271919e-05, "loss": 0.5671, "step": 7590 }, { "epoch": 0.7374345041723268, "grad_norm": 1.426043120608999, "learning_rate": 3.783178661821014e-05, "loss": 0.5972, "step": 7600 }, { "epoch": 0.7384048127304483, "grad_norm": 1.893183882199419, "learning_rate": 3.7815562333701085e-05, "loss": 0.6193, "step": 7610 }, { "epoch": 0.7393751212885697, "grad_norm": 1.9982181406611617, "learning_rate": 3.7799338049192034e-05, "loss": 0.5532, "step": 7620 }, { "epoch": 0.7403454298466913, "grad_norm": 1.8557864601006913, "learning_rate": 3.7783113764682984e-05, "loss": 0.5777, "step": 7630 }, { "epoch": 0.7413157384048127, "grad_norm": 1.8775040642513798, "learning_rate": 3.776688948017393e-05, "loss": 0.6268, "step": 7640 }, { "epoch": 0.7422860469629342, "grad_norm": 1.758574155734976, "learning_rate": 3.775066519566488e-05, "loss": 0.5662, "step": 7650 }, { "epoch": 0.7432563555210557, "grad_norm": 2.194684585403217, "learning_rate": 3.773444091115582e-05, "loss": 0.6628, "step": 7660 }, { "epoch": 0.7442266640791771, "grad_norm": 1.3673983887882022, "learning_rate": 3.771821662664677e-05, "loss": 0.6061, "step": 7670 }, { "epoch": 0.7451969726372987, "grad_norm": 1.2724099570692131, "learning_rate": 3.770199234213771e-05, "loss": 0.6337, "step": 7680 }, { "epoch": 0.7461672811954202, "grad_norm": 1.6823171665935568, "learning_rate": 3.768576805762866e-05, "loss": 0.5528, "step": 7690 }, { "epoch": 0.7471375897535416, "grad_norm": 1.9633396330523931, "learning_rate": 3.7669543773119605e-05, "loss": 0.5808, "step": 7700 }, { "epoch": 0.7481078983116631, "grad_norm": 1.7782111547784767, "learning_rate": 3.7653319488610555e-05, "loss": 0.5767, "step": 7710 }, { "epoch": 0.7490782068697845, "grad_norm": 2.3110549543336205, "learning_rate": 3.7637095204101505e-05, "loss": 0.6062, "step": 7720 }, { "epoch": 0.7500485154279061, "grad_norm": 1.6939270574842946, "learning_rate": 3.762087091959245e-05, "loss": 0.6225, "step": 7730 }, { "epoch": 0.7510188239860276, "grad_norm": 1.4820399391564874, "learning_rate": 3.76046466350834e-05, "loss": 0.6047, "step": 7740 }, { "epoch": 0.751989132544149, "grad_norm": 1.5672189262381615, "learning_rate": 3.758842235057434e-05, "loss": 0.5879, "step": 7750 }, { "epoch": 0.7529594411022705, "grad_norm": 1.6472834744978406, "learning_rate": 3.757219806606529e-05, "loss": 0.5895, "step": 7760 }, { "epoch": 0.7539297496603921, "grad_norm": 1.1201299864920753, "learning_rate": 3.7555973781556234e-05, "loss": 0.5801, "step": 7770 }, { "epoch": 0.7549000582185135, "grad_norm": 2.1519831187317107, "learning_rate": 3.753974949704718e-05, "loss": 0.5806, "step": 7780 }, { "epoch": 0.755870366776635, "grad_norm": 1.6558911877127844, "learning_rate": 3.752352521253813e-05, "loss": 0.6243, "step": 7790 }, { "epoch": 0.7568406753347564, "grad_norm": 1.56259604588081, "learning_rate": 3.7507300928029076e-05, "loss": 0.5611, "step": 7800 }, { "epoch": 0.7578109838928779, "grad_norm": 1.7228227668603993, "learning_rate": 3.7491076643520026e-05, "loss": 0.5517, "step": 7810 }, { "epoch": 0.7587812924509995, "grad_norm": 1.4902669117767953, "learning_rate": 3.747485235901097e-05, "loss": 0.538, "step": 7820 }, { "epoch": 0.7597516010091209, "grad_norm": 1.782224663749718, "learning_rate": 3.745862807450192e-05, "loss": 0.5753, "step": 7830 }, { "epoch": 0.7607219095672424, "grad_norm": 1.9997818457050736, "learning_rate": 3.744240378999286e-05, "loss": 0.6007, "step": 7840 }, { "epoch": 0.7616922181253639, "grad_norm": 1.4094657223176301, "learning_rate": 3.742617950548381e-05, "loss": 0.6338, "step": 7850 }, { "epoch": 0.7626625266834853, "grad_norm": 1.9288686398933608, "learning_rate": 3.7409955220974754e-05, "loss": 0.5906, "step": 7860 }, { "epoch": 0.7636328352416069, "grad_norm": 1.7390584642787001, "learning_rate": 3.7393730936465704e-05, "loss": 0.5738, "step": 7870 }, { "epoch": 0.7646031437997283, "grad_norm": 1.9438974814009289, "learning_rate": 3.7377506651956654e-05, "loss": 0.5121, "step": 7880 }, { "epoch": 0.7655734523578498, "grad_norm": 1.740068812702714, "learning_rate": 3.73612823674476e-05, "loss": 0.557, "step": 7890 }, { "epoch": 0.7665437609159713, "grad_norm": 1.7983844321630307, "learning_rate": 3.734505808293855e-05, "loss": 0.5748, "step": 7900 }, { "epoch": 0.7675140694740927, "grad_norm": 1.5464273991421298, "learning_rate": 3.732883379842949e-05, "loss": 0.547, "step": 7910 }, { "epoch": 0.7684843780322143, "grad_norm": 1.5357637162840818, "learning_rate": 3.731260951392044e-05, "loss": 0.5628, "step": 7920 }, { "epoch": 0.7694546865903358, "grad_norm": 1.5933643507985389, "learning_rate": 3.729638522941138e-05, "loss": 0.6377, "step": 7930 }, { "epoch": 0.7704249951484572, "grad_norm": 1.885102854313203, "learning_rate": 3.728016094490233e-05, "loss": 0.5542, "step": 7940 }, { "epoch": 0.7713953037065787, "grad_norm": 1.2646336337440816, "learning_rate": 3.7263936660393275e-05, "loss": 0.5909, "step": 7950 }, { "epoch": 0.7723656122647001, "grad_norm": 1.5489929838962764, "learning_rate": 3.7247712375884225e-05, "loss": 0.6417, "step": 7960 }, { "epoch": 0.7733359208228217, "grad_norm": 1.5642843810424312, "learning_rate": 3.7231488091375175e-05, "loss": 0.5739, "step": 7970 }, { "epoch": 0.7743062293809432, "grad_norm": 1.2994344306176584, "learning_rate": 3.721526380686612e-05, "loss": 0.6521, "step": 7980 }, { "epoch": 0.7752765379390646, "grad_norm": 1.7547352757346097, "learning_rate": 3.719903952235707e-05, "loss": 0.6055, "step": 7990 }, { "epoch": 0.7762468464971861, "grad_norm": 1.8680201530781706, "learning_rate": 3.718281523784801e-05, "loss": 0.5468, "step": 8000 }, { "epoch": 0.7762468464971861, "eval_loss": 0.6728888750076294, "eval_runtime": 2470.4642, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 8000 }, { "epoch": 0.7772171550553076, "grad_norm": 1.8353606131921394, "learning_rate": 3.716659095333896e-05, "loss": 0.6019, "step": 8010 }, { "epoch": 0.778187463613429, "grad_norm": 2.1165186332460624, "learning_rate": 3.71503666688299e-05, "loss": 0.6144, "step": 8020 }, { "epoch": 0.7791577721715506, "grad_norm": 1.2983357938097837, "learning_rate": 3.713414238432085e-05, "loss": 0.5765, "step": 8030 }, { "epoch": 0.780128080729672, "grad_norm": 1.6998652219779764, "learning_rate": 3.7117918099811796e-05, "loss": 0.5673, "step": 8040 }, { "epoch": 0.7810983892877935, "grad_norm": 1.6651384645995448, "learning_rate": 3.7101693815302746e-05, "loss": 0.5813, "step": 8050 }, { "epoch": 0.782068697845915, "grad_norm": 1.711156688730374, "learning_rate": 3.7085469530793696e-05, "loss": 0.5804, "step": 8060 }, { "epoch": 0.7830390064040365, "grad_norm": 1.8429084688690953, "learning_rate": 3.706924524628464e-05, "loss": 0.6076, "step": 8070 }, { "epoch": 0.784009314962158, "grad_norm": 2.029890621987729, "learning_rate": 3.705302096177559e-05, "loss": 0.6248, "step": 8080 }, { "epoch": 0.7849796235202795, "grad_norm": 2.075042801349425, "learning_rate": 3.703679667726653e-05, "loss": 0.5546, "step": 8090 }, { "epoch": 0.7859499320784009, "grad_norm": 1.9442644766970887, "learning_rate": 3.702057239275748e-05, "loss": 0.5754, "step": 8100 }, { "epoch": 0.7869202406365224, "grad_norm": 2.2898169839511167, "learning_rate": 3.7004348108248424e-05, "loss": 0.5479, "step": 8110 }, { "epoch": 0.7878905491946439, "grad_norm": 1.1971567816899984, "learning_rate": 3.6988123823739374e-05, "loss": 0.5841, "step": 8120 }, { "epoch": 0.7888608577527654, "grad_norm": 1.6483060799493618, "learning_rate": 3.697189953923032e-05, "loss": 0.5993, "step": 8130 }, { "epoch": 0.7898311663108869, "grad_norm": 2.1844480030037094, "learning_rate": 3.695567525472127e-05, "loss": 0.5639, "step": 8140 }, { "epoch": 0.7908014748690083, "grad_norm": 1.9208684488820418, "learning_rate": 3.6939450970212216e-05, "loss": 0.5548, "step": 8150 }, { "epoch": 0.7917717834271298, "grad_norm": 1.7298769058966599, "learning_rate": 3.692322668570316e-05, "loss": 0.6189, "step": 8160 }, { "epoch": 0.7927420919852514, "grad_norm": 2.043188779971351, "learning_rate": 3.690700240119411e-05, "loss": 0.6341, "step": 8170 }, { "epoch": 0.7937124005433728, "grad_norm": 1.4155988967270856, "learning_rate": 3.689077811668505e-05, "loss": 0.5803, "step": 8180 }, { "epoch": 0.7946827091014943, "grad_norm": 1.876027618508082, "learning_rate": 3.6874553832176e-05, "loss": 0.5469, "step": 8190 }, { "epoch": 0.7956530176596157, "grad_norm": 1.8526890916404788, "learning_rate": 3.6858329547666945e-05, "loss": 0.5692, "step": 8200 }, { "epoch": 0.7966233262177372, "grad_norm": 1.454859578114591, "learning_rate": 3.6842105263157895e-05, "loss": 0.5861, "step": 8210 }, { "epoch": 0.7975936347758588, "grad_norm": 1.6396970355230962, "learning_rate": 3.6825880978648844e-05, "loss": 0.5864, "step": 8220 }, { "epoch": 0.7985639433339802, "grad_norm": 2.035219184211101, "learning_rate": 3.680965669413979e-05, "loss": 0.5761, "step": 8230 }, { "epoch": 0.7995342518921017, "grad_norm": 1.4771264704118183, "learning_rate": 3.679343240963074e-05, "loss": 0.5901, "step": 8240 }, { "epoch": 0.8005045604502232, "grad_norm": 1.366640822196709, "learning_rate": 3.677720812512168e-05, "loss": 0.5976, "step": 8250 }, { "epoch": 0.8014748690083446, "grad_norm": 1.6158534552665804, "learning_rate": 3.676098384061263e-05, "loss": 0.5676, "step": 8260 }, { "epoch": 0.8024451775664662, "grad_norm": 1.906704908176893, "learning_rate": 3.674475955610357e-05, "loss": 0.6014, "step": 8270 }, { "epoch": 0.8034154861245876, "grad_norm": 1.72316238215741, "learning_rate": 3.672853527159452e-05, "loss": 0.5341, "step": 8280 }, { "epoch": 0.8043857946827091, "grad_norm": 1.6228489307580705, "learning_rate": 3.6712310987085466e-05, "loss": 0.6174, "step": 8290 }, { "epoch": 0.8053561032408306, "grad_norm": 1.6343652614101287, "learning_rate": 3.6696086702576416e-05, "loss": 0.5628, "step": 8300 }, { "epoch": 0.806326411798952, "grad_norm": 1.4875484854044787, "learning_rate": 3.6679862418067365e-05, "loss": 0.5559, "step": 8310 }, { "epoch": 0.8072967203570736, "grad_norm": 1.7360867259906014, "learning_rate": 3.6663638133558315e-05, "loss": 0.6122, "step": 8320 }, { "epoch": 0.8082670289151951, "grad_norm": 1.3760924603009739, "learning_rate": 3.664741384904926e-05, "loss": 0.5369, "step": 8330 }, { "epoch": 0.8092373374733165, "grad_norm": 2.1326347379272033, "learning_rate": 3.663118956454021e-05, "loss": 0.6026, "step": 8340 }, { "epoch": 0.810207646031438, "grad_norm": 1.8445530020541556, "learning_rate": 3.661496528003116e-05, "loss": 0.5924, "step": 8350 }, { "epoch": 0.8111779545895594, "grad_norm": 2.1620839328051153, "learning_rate": 3.65987409955221e-05, "loss": 0.5556, "step": 8360 }, { "epoch": 0.812148263147681, "grad_norm": 1.91334025126679, "learning_rate": 3.658251671101305e-05, "loss": 0.5787, "step": 8370 }, { "epoch": 0.8131185717058025, "grad_norm": 1.6197394739350461, "learning_rate": 3.656629242650399e-05, "loss": 0.5364, "step": 8380 }, { "epoch": 0.8140888802639239, "grad_norm": 1.7496855520727714, "learning_rate": 3.655006814199494e-05, "loss": 0.604, "step": 8390 }, { "epoch": 0.8150591888220454, "grad_norm": 1.9072833025355298, "learning_rate": 3.6533843857485886e-05, "loss": 0.6127, "step": 8400 }, { "epoch": 0.816029497380167, "grad_norm": 1.693494475477864, "learning_rate": 3.6517619572976836e-05, "loss": 0.5708, "step": 8410 }, { "epoch": 0.8169998059382884, "grad_norm": 1.694549481689151, "learning_rate": 3.6501395288467786e-05, "loss": 0.544, "step": 8420 }, { "epoch": 0.8179701144964099, "grad_norm": 1.77015793724743, "learning_rate": 3.648517100395873e-05, "loss": 0.5973, "step": 8430 }, { "epoch": 0.8189404230545313, "grad_norm": 1.738049084314835, "learning_rate": 3.646894671944968e-05, "loss": 0.5912, "step": 8440 }, { "epoch": 0.8199107316126528, "grad_norm": 1.647267019109384, "learning_rate": 3.645272243494062e-05, "loss": 0.6346, "step": 8450 }, { "epoch": 0.8208810401707743, "grad_norm": 1.5240403667548303, "learning_rate": 3.643649815043157e-05, "loss": 0.6057, "step": 8460 }, { "epoch": 0.8218513487288958, "grad_norm": 1.9527577485511822, "learning_rate": 3.6420273865922514e-05, "loss": 0.5615, "step": 8470 }, { "epoch": 0.8228216572870173, "grad_norm": 1.937531929878238, "learning_rate": 3.6404049581413464e-05, "loss": 0.5625, "step": 8480 }, { "epoch": 0.8237919658451388, "grad_norm": 1.7329982779170732, "learning_rate": 3.638782529690441e-05, "loss": 0.613, "step": 8490 }, { "epoch": 0.8247622744032602, "grad_norm": 1.880843027338221, "learning_rate": 3.637160101239536e-05, "loss": 0.5869, "step": 8500 }, { "epoch": 0.8247622744032602, "eval_loss": 0.6701070070266724, "eval_runtime": 2471.9751, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 8500 }, { "epoch": 0.8257325829613817, "grad_norm": 1.2566468431839666, "learning_rate": 3.6355376727886307e-05, "loss": 0.5433, "step": 8510 }, { "epoch": 0.8267028915195032, "grad_norm": 1.8546070309693423, "learning_rate": 3.633915244337725e-05, "loss": 0.5669, "step": 8520 }, { "epoch": 0.8276732000776247, "grad_norm": 1.268249173975751, "learning_rate": 3.63229281588682e-05, "loss": 0.5895, "step": 8530 }, { "epoch": 0.8286435086357462, "grad_norm": 1.7652424871651577, "learning_rate": 3.630670387435914e-05, "loss": 0.5597, "step": 8540 }, { "epoch": 0.8296138171938676, "grad_norm": 1.5480894985375517, "learning_rate": 3.629047958985009e-05, "loss": 0.5486, "step": 8550 }, { "epoch": 0.8305841257519891, "grad_norm": 1.750414852898824, "learning_rate": 3.6274255305341035e-05, "loss": 0.584, "step": 8560 }, { "epoch": 0.8315544343101107, "grad_norm": 1.7102419104322222, "learning_rate": 3.6258031020831985e-05, "loss": 0.5668, "step": 8570 }, { "epoch": 0.8325247428682321, "grad_norm": 1.5528462400057457, "learning_rate": 3.624180673632293e-05, "loss": 0.5896, "step": 8580 }, { "epoch": 0.8334950514263536, "grad_norm": 1.4801001203063542, "learning_rate": 3.622558245181388e-05, "loss": 0.5997, "step": 8590 }, { "epoch": 0.834465359984475, "grad_norm": 1.391432340520497, "learning_rate": 3.620935816730483e-05, "loss": 0.5795, "step": 8600 }, { "epoch": 0.8354356685425965, "grad_norm": 1.4373875976848383, "learning_rate": 3.619313388279577e-05, "loss": 0.5769, "step": 8610 }, { "epoch": 0.8364059771007181, "grad_norm": 1.8411413590399401, "learning_rate": 3.617690959828672e-05, "loss": 0.5626, "step": 8620 }, { "epoch": 0.8373762856588395, "grad_norm": 1.6990663233303336, "learning_rate": 3.616068531377766e-05, "loss": 0.6057, "step": 8630 }, { "epoch": 0.838346594216961, "grad_norm": 1.9737814861957261, "learning_rate": 3.614446102926861e-05, "loss": 0.564, "step": 8640 }, { "epoch": 0.8393169027750825, "grad_norm": 1.6163853470752478, "learning_rate": 3.6128236744759556e-05, "loss": 0.5429, "step": 8650 }, { "epoch": 0.8402872113332039, "grad_norm": 1.5911941575436375, "learning_rate": 3.6112012460250506e-05, "loss": 0.6203, "step": 8660 }, { "epoch": 0.8412575198913255, "grad_norm": 2.101242238346334, "learning_rate": 3.609578817574145e-05, "loss": 0.5263, "step": 8670 }, { "epoch": 0.8422278284494469, "grad_norm": 1.671876183792032, "learning_rate": 3.60795638912324e-05, "loss": 0.5699, "step": 8680 }, { "epoch": 0.8431981370075684, "grad_norm": 1.7729527985320428, "learning_rate": 3.606333960672335e-05, "loss": 0.5525, "step": 8690 }, { "epoch": 0.8441684455656899, "grad_norm": 1.7198189021299524, "learning_rate": 3.604711532221429e-05, "loss": 0.5789, "step": 8700 }, { "epoch": 0.8451387541238113, "grad_norm": 1.754160746828024, "learning_rate": 3.603089103770524e-05, "loss": 0.5126, "step": 8710 }, { "epoch": 0.8461090626819329, "grad_norm": 1.8621321453517432, "learning_rate": 3.6014666753196184e-05, "loss": 0.593, "step": 8720 }, { "epoch": 0.8470793712400544, "grad_norm": 1.4506925576933114, "learning_rate": 3.5998442468687134e-05, "loss": 0.5489, "step": 8730 }, { "epoch": 0.8480496797981758, "grad_norm": 1.9426709583278723, "learning_rate": 3.598221818417808e-05, "loss": 0.498, "step": 8740 }, { "epoch": 0.8490199883562973, "grad_norm": 1.7346244341640757, "learning_rate": 3.5965993899669026e-05, "loss": 0.5583, "step": 8750 }, { "epoch": 0.8499902969144187, "grad_norm": 1.926381813212122, "learning_rate": 3.594976961515997e-05, "loss": 0.5931, "step": 8760 }, { "epoch": 0.8509606054725403, "grad_norm": 1.9797943448639521, "learning_rate": 3.593354533065092e-05, "loss": 0.5773, "step": 8770 }, { "epoch": 0.8519309140306618, "grad_norm": 2.243092907709638, "learning_rate": 3.591732104614187e-05, "loss": 0.5862, "step": 8780 }, { "epoch": 0.8529012225887832, "grad_norm": 1.6686507496640315, "learning_rate": 3.590109676163281e-05, "loss": 0.5861, "step": 8790 }, { "epoch": 0.8538715311469047, "grad_norm": 1.767743857045935, "learning_rate": 3.588487247712376e-05, "loss": 0.6008, "step": 8800 }, { "epoch": 0.8548418397050263, "grad_norm": 1.789840488197533, "learning_rate": 3.5868648192614705e-05, "loss": 0.5578, "step": 8810 }, { "epoch": 0.8558121482631477, "grad_norm": 1.826043320699609, "learning_rate": 3.5852423908105655e-05, "loss": 0.5568, "step": 8820 }, { "epoch": 0.8567824568212692, "grad_norm": 1.4847524529624125, "learning_rate": 3.58361996235966e-05, "loss": 0.5984, "step": 8830 }, { "epoch": 0.8577527653793906, "grad_norm": 1.5302240137475795, "learning_rate": 3.581997533908755e-05, "loss": 0.5619, "step": 8840 }, { "epoch": 0.8587230739375121, "grad_norm": 1.6025870419513641, "learning_rate": 3.58037510545785e-05, "loss": 0.601, "step": 8850 }, { "epoch": 0.8596933824956337, "grad_norm": 1.8930896480563768, "learning_rate": 3.578752677006944e-05, "loss": 0.5935, "step": 8860 }, { "epoch": 0.8606636910537551, "grad_norm": 1.6413896209556986, "learning_rate": 3.577130248556039e-05, "loss": 0.6019, "step": 8870 }, { "epoch": 0.8616339996118766, "grad_norm": 1.6030981794189743, "learning_rate": 3.575507820105133e-05, "loss": 0.5761, "step": 8880 }, { "epoch": 0.8626043081699981, "grad_norm": 1.4798866694040977, "learning_rate": 3.573885391654228e-05, "loss": 0.5561, "step": 8890 }, { "epoch": 0.8635746167281195, "grad_norm": 1.5790323051766768, "learning_rate": 3.5722629632033226e-05, "loss": 0.5986, "step": 8900 }, { "epoch": 0.864544925286241, "grad_norm": 1.8409121182605548, "learning_rate": 3.5706405347524175e-05, "loss": 0.5878, "step": 8910 }, { "epoch": 0.8655152338443625, "grad_norm": 1.4956048954650922, "learning_rate": 3.569018106301512e-05, "loss": 0.5548, "step": 8920 }, { "epoch": 0.866485542402484, "grad_norm": 1.6969221289264929, "learning_rate": 3.567395677850607e-05, "loss": 0.5937, "step": 8930 }, { "epoch": 0.8674558509606055, "grad_norm": 1.8986285895813184, "learning_rate": 3.565773249399702e-05, "loss": 0.5742, "step": 8940 }, { "epoch": 0.8684261595187269, "grad_norm": 1.5434045122281053, "learning_rate": 3.564150820948796e-05, "loss": 0.561, "step": 8950 }, { "epoch": 0.8693964680768484, "grad_norm": 1.69523497453855, "learning_rate": 3.562528392497891e-05, "loss": 0.5386, "step": 8960 }, { "epoch": 0.87036677663497, "grad_norm": 1.382817285428295, "learning_rate": 3.5609059640469854e-05, "loss": 0.555, "step": 8970 }, { "epoch": 0.8713370851930914, "grad_norm": 1.7354010988203836, "learning_rate": 3.5592835355960803e-05, "loss": 0.5963, "step": 8980 }, { "epoch": 0.8723073937512129, "grad_norm": 1.9761497522561469, "learning_rate": 3.5576611071451746e-05, "loss": 0.5859, "step": 8990 }, { "epoch": 0.8732777023093343, "grad_norm": 1.6264062899083487, "learning_rate": 3.5560386786942696e-05, "loss": 0.5596, "step": 9000 }, { "epoch": 0.8732777023093343, "eval_loss": 0.664995551109314, "eval_runtime": 2472.6792, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 9000 }, { "epoch": 0.8742480108674558, "grad_norm": 1.7886226308798359, "learning_rate": 3.554416250243364e-05, "loss": 0.5484, "step": 9010 }, { "epoch": 0.8752183194255774, "grad_norm": 1.5768009291424698, "learning_rate": 3.552793821792459e-05, "loss": 0.5511, "step": 9020 }, { "epoch": 0.8761886279836988, "grad_norm": 1.5632964481761753, "learning_rate": 3.551171393341554e-05, "loss": 0.5173, "step": 9030 }, { "epoch": 0.8771589365418203, "grad_norm": 2.0563484372782264, "learning_rate": 3.549548964890648e-05, "loss": 0.5586, "step": 9040 }, { "epoch": 0.8781292450999418, "grad_norm": 1.9786114879020535, "learning_rate": 3.547926536439743e-05, "loss": 0.5767, "step": 9050 }, { "epoch": 0.8790995536580632, "grad_norm": 1.2143656544779613, "learning_rate": 3.5463041079888375e-05, "loss": 0.5469, "step": 9060 }, { "epoch": 0.8800698622161848, "grad_norm": 1.696976506773579, "learning_rate": 3.5446816795379324e-05, "loss": 0.5819, "step": 9070 }, { "epoch": 0.8810401707743062, "grad_norm": 1.8002454640393335, "learning_rate": 3.543059251087027e-05, "loss": 0.5947, "step": 9080 }, { "epoch": 0.8820104793324277, "grad_norm": 1.826272101277873, "learning_rate": 3.541436822636122e-05, "loss": 0.632, "step": 9090 }, { "epoch": 0.8829807878905492, "grad_norm": 1.8609691197855038, "learning_rate": 3.539814394185217e-05, "loss": 0.649, "step": 9100 }, { "epoch": 0.8839510964486706, "grad_norm": 1.7165708438826977, "learning_rate": 3.5381919657343117e-05, "loss": 0.5746, "step": 9110 }, { "epoch": 0.8849214050067922, "grad_norm": 2.0390486982174454, "learning_rate": 3.536569537283406e-05, "loss": 0.5834, "step": 9120 }, { "epoch": 0.8858917135649137, "grad_norm": 1.3107579501174733, "learning_rate": 3.534947108832501e-05, "loss": 0.5605, "step": 9130 }, { "epoch": 0.8868620221230351, "grad_norm": 1.7750714401302556, "learning_rate": 3.533324680381596e-05, "loss": 0.5778, "step": 9140 }, { "epoch": 0.8878323306811566, "grad_norm": 1.9002696712889475, "learning_rate": 3.53170225193069e-05, "loss": 0.5928, "step": 9150 }, { "epoch": 0.888802639239278, "grad_norm": 1.602034501274119, "learning_rate": 3.530079823479785e-05, "loss": 0.5941, "step": 9160 }, { "epoch": 0.8897729477973996, "grad_norm": 1.5338224625276715, "learning_rate": 3.5284573950288795e-05, "loss": 0.5678, "step": 9170 }, { "epoch": 0.8907432563555211, "grad_norm": 1.6656771631689804, "learning_rate": 3.5268349665779745e-05, "loss": 0.5621, "step": 9180 }, { "epoch": 0.8917135649136425, "grad_norm": 1.6734086655118368, "learning_rate": 3.525212538127069e-05, "loss": 0.5836, "step": 9190 }, { "epoch": 0.892683873471764, "grad_norm": 1.7321566684016627, "learning_rate": 3.523590109676164e-05, "loss": 0.6049, "step": 9200 }, { "epoch": 0.8936541820298856, "grad_norm": 1.8468990646584322, "learning_rate": 3.521967681225258e-05, "loss": 0.5265, "step": 9210 }, { "epoch": 0.894624490588007, "grad_norm": 1.8260102068484456, "learning_rate": 3.520345252774353e-05, "loss": 0.6003, "step": 9220 }, { "epoch": 0.8955947991461285, "grad_norm": 1.5049364704966368, "learning_rate": 3.518722824323448e-05, "loss": 0.6362, "step": 9230 }, { "epoch": 0.8965651077042499, "grad_norm": 1.8567290478944525, "learning_rate": 3.517100395872542e-05, "loss": 0.5666, "step": 9240 }, { "epoch": 0.8975354162623714, "grad_norm": 1.83554173576104, "learning_rate": 3.515477967421637e-05, "loss": 0.525, "step": 9250 }, { "epoch": 0.898505724820493, "grad_norm": 1.756580666977339, "learning_rate": 3.5138555389707316e-05, "loss": 0.5297, "step": 9260 }, { "epoch": 0.8994760333786144, "grad_norm": 1.3709870685247603, "learning_rate": 3.5122331105198265e-05, "loss": 0.5529, "step": 9270 }, { "epoch": 0.9004463419367359, "grad_norm": 1.910129116074302, "learning_rate": 3.510610682068921e-05, "loss": 0.5575, "step": 9280 }, { "epoch": 0.9014166504948574, "grad_norm": 1.457636418218358, "learning_rate": 3.508988253618016e-05, "loss": 0.6219, "step": 9290 }, { "epoch": 0.9023869590529788, "grad_norm": 1.7400018831329018, "learning_rate": 3.50736582516711e-05, "loss": 0.597, "step": 9300 }, { "epoch": 0.9033572676111004, "grad_norm": 1.7711972448898297, "learning_rate": 3.505743396716205e-05, "loss": 0.5953, "step": 9310 }, { "epoch": 0.9043275761692218, "grad_norm": 1.3127391028092956, "learning_rate": 3.5041209682653e-05, "loss": 0.5771, "step": 9320 }, { "epoch": 0.9052978847273433, "grad_norm": 1.707083390377331, "learning_rate": 3.5024985398143944e-05, "loss": 0.5941, "step": 9330 }, { "epoch": 0.9062681932854648, "grad_norm": 2.3395444029475425, "learning_rate": 3.5008761113634894e-05, "loss": 0.5249, "step": 9340 }, { "epoch": 0.9072385018435862, "grad_norm": 2.15851320522422, "learning_rate": 3.4992536829125837e-05, "loss": 0.5754, "step": 9350 }, { "epoch": 0.9082088104017078, "grad_norm": 1.566455099295683, "learning_rate": 3.4976312544616786e-05, "loss": 0.5982, "step": 9360 }, { "epoch": 0.9091791189598293, "grad_norm": 1.6447262177778976, "learning_rate": 3.496008826010773e-05, "loss": 0.5819, "step": 9370 }, { "epoch": 0.9101494275179507, "grad_norm": 1.9281752331049982, "learning_rate": 3.494386397559868e-05, "loss": 0.5789, "step": 9380 }, { "epoch": 0.9111197360760722, "grad_norm": 1.9237214019216216, "learning_rate": 3.492763969108962e-05, "loss": 0.5773, "step": 9390 }, { "epoch": 0.9120900446341936, "grad_norm": 1.5099620370471458, "learning_rate": 3.491141540658057e-05, "loss": 0.5954, "step": 9400 }, { "epoch": 0.9130603531923152, "grad_norm": 1.861214942110368, "learning_rate": 3.489519112207152e-05, "loss": 0.5956, "step": 9410 }, { "epoch": 0.9140306617504367, "grad_norm": 1.780788157697634, "learning_rate": 3.4878966837562465e-05, "loss": 0.5492, "step": 9420 }, { "epoch": 0.9150009703085581, "grad_norm": 1.7391155816623414, "learning_rate": 3.4862742553053414e-05, "loss": 0.5984, "step": 9430 }, { "epoch": 0.9159712788666796, "grad_norm": 2.1229665459752125, "learning_rate": 3.484651826854436e-05, "loss": 0.5924, "step": 9440 }, { "epoch": 0.9169415874248011, "grad_norm": 2.1306613515387625, "learning_rate": 3.483029398403531e-05, "loss": 0.5148, "step": 9450 }, { "epoch": 0.9179118959829226, "grad_norm": 1.312566076533743, "learning_rate": 3.481406969952625e-05, "loss": 0.5919, "step": 9460 }, { "epoch": 0.9188822045410441, "grad_norm": 1.4886826509371758, "learning_rate": 3.47978454150172e-05, "loss": 0.5853, "step": 9470 }, { "epoch": 0.9198525130991655, "grad_norm": 2.0143652338550098, "learning_rate": 3.478162113050815e-05, "loss": 0.5689, "step": 9480 }, { "epoch": 0.920822821657287, "grad_norm": 1.1368414203141723, "learning_rate": 3.476539684599909e-05, "loss": 0.5546, "step": 9490 }, { "epoch": 0.9217931302154085, "grad_norm": 1.6341362563091877, "learning_rate": 3.474917256149004e-05, "loss": 0.5481, "step": 9500 }, { "epoch": 0.9217931302154085, "eval_loss": 0.6622401475906372, "eval_runtime": 2474.1087, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 9500 }, { "epoch": 0.92276343877353, "grad_norm": 1.8501035490125046, "learning_rate": 3.4732948276980985e-05, "loss": 0.519, "step": 9510 }, { "epoch": 0.9237337473316515, "grad_norm": 1.788403250607821, "learning_rate": 3.4716723992471935e-05, "loss": 0.5934, "step": 9520 }, { "epoch": 0.924704055889773, "grad_norm": 2.0089841746536212, "learning_rate": 3.470049970796288e-05, "loss": 0.5463, "step": 9530 }, { "epoch": 0.9256743644478944, "grad_norm": 1.4797021959834935, "learning_rate": 3.468427542345383e-05, "loss": 0.538, "step": 9540 }, { "epoch": 0.9266446730060159, "grad_norm": 1.5018994978955122, "learning_rate": 3.466805113894477e-05, "loss": 0.5859, "step": 9550 }, { "epoch": 0.9276149815641374, "grad_norm": 1.6445575492561615, "learning_rate": 3.465182685443572e-05, "loss": 0.5335, "step": 9560 }, { "epoch": 0.9285852901222589, "grad_norm": 1.910218513281919, "learning_rate": 3.463560256992667e-05, "loss": 0.5284, "step": 9570 }, { "epoch": 0.9295555986803804, "grad_norm": 1.5061058308088753, "learning_rate": 3.4619378285417614e-05, "loss": 0.5332, "step": 9580 }, { "epoch": 0.9305259072385018, "grad_norm": 1.5869569364806828, "learning_rate": 3.460315400090856e-05, "loss": 0.5591, "step": 9590 }, { "epoch": 0.9314962157966233, "grad_norm": 1.709001551959916, "learning_rate": 3.4586929716399506e-05, "loss": 0.5499, "step": 9600 }, { "epoch": 0.9324665243547449, "grad_norm": 1.5648665735772118, "learning_rate": 3.4570705431890456e-05, "loss": 0.5677, "step": 9610 }, { "epoch": 0.9334368329128663, "grad_norm": 2.297106138114182, "learning_rate": 3.45544811473814e-05, "loss": 0.5687, "step": 9620 }, { "epoch": 0.9344071414709878, "grad_norm": 1.940344115414216, "learning_rate": 3.453825686287235e-05, "loss": 0.5614, "step": 9630 }, { "epoch": 0.9353774500291092, "grad_norm": 2.2931746226047336, "learning_rate": 3.452203257836329e-05, "loss": 0.5888, "step": 9640 }, { "epoch": 0.9363477585872307, "grad_norm": 1.6726396768074983, "learning_rate": 3.450580829385424e-05, "loss": 0.555, "step": 9650 }, { "epoch": 0.9373180671453523, "grad_norm": 1.5245738077800575, "learning_rate": 3.448958400934519e-05, "loss": 0.5464, "step": 9660 }, { "epoch": 0.9382883757034737, "grad_norm": 1.3863585051832457, "learning_rate": 3.4473359724836134e-05, "loss": 0.5033, "step": 9670 }, { "epoch": 0.9392586842615952, "grad_norm": 1.2261387961871664, "learning_rate": 3.4457135440327084e-05, "loss": 0.5229, "step": 9680 }, { "epoch": 0.9402289928197167, "grad_norm": 1.8933274253957586, "learning_rate": 3.444091115581803e-05, "loss": 0.552, "step": 9690 }, { "epoch": 0.9411993013778381, "grad_norm": 1.7235797326078635, "learning_rate": 3.442468687130898e-05, "loss": 0.5899, "step": 9700 }, { "epoch": 0.9421696099359597, "grad_norm": 1.428965938918239, "learning_rate": 3.440846258679992e-05, "loss": 0.5733, "step": 9710 }, { "epoch": 0.9431399184940811, "grad_norm": 1.4190853376920558, "learning_rate": 3.439223830229087e-05, "loss": 0.5762, "step": 9720 }, { "epoch": 0.9441102270522026, "grad_norm": 1.569528489090731, "learning_rate": 3.437601401778181e-05, "loss": 0.5535, "step": 9730 }, { "epoch": 0.9450805356103241, "grad_norm": 1.869896863596011, "learning_rate": 3.435978973327276e-05, "loss": 0.5651, "step": 9740 }, { "epoch": 0.9460508441684455, "grad_norm": 2.3801435462427785, "learning_rate": 3.434356544876371e-05, "loss": 0.5366, "step": 9750 }, { "epoch": 0.9470211527265671, "grad_norm": 1.7543924621581104, "learning_rate": 3.4327341164254655e-05, "loss": 0.5407, "step": 9760 }, { "epoch": 0.9479914612846886, "grad_norm": 1.6645160945117223, "learning_rate": 3.4311116879745605e-05, "loss": 0.5776, "step": 9770 }, { "epoch": 0.94896176984281, "grad_norm": 1.5226969850196896, "learning_rate": 3.429489259523655e-05, "loss": 0.5189, "step": 9780 }, { "epoch": 0.9499320784009315, "grad_norm": 1.7957341969322531, "learning_rate": 3.42786683107275e-05, "loss": 0.5771, "step": 9790 }, { "epoch": 0.9509023869590529, "grad_norm": 1.4922624955935235, "learning_rate": 3.426244402621844e-05, "loss": 0.5339, "step": 9800 }, { "epoch": 0.9518726955171745, "grad_norm": 1.9179842292383775, "learning_rate": 3.424621974170939e-05, "loss": 0.5408, "step": 9810 }, { "epoch": 0.952843004075296, "grad_norm": 1.8930907076270356, "learning_rate": 3.4229995457200333e-05, "loss": 0.5622, "step": 9820 }, { "epoch": 0.9538133126334174, "grad_norm": 1.781854001943581, "learning_rate": 3.421377117269128e-05, "loss": 0.5654, "step": 9830 }, { "epoch": 0.9547836211915389, "grad_norm": 1.9058249916201926, "learning_rate": 3.419754688818223e-05, "loss": 0.6089, "step": 9840 }, { "epoch": 0.9557539297496604, "grad_norm": 1.5813398061915347, "learning_rate": 3.4181322603673176e-05, "loss": 0.5444, "step": 9850 }, { "epoch": 0.9567242383077819, "grad_norm": 1.6480219585268394, "learning_rate": 3.4165098319164126e-05, "loss": 0.5215, "step": 9860 }, { "epoch": 0.9576945468659034, "grad_norm": 1.6302754198626406, "learning_rate": 3.414887403465507e-05, "loss": 0.5225, "step": 9870 }, { "epoch": 0.9586648554240248, "grad_norm": 1.9428020773502297, "learning_rate": 3.4132649750146025e-05, "loss": 0.5227, "step": 9880 }, { "epoch": 0.9596351639821463, "grad_norm": 2.1394044994306376, "learning_rate": 3.411642546563697e-05, "loss": 0.5227, "step": 9890 }, { "epoch": 0.9606054725402678, "grad_norm": 1.4958041091695313, "learning_rate": 3.410020118112792e-05, "loss": 0.5536, "step": 9900 }, { "epoch": 0.9615757810983893, "grad_norm": 1.65783511931855, "learning_rate": 3.408397689661886e-05, "loss": 0.6116, "step": 9910 }, { "epoch": 0.9625460896565108, "grad_norm": 1.2687727005359897, "learning_rate": 3.406775261210981e-05, "loss": 0.582, "step": 9920 }, { "epoch": 0.9635163982146323, "grad_norm": 1.642309856430725, "learning_rate": 3.4051528327600754e-05, "loss": 0.5156, "step": 9930 }, { "epoch": 0.9644867067727537, "grad_norm": 1.5540151182331825, "learning_rate": 3.4035304043091704e-05, "loss": 0.514, "step": 9940 }, { "epoch": 0.9654570153308752, "grad_norm": 1.6334411063744383, "learning_rate": 3.401907975858265e-05, "loss": 0.585, "step": 9950 }, { "epoch": 0.9664273238889967, "grad_norm": 1.5262322683208274, "learning_rate": 3.4002855474073596e-05, "loss": 0.5493, "step": 9960 }, { "epoch": 0.9673976324471182, "grad_norm": 2.041216469634578, "learning_rate": 3.3986631189564546e-05, "loss": 0.5701, "step": 9970 }, { "epoch": 0.9683679410052397, "grad_norm": 1.6826626866998198, "learning_rate": 3.397040690505549e-05, "loss": 0.5522, "step": 9980 }, { "epoch": 0.9693382495633611, "grad_norm": 1.3784779820091337, "learning_rate": 3.395418262054644e-05, "loss": 0.5401, "step": 9990 }, { "epoch": 0.9703085581214826, "grad_norm": 2.164538127382688, "learning_rate": 3.393795833603738e-05, "loss": 0.5493, "step": 10000 }, { "epoch": 0.9703085581214826, "eval_loss": 0.6604536771774292, "eval_runtime": 2471.8323, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 10000 }, { "epoch": 0.9712788666796042, "grad_norm": 1.641886541364288, "learning_rate": 3.392173405152833e-05, "loss": 0.5592, "step": 10010 }, { "epoch": 0.9722491752377256, "grad_norm": 1.676600577801569, "learning_rate": 3.3905509767019275e-05, "loss": 0.5675, "step": 10020 }, { "epoch": 0.9732194837958471, "grad_norm": 1.575522175535914, "learning_rate": 3.3889285482510224e-05, "loss": 0.5564, "step": 10030 }, { "epoch": 0.9741897923539685, "grad_norm": 1.4256611852011571, "learning_rate": 3.3873061198001174e-05, "loss": 0.5654, "step": 10040 }, { "epoch": 0.97516010091209, "grad_norm": 1.9766555201121603, "learning_rate": 3.385683691349212e-05, "loss": 0.5853, "step": 10050 }, { "epoch": 0.9761304094702116, "grad_norm": 1.4257493175991105, "learning_rate": 3.384061262898307e-05, "loss": 0.602, "step": 10060 }, { "epoch": 0.977100718028333, "grad_norm": 1.7942732122646, "learning_rate": 3.382438834447401e-05, "loss": 0.5083, "step": 10070 }, { "epoch": 0.9780710265864545, "grad_norm": 1.970290799031593, "learning_rate": 3.380816405996496e-05, "loss": 0.5353, "step": 10080 }, { "epoch": 0.979041335144576, "grad_norm": 2.177029103778447, "learning_rate": 3.37919397754559e-05, "loss": 0.5364, "step": 10090 }, { "epoch": 0.9800116437026974, "grad_norm": 1.6319237609838204, "learning_rate": 3.377571549094685e-05, "loss": 0.5498, "step": 10100 }, { "epoch": 0.980981952260819, "grad_norm": 1.9418003047270103, "learning_rate": 3.3759491206437795e-05, "loss": 0.6225, "step": 10110 }, { "epoch": 0.9819522608189404, "grad_norm": 1.5233382574211756, "learning_rate": 3.3743266921928745e-05, "loss": 0.5488, "step": 10120 }, { "epoch": 0.9829225693770619, "grad_norm": 1.7034150218696569, "learning_rate": 3.3727042637419695e-05, "loss": 0.5912, "step": 10130 }, { "epoch": 0.9838928779351834, "grad_norm": 1.9808892841805323, "learning_rate": 3.371081835291064e-05, "loss": 0.5657, "step": 10140 }, { "epoch": 0.9848631864933048, "grad_norm": 1.9429830219605533, "learning_rate": 3.369459406840159e-05, "loss": 0.563, "step": 10150 }, { "epoch": 0.9858334950514264, "grad_norm": 1.7823608925927663, "learning_rate": 3.367836978389253e-05, "loss": 0.5315, "step": 10160 }, { "epoch": 0.9868038036095479, "grad_norm": 1.6317868773777158, "learning_rate": 3.366214549938348e-05, "loss": 0.5663, "step": 10170 }, { "epoch": 0.9877741121676693, "grad_norm": 1.9950884655573202, "learning_rate": 3.3645921214874424e-05, "loss": 0.5704, "step": 10180 }, { "epoch": 0.9887444207257908, "grad_norm": 1.6320762630806733, "learning_rate": 3.362969693036537e-05, "loss": 0.5746, "step": 10190 }, { "epoch": 0.9897147292839122, "grad_norm": 1.685041060441873, "learning_rate": 3.361347264585632e-05, "loss": 0.5917, "step": 10200 }, { "epoch": 0.9906850378420338, "grad_norm": 1.7530117501023248, "learning_rate": 3.3597248361347266e-05, "loss": 0.5365, "step": 10210 }, { "epoch": 0.9916553464001553, "grad_norm": 1.9196791146989973, "learning_rate": 3.3581024076838216e-05, "loss": 0.5795, "step": 10220 }, { "epoch": 0.9926256549582767, "grad_norm": 1.9870737501998446, "learning_rate": 3.356479979232916e-05, "loss": 0.6544, "step": 10230 }, { "epoch": 0.9935959635163982, "grad_norm": 1.491102870770748, "learning_rate": 3.354857550782011e-05, "loss": 0.5083, "step": 10240 }, { "epoch": 0.9945662720745198, "grad_norm": 1.5900809359608934, "learning_rate": 3.353235122331105e-05, "loss": 0.5693, "step": 10250 }, { "epoch": 0.9955365806326412, "grad_norm": 1.6635095304395011, "learning_rate": 3.3516126938802e-05, "loss": 0.5744, "step": 10260 }, { "epoch": 0.9965068891907627, "grad_norm": 2.0049230325912957, "learning_rate": 3.3499902654292944e-05, "loss": 0.5461, "step": 10270 }, { "epoch": 0.9974771977488841, "grad_norm": 1.50147581851131, "learning_rate": 3.3483678369783894e-05, "loss": 0.5803, "step": 10280 }, { "epoch": 0.9984475063070056, "grad_norm": 1.8064338359868768, "learning_rate": 3.3467454085274844e-05, "loss": 0.506, "step": 10290 }, { "epoch": 0.9994178148651272, "grad_norm": 2.052231872791701, "learning_rate": 3.345122980076579e-05, "loss": 0.5752, "step": 10300 }, { "epoch": 1.0003881234232486, "grad_norm": 1.7418958102493116, "learning_rate": 3.343500551625674e-05, "loss": 0.6082, "step": 10310 }, { "epoch": 1.00135843198137, "grad_norm": 1.8975860607542987, "learning_rate": 3.341878123174768e-05, "loss": 0.5524, "step": 10320 }, { "epoch": 1.0023287405394916, "grad_norm": 1.7076807320811012, "learning_rate": 3.340255694723863e-05, "loss": 0.6075, "step": 10330 }, { "epoch": 1.003299049097613, "grad_norm": 1.4300451205657956, "learning_rate": 3.338633266272957e-05, "loss": 0.5326, "step": 10340 }, { "epoch": 1.0042693576557344, "grad_norm": 2.0682797223020777, "learning_rate": 3.337010837822052e-05, "loss": 0.5484, "step": 10350 }, { "epoch": 1.005239666213856, "grad_norm": 1.536692096590345, "learning_rate": 3.3353884093711465e-05, "loss": 0.546, "step": 10360 }, { "epoch": 1.0062099747719775, "grad_norm": 1.7848861749442593, "learning_rate": 3.3337659809202415e-05, "loss": 0.5763, "step": 10370 }, { "epoch": 1.007180283330099, "grad_norm": 1.4925347515246201, "learning_rate": 3.3321435524693365e-05, "loss": 0.5005, "step": 10380 }, { "epoch": 1.0081505918882205, "grad_norm": 1.4245048085109102, "learning_rate": 3.330521124018431e-05, "loss": 0.5352, "step": 10390 }, { "epoch": 1.009120900446342, "grad_norm": 1.6528542775713155, "learning_rate": 3.328898695567526e-05, "loss": 0.5794, "step": 10400 }, { "epoch": 1.0100912090044634, "grad_norm": 1.7391134864648952, "learning_rate": 3.32727626711662e-05, "loss": 0.6133, "step": 10410 }, { "epoch": 1.011061517562585, "grad_norm": 1.8040782879083466, "learning_rate": 3.325653838665715e-05, "loss": 0.5124, "step": 10420 }, { "epoch": 1.0120318261207064, "grad_norm": 1.730132568689756, "learning_rate": 3.324031410214809e-05, "loss": 0.4803, "step": 10430 }, { "epoch": 1.0130021346788278, "grad_norm": 1.8540575748734034, "learning_rate": 3.322408981763904e-05, "loss": 0.5188, "step": 10440 }, { "epoch": 1.0139724432369492, "grad_norm": 1.7683659307789739, "learning_rate": 3.3207865533129986e-05, "loss": 0.5341, "step": 10450 }, { "epoch": 1.0149427517950709, "grad_norm": 1.7745239019667731, "learning_rate": 3.3191641248620936e-05, "loss": 0.5858, "step": 10460 }, { "epoch": 1.0159130603531923, "grad_norm": 1.5664467690196044, "learning_rate": 3.3175416964111886e-05, "loss": 0.6587, "step": 10470 }, { "epoch": 1.0168833689113137, "grad_norm": 1.92833985335436, "learning_rate": 3.315919267960283e-05, "loss": 0.5532, "step": 10480 }, { "epoch": 1.0178536774694353, "grad_norm": 1.646924763771934, "learning_rate": 3.314296839509378e-05, "loss": 0.5638, "step": 10490 }, { "epoch": 1.0188239860275567, "grad_norm": 1.477844051399051, "learning_rate": 3.312674411058472e-05, "loss": 0.5254, "step": 10500 }, { "epoch": 1.0188239860275567, "eval_loss": 0.6569487452507019, "eval_runtime": 2472.3859, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 10500 }, { "epoch": 1.0197942945856782, "grad_norm": 1.7732534556121307, "learning_rate": 3.311051982607567e-05, "loss": 0.5098, "step": 10510 }, { "epoch": 1.0207646031437998, "grad_norm": 1.8592910812190746, "learning_rate": 3.3094295541566614e-05, "loss": 0.5836, "step": 10520 }, { "epoch": 1.0217349117019212, "grad_norm": 1.7186074858012894, "learning_rate": 3.3078071257057564e-05, "loss": 0.5445, "step": 10530 }, { "epoch": 1.0227052202600426, "grad_norm": 2.043653022917622, "learning_rate": 3.306184697254851e-05, "loss": 0.5627, "step": 10540 }, { "epoch": 1.0236755288181643, "grad_norm": 1.7054473164163695, "learning_rate": 3.304562268803946e-05, "loss": 0.6426, "step": 10550 }, { "epoch": 1.0246458373762857, "grad_norm": 1.5920397197878904, "learning_rate": 3.3029398403530406e-05, "loss": 0.5542, "step": 10560 }, { "epoch": 1.025616145934407, "grad_norm": 1.6885876369635333, "learning_rate": 3.301317411902135e-05, "loss": 0.5399, "step": 10570 }, { "epoch": 1.0265864544925287, "grad_norm": 1.712761359550596, "learning_rate": 3.29969498345123e-05, "loss": 0.5609, "step": 10580 }, { "epoch": 1.0275567630506501, "grad_norm": 1.849118205679511, "learning_rate": 3.298072555000324e-05, "loss": 0.5817, "step": 10590 }, { "epoch": 1.0285270716087715, "grad_norm": 1.7919320749353798, "learning_rate": 3.296450126549419e-05, "loss": 0.562, "step": 10600 }, { "epoch": 1.0294973801668932, "grad_norm": 1.9867833035377922, "learning_rate": 3.2948276980985135e-05, "loss": 0.5266, "step": 10610 }, { "epoch": 1.0304676887250146, "grad_norm": 1.4844422926334193, "learning_rate": 3.2932052696476085e-05, "loss": 0.5989, "step": 10620 }, { "epoch": 1.031437997283136, "grad_norm": 1.902397349373601, "learning_rate": 3.2915828411967034e-05, "loss": 0.4869, "step": 10630 }, { "epoch": 1.0324083058412574, "grad_norm": 1.595731562090853, "learning_rate": 3.289960412745798e-05, "loss": 0.5014, "step": 10640 }, { "epoch": 1.033378614399379, "grad_norm": 1.641533553754366, "learning_rate": 3.288337984294893e-05, "loss": 0.5847, "step": 10650 }, { "epoch": 1.0343489229575005, "grad_norm": 1.6793350674365874, "learning_rate": 3.286715555843988e-05, "loss": 0.5433, "step": 10660 }, { "epoch": 1.0353192315156219, "grad_norm": 1.7886777094595252, "learning_rate": 3.285093127393083e-05, "loss": 0.5936, "step": 10670 }, { "epoch": 1.0362895400737435, "grad_norm": 1.7628097025479255, "learning_rate": 3.283470698942177e-05, "loss": 0.5176, "step": 10680 }, { "epoch": 1.037259848631865, "grad_norm": 1.7117894064748884, "learning_rate": 3.281848270491272e-05, "loss": 0.5166, "step": 10690 }, { "epoch": 1.0382301571899863, "grad_norm": 1.8595449574787446, "learning_rate": 3.280225842040366e-05, "loss": 0.5511, "step": 10700 }, { "epoch": 1.039200465748108, "grad_norm": 1.5833347089731324, "learning_rate": 3.278603413589461e-05, "loss": 0.5808, "step": 10710 }, { "epoch": 1.0401707743062294, "grad_norm": 1.5168916043350162, "learning_rate": 3.2769809851385555e-05, "loss": 0.5295, "step": 10720 }, { "epoch": 1.0411410828643508, "grad_norm": 1.9583788202686643, "learning_rate": 3.2753585566876505e-05, "loss": 0.5859, "step": 10730 }, { "epoch": 1.0421113914224724, "grad_norm": 1.6490551146953607, "learning_rate": 3.273736128236745e-05, "loss": 0.5988, "step": 10740 }, { "epoch": 1.0430816999805939, "grad_norm": 2.2945646534561734, "learning_rate": 3.27211369978584e-05, "loss": 0.5481, "step": 10750 }, { "epoch": 1.0440520085387153, "grad_norm": 1.6823894633457166, "learning_rate": 3.270491271334935e-05, "loss": 0.566, "step": 10760 }, { "epoch": 1.0450223170968367, "grad_norm": 1.9698263256075523, "learning_rate": 3.268868842884029e-05, "loss": 0.582, "step": 10770 }, { "epoch": 1.0459926256549583, "grad_norm": 1.6140673075775909, "learning_rate": 3.267246414433124e-05, "loss": 0.4717, "step": 10780 }, { "epoch": 1.0469629342130797, "grad_norm": 1.916378749987929, "learning_rate": 3.265623985982218e-05, "loss": 0.5244, "step": 10790 }, { "epoch": 1.0479332427712011, "grad_norm": 1.9482061690949193, "learning_rate": 3.264001557531313e-05, "loss": 0.5973, "step": 10800 }, { "epoch": 1.0489035513293228, "grad_norm": 2.142491022649256, "learning_rate": 3.2623791290804076e-05, "loss": 0.5317, "step": 10810 }, { "epoch": 1.0498738598874442, "grad_norm": 1.706373160041174, "learning_rate": 3.2607567006295026e-05, "loss": 0.5136, "step": 10820 }, { "epoch": 1.0508441684455656, "grad_norm": 1.8959475848770957, "learning_rate": 3.2591342721785976e-05, "loss": 0.5655, "step": 10830 }, { "epoch": 1.0518144770036872, "grad_norm": 1.8921424824926663, "learning_rate": 3.257511843727692e-05, "loss": 0.5164, "step": 10840 }, { "epoch": 1.0527847855618087, "grad_norm": 2.0234555341020664, "learning_rate": 3.255889415276787e-05, "loss": 0.5335, "step": 10850 }, { "epoch": 1.05375509411993, "grad_norm": 2.0614821240519947, "learning_rate": 3.254266986825881e-05, "loss": 0.5022, "step": 10860 }, { "epoch": 1.0547254026780517, "grad_norm": 1.7142183353166638, "learning_rate": 3.252644558374976e-05, "loss": 0.5452, "step": 10870 }, { "epoch": 1.0556957112361731, "grad_norm": 1.8265079239939517, "learning_rate": 3.2510221299240704e-05, "loss": 0.5302, "step": 10880 }, { "epoch": 1.0566660197942945, "grad_norm": 1.7953764101996608, "learning_rate": 3.2493997014731654e-05, "loss": 0.5418, "step": 10890 }, { "epoch": 1.0576363283524162, "grad_norm": 1.8762589557600082, "learning_rate": 3.24777727302226e-05, "loss": 0.5743, "step": 10900 }, { "epoch": 1.0586066369105376, "grad_norm": 1.4926319666858994, "learning_rate": 3.246154844571355e-05, "loss": 0.5291, "step": 10910 }, { "epoch": 1.059576945468659, "grad_norm": 1.9278698099129787, "learning_rate": 3.2445324161204497e-05, "loss": 0.5409, "step": 10920 }, { "epoch": 1.0605472540267806, "grad_norm": 1.8973044877439684, "learning_rate": 3.242909987669544e-05, "loss": 0.5466, "step": 10930 }, { "epoch": 1.061517562584902, "grad_norm": 2.174563710599117, "learning_rate": 3.241287559218639e-05, "loss": 0.516, "step": 10940 }, { "epoch": 1.0624878711430235, "grad_norm": 2.1559495316330786, "learning_rate": 3.239665130767733e-05, "loss": 0.5951, "step": 10950 }, { "epoch": 1.0634581797011449, "grad_norm": 2.224958308598168, "learning_rate": 3.238042702316828e-05, "loss": 0.5216, "step": 10960 }, { "epoch": 1.0644284882592665, "grad_norm": 1.5388616251824314, "learning_rate": 3.2364202738659225e-05, "loss": 0.551, "step": 10970 }, { "epoch": 1.065398796817388, "grad_norm": 1.8563910144302744, "learning_rate": 3.2347978454150175e-05, "loss": 0.5325, "step": 10980 }, { "epoch": 1.0663691053755093, "grad_norm": 1.249386464606806, "learning_rate": 3.233175416964112e-05, "loss": 0.5223, "step": 10990 }, { "epoch": 1.067339413933631, "grad_norm": 1.7788595735717652, "learning_rate": 3.231552988513207e-05, "loss": 0.5353, "step": 11000 }, { "epoch": 1.067339413933631, "eval_loss": 0.6541542410850525, "eval_runtime": 2467.7272, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 11000 }, { "epoch": 1.0683097224917524, "grad_norm": 2.093397486341294, "learning_rate": 3.229930560062302e-05, "loss": 0.5373, "step": 11010 }, { "epoch": 1.0692800310498738, "grad_norm": 1.6292327190910305, "learning_rate": 3.228308131611396e-05, "loss": 0.5417, "step": 11020 }, { "epoch": 1.0702503396079954, "grad_norm": 1.6545192933610855, "learning_rate": 3.226685703160491e-05, "loss": 0.5508, "step": 11030 }, { "epoch": 1.0712206481661168, "grad_norm": 1.8906889663679678, "learning_rate": 3.225063274709585e-05, "loss": 0.5268, "step": 11040 }, { "epoch": 1.0721909567242383, "grad_norm": 1.392471317144995, "learning_rate": 3.22344084625868e-05, "loss": 0.5226, "step": 11050 }, { "epoch": 1.0731612652823599, "grad_norm": 1.9171337047147172, "learning_rate": 3.2218184178077746e-05, "loss": 0.5212, "step": 11060 }, { "epoch": 1.0741315738404813, "grad_norm": 1.4857874915171814, "learning_rate": 3.2201959893568696e-05, "loss": 0.5335, "step": 11070 }, { "epoch": 1.0751018823986027, "grad_norm": 1.7538116790409093, "learning_rate": 3.218573560905964e-05, "loss": 0.542, "step": 11080 }, { "epoch": 1.0760721909567241, "grad_norm": 1.6310285074576256, "learning_rate": 3.216951132455059e-05, "loss": 0.5114, "step": 11090 }, { "epoch": 1.0770424995148458, "grad_norm": 1.6921457584921478, "learning_rate": 3.215328704004154e-05, "loss": 0.4924, "step": 11100 }, { "epoch": 1.0780128080729672, "grad_norm": 1.7745242342830565, "learning_rate": 3.213706275553248e-05, "loss": 0.5567, "step": 11110 }, { "epoch": 1.0789831166310886, "grad_norm": 1.69496703066604, "learning_rate": 3.212083847102343e-05, "loss": 0.5237, "step": 11120 }, { "epoch": 1.0799534251892102, "grad_norm": 1.7029551999976154, "learning_rate": 3.2104614186514374e-05, "loss": 0.5023, "step": 11130 }, { "epoch": 1.0809237337473316, "grad_norm": 1.71464384875071, "learning_rate": 3.2088389902005324e-05, "loss": 0.5111, "step": 11140 }, { "epoch": 1.081894042305453, "grad_norm": 1.745797205638, "learning_rate": 3.207216561749627e-05, "loss": 0.5259, "step": 11150 }, { "epoch": 1.0828643508635747, "grad_norm": 1.8756703389755518, "learning_rate": 3.2055941332987216e-05, "loss": 0.5135, "step": 11160 }, { "epoch": 1.083834659421696, "grad_norm": 1.7773801380695633, "learning_rate": 3.203971704847816e-05, "loss": 0.606, "step": 11170 }, { "epoch": 1.0848049679798175, "grad_norm": 1.9149914262933831, "learning_rate": 3.202349276396911e-05, "loss": 0.504, "step": 11180 }, { "epoch": 1.0857752765379391, "grad_norm": 1.5792891192161234, "learning_rate": 3.200726847946006e-05, "loss": 0.4899, "step": 11190 }, { "epoch": 1.0867455850960606, "grad_norm": 1.3849113367423667, "learning_rate": 3.1991044194951e-05, "loss": 0.5435, "step": 11200 }, { "epoch": 1.087715893654182, "grad_norm": 1.836894182259094, "learning_rate": 3.197481991044195e-05, "loss": 0.5417, "step": 11210 }, { "epoch": 1.0886862022123036, "grad_norm": 2.5716819476436723, "learning_rate": 3.1958595625932895e-05, "loss": 0.5487, "step": 11220 }, { "epoch": 1.089656510770425, "grad_norm": 1.7054225586630418, "learning_rate": 3.1942371341423845e-05, "loss": 0.5366, "step": 11230 }, { "epoch": 1.0906268193285464, "grad_norm": 1.9132021835433188, "learning_rate": 3.192614705691479e-05, "loss": 0.5212, "step": 11240 }, { "epoch": 1.091597127886668, "grad_norm": 1.5687725973259348, "learning_rate": 3.190992277240574e-05, "loss": 0.5013, "step": 11250 }, { "epoch": 1.0925674364447895, "grad_norm": 1.7910741705827617, "learning_rate": 3.189369848789669e-05, "loss": 0.5359, "step": 11260 }, { "epoch": 1.093537745002911, "grad_norm": 1.4238221915445326, "learning_rate": 3.187747420338763e-05, "loss": 0.5729, "step": 11270 }, { "epoch": 1.0945080535610323, "grad_norm": 1.8958035882900321, "learning_rate": 3.186124991887858e-05, "loss": 0.5349, "step": 11280 }, { "epoch": 1.095478362119154, "grad_norm": 1.5644842460614365, "learning_rate": 3.184502563436952e-05, "loss": 0.5275, "step": 11290 }, { "epoch": 1.0964486706772754, "grad_norm": 1.6966656078568068, "learning_rate": 3.182880134986047e-05, "loss": 0.5714, "step": 11300 }, { "epoch": 1.0974189792353968, "grad_norm": 1.5845176878742038, "learning_rate": 3.1812577065351416e-05, "loss": 0.547, "step": 11310 }, { "epoch": 1.0983892877935184, "grad_norm": 1.9638352416110092, "learning_rate": 3.1796352780842365e-05, "loss": 0.5371, "step": 11320 }, { "epoch": 1.0993595963516398, "grad_norm": 1.3333867282862815, "learning_rate": 3.178012849633331e-05, "loss": 0.5558, "step": 11330 }, { "epoch": 1.1003299049097612, "grad_norm": 2.028988375070847, "learning_rate": 3.176390421182426e-05, "loss": 0.5425, "step": 11340 }, { "epoch": 1.1013002134678829, "grad_norm": 2.0233727263417745, "learning_rate": 3.174767992731521e-05, "loss": 0.5128, "step": 11350 }, { "epoch": 1.1022705220260043, "grad_norm": 1.6415117808780524, "learning_rate": 3.173145564280615e-05, "loss": 0.476, "step": 11360 }, { "epoch": 1.1032408305841257, "grad_norm": 2.0488869521619972, "learning_rate": 3.17152313582971e-05, "loss": 0.5221, "step": 11370 }, { "epoch": 1.1042111391422473, "grad_norm": 2.11079167436828, "learning_rate": 3.1699007073788044e-05, "loss": 0.5566, "step": 11380 }, { "epoch": 1.1051814477003687, "grad_norm": 2.0421176306313398, "learning_rate": 3.1682782789278993e-05, "loss": 0.5061, "step": 11390 }, { "epoch": 1.1061517562584902, "grad_norm": 1.9992500086060474, "learning_rate": 3.1666558504769936e-05, "loss": 0.542, "step": 11400 }, { "epoch": 1.1071220648166116, "grad_norm": 1.872752246092324, "learning_rate": 3.1650334220260886e-05, "loss": 0.5602, "step": 11410 }, { "epoch": 1.1080923733747332, "grad_norm": 2.0723616338374646, "learning_rate": 3.163410993575183e-05, "loss": 0.5052, "step": 11420 }, { "epoch": 1.1090626819328546, "grad_norm": 1.7543258598505558, "learning_rate": 3.161788565124278e-05, "loss": 0.5951, "step": 11430 }, { "epoch": 1.110032990490976, "grad_norm": 1.741111503275317, "learning_rate": 3.160166136673373e-05, "loss": 0.5481, "step": 11440 }, { "epoch": 1.1110032990490977, "grad_norm": 1.3801479018015765, "learning_rate": 3.158543708222468e-05, "loss": 0.5104, "step": 11450 }, { "epoch": 1.111973607607219, "grad_norm": 1.9442170634773426, "learning_rate": 3.156921279771563e-05, "loss": 0.5129, "step": 11460 }, { "epoch": 1.1129439161653405, "grad_norm": 1.9331832081031561, "learning_rate": 3.155298851320657e-05, "loss": 0.5139, "step": 11470 }, { "epoch": 1.1139142247234621, "grad_norm": 1.5810959938815903, "learning_rate": 3.153676422869752e-05, "loss": 0.603, "step": 11480 }, { "epoch": 1.1148845332815835, "grad_norm": 1.7478463817804297, "learning_rate": 3.1520539944188464e-05, "loss": 0.5573, "step": 11490 }, { "epoch": 1.115854841839705, "grad_norm": 1.512993291404137, "learning_rate": 3.1504315659679414e-05, "loss": 0.4983, "step": 11500 }, { "epoch": 1.115854841839705, "eval_loss": 0.6542506814002991, "eval_runtime": 2471.8207, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 11500 }, { "epoch": 1.1168251503978266, "grad_norm": 1.9794498366078679, "learning_rate": 3.148809137517036e-05, "loss": 0.5397, "step": 11510 }, { "epoch": 1.117795458955948, "grad_norm": 1.6989190060539383, "learning_rate": 3.1471867090661307e-05, "loss": 0.4416, "step": 11520 }, { "epoch": 1.1187657675140694, "grad_norm": 1.8417193394056512, "learning_rate": 3.145564280615225e-05, "loss": 0.5761, "step": 11530 }, { "epoch": 1.119736076072191, "grad_norm": 1.8784314737872851, "learning_rate": 3.14394185216432e-05, "loss": 0.5179, "step": 11540 }, { "epoch": 1.1207063846303125, "grad_norm": 1.724566892104882, "learning_rate": 3.142319423713415e-05, "loss": 0.5145, "step": 11550 }, { "epoch": 1.1216766931884339, "grad_norm": 1.6271566863595974, "learning_rate": 3.140696995262509e-05, "loss": 0.5557, "step": 11560 }, { "epoch": 1.1226470017465555, "grad_norm": 1.632474409823301, "learning_rate": 3.139074566811604e-05, "loss": 0.5351, "step": 11570 }, { "epoch": 1.123617310304677, "grad_norm": 1.8885164133175332, "learning_rate": 3.1374521383606985e-05, "loss": 0.5066, "step": 11580 }, { "epoch": 1.1245876188627983, "grad_norm": 1.86325603833634, "learning_rate": 3.1358297099097935e-05, "loss": 0.5561, "step": 11590 }, { "epoch": 1.1255579274209198, "grad_norm": 1.8325010805036752, "learning_rate": 3.134207281458888e-05, "loss": 0.5518, "step": 11600 }, { "epoch": 1.1265282359790414, "grad_norm": 1.7732624170282982, "learning_rate": 3.132584853007983e-05, "loss": 0.5264, "step": 11610 }, { "epoch": 1.1274985445371628, "grad_norm": 1.6849511508626207, "learning_rate": 3.130962424557077e-05, "loss": 0.4938, "step": 11620 }, { "epoch": 1.1284688530952842, "grad_norm": 1.7088287322634965, "learning_rate": 3.129339996106172e-05, "loss": 0.5432, "step": 11630 }, { "epoch": 1.1294391616534059, "grad_norm": 2.4521787808904345, "learning_rate": 3.127717567655267e-05, "loss": 0.5222, "step": 11640 }, { "epoch": 1.1304094702115273, "grad_norm": 1.8568933343609308, "learning_rate": 3.126095139204361e-05, "loss": 0.5155, "step": 11650 }, { "epoch": 1.1313797787696487, "grad_norm": 1.6581222418410275, "learning_rate": 3.124472710753456e-05, "loss": 0.5039, "step": 11660 }, { "epoch": 1.1323500873277703, "grad_norm": 1.8783913354540864, "learning_rate": 3.1228502823025506e-05, "loss": 0.5083, "step": 11670 }, { "epoch": 1.1333203958858917, "grad_norm": 1.7156898303987873, "learning_rate": 3.1212278538516455e-05, "loss": 0.5184, "step": 11680 }, { "epoch": 1.1342907044440131, "grad_norm": 2.1185799396739093, "learning_rate": 3.11960542540074e-05, "loss": 0.5252, "step": 11690 }, { "epoch": 1.1352610130021348, "grad_norm": 1.2978327151961555, "learning_rate": 3.117982996949835e-05, "loss": 0.5548, "step": 11700 }, { "epoch": 1.1362313215602562, "grad_norm": 1.6276998380365333, "learning_rate": 3.116360568498929e-05, "loss": 0.5412, "step": 11710 }, { "epoch": 1.1372016301183776, "grad_norm": 1.5330498087853446, "learning_rate": 3.114738140048024e-05, "loss": 0.4811, "step": 11720 }, { "epoch": 1.138171938676499, "grad_norm": 1.545547307525802, "learning_rate": 3.113115711597119e-05, "loss": 0.4841, "step": 11730 }, { "epoch": 1.1391422472346207, "grad_norm": 1.8963148998956563, "learning_rate": 3.1114932831462134e-05, "loss": 0.5048, "step": 11740 }, { "epoch": 1.140112555792742, "grad_norm": 1.927144240645965, "learning_rate": 3.1098708546953084e-05, "loss": 0.5437, "step": 11750 }, { "epoch": 1.1410828643508635, "grad_norm": 1.322758693894623, "learning_rate": 3.1082484262444027e-05, "loss": 0.5824, "step": 11760 }, { "epoch": 1.1420531729089851, "grad_norm": 1.9210557222184033, "learning_rate": 3.1066259977934976e-05, "loss": 0.5208, "step": 11770 }, { "epoch": 1.1430234814671065, "grad_norm": 1.8904898905530507, "learning_rate": 3.105003569342592e-05, "loss": 0.5646, "step": 11780 }, { "epoch": 1.143993790025228, "grad_norm": 2.2135650729717478, "learning_rate": 3.103381140891687e-05, "loss": 0.5823, "step": 11790 }, { "epoch": 1.1449640985833496, "grad_norm": 1.8257267748894057, "learning_rate": 3.101758712440781e-05, "loss": 0.4957, "step": 11800 }, { "epoch": 1.145934407141471, "grad_norm": 1.8399910987928954, "learning_rate": 3.100136283989876e-05, "loss": 0.5589, "step": 11810 }, { "epoch": 1.1469047156995924, "grad_norm": 1.751016659591738, "learning_rate": 3.098513855538971e-05, "loss": 0.4636, "step": 11820 }, { "epoch": 1.147875024257714, "grad_norm": 2.3784962245655743, "learning_rate": 3.0968914270880655e-05, "loss": 0.558, "step": 11830 }, { "epoch": 1.1488453328158355, "grad_norm": 1.7453204083727232, "learning_rate": 3.0952689986371604e-05, "loss": 0.5152, "step": 11840 }, { "epoch": 1.1498156413739569, "grad_norm": 1.6493432346857455, "learning_rate": 3.093646570186255e-05, "loss": 0.5396, "step": 11850 }, { "epoch": 1.1507859499320783, "grad_norm": 1.4016634963337722, "learning_rate": 3.09202414173535e-05, "loss": 0.5392, "step": 11860 }, { "epoch": 1.1517562584902, "grad_norm": 1.5144392427573337, "learning_rate": 3.090401713284444e-05, "loss": 0.5625, "step": 11870 }, { "epoch": 1.1527265670483213, "grad_norm": 1.9911432006869803, "learning_rate": 3.088779284833539e-05, "loss": 0.5187, "step": 11880 }, { "epoch": 1.153696875606443, "grad_norm": 1.989669700307941, "learning_rate": 3.087156856382634e-05, "loss": 0.516, "step": 11890 }, { "epoch": 1.1546671841645644, "grad_norm": 2.0321346469591717, "learning_rate": 3.085534427931728e-05, "loss": 0.5028, "step": 11900 }, { "epoch": 1.1556374927226858, "grad_norm": 1.6419270337849394, "learning_rate": 3.083911999480823e-05, "loss": 0.5114, "step": 11910 }, { "epoch": 1.1566078012808072, "grad_norm": 1.8199861134594042, "learning_rate": 3.0822895710299175e-05, "loss": 0.4862, "step": 11920 }, { "epoch": 1.1575781098389288, "grad_norm": 2.092599945357918, "learning_rate": 3.0806671425790125e-05, "loss": 0.4862, "step": 11930 }, { "epoch": 1.1585484183970503, "grad_norm": 1.6390208725289623, "learning_rate": 3.079044714128107e-05, "loss": 0.5518, "step": 11940 }, { "epoch": 1.1595187269551717, "grad_norm": 1.4035901760825538, "learning_rate": 3.077422285677202e-05, "loss": 0.5195, "step": 11950 }, { "epoch": 1.1604890355132933, "grad_norm": 1.965204556535071, "learning_rate": 3.075799857226296e-05, "loss": 0.5198, "step": 11960 }, { "epoch": 1.1614593440714147, "grad_norm": 1.797317897425162, "learning_rate": 3.074177428775391e-05, "loss": 0.5223, "step": 11970 }, { "epoch": 1.1624296526295361, "grad_norm": 2.0562783101788713, "learning_rate": 3.072555000324486e-05, "loss": 0.5207, "step": 11980 }, { "epoch": 1.1633999611876578, "grad_norm": 1.674272970925229, "learning_rate": 3.0709325718735804e-05, "loss": 0.5034, "step": 11990 }, { "epoch": 1.1643702697457792, "grad_norm": 1.8613409955734859, "learning_rate": 3.069310143422675e-05, "loss": 0.4921, "step": 12000 }, { "epoch": 1.1643702697457792, "eval_loss": 0.6506599187850952, "eval_runtime": 2467.2044, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 12000 }, { "epoch": 1.1653405783039006, "grad_norm": 1.561512098916604, "learning_rate": 3.0676877149717696e-05, "loss": 0.5497, "step": 12010 }, { "epoch": 1.1663108868620222, "grad_norm": 1.865230398957055, "learning_rate": 3.0660652865208646e-05, "loss": 0.4925, "step": 12020 }, { "epoch": 1.1672811954201436, "grad_norm": 2.2512724716981434, "learning_rate": 3.064442858069959e-05, "loss": 0.5295, "step": 12030 }, { "epoch": 1.168251503978265, "grad_norm": 1.8002247664216207, "learning_rate": 3.062820429619054e-05, "loss": 0.4996, "step": 12040 }, { "epoch": 1.1692218125363865, "grad_norm": 2.1116351677440943, "learning_rate": 3.061198001168148e-05, "loss": 0.5191, "step": 12050 }, { "epoch": 1.170192121094508, "grad_norm": 1.6725748372150215, "learning_rate": 3.059575572717243e-05, "loss": 0.5336, "step": 12060 }, { "epoch": 1.1711624296526295, "grad_norm": 1.9025453675497637, "learning_rate": 3.057953144266338e-05, "loss": 0.5188, "step": 12070 }, { "epoch": 1.172132738210751, "grad_norm": 1.8069530215074574, "learning_rate": 3.0563307158154324e-05, "loss": 0.5495, "step": 12080 }, { "epoch": 1.1731030467688726, "grad_norm": 1.4756589453476456, "learning_rate": 3.0547082873645274e-05, "loss": 0.5248, "step": 12090 }, { "epoch": 1.174073355326994, "grad_norm": 1.894556403114157, "learning_rate": 3.053085858913622e-05, "loss": 0.5604, "step": 12100 }, { "epoch": 1.1750436638851154, "grad_norm": 1.7674666934401635, "learning_rate": 3.0514634304627167e-05, "loss": 0.5355, "step": 12110 }, { "epoch": 1.176013972443237, "grad_norm": 1.6165024582050291, "learning_rate": 3.0498410020118113e-05, "loss": 0.4861, "step": 12120 }, { "epoch": 1.1769842810013584, "grad_norm": 1.7586790770235476, "learning_rate": 3.048218573560906e-05, "loss": 0.5009, "step": 12130 }, { "epoch": 1.1779545895594798, "grad_norm": 1.424893034597034, "learning_rate": 3.0465961451100006e-05, "loss": 0.5441, "step": 12140 }, { "epoch": 1.1789248981176015, "grad_norm": 1.79883718769592, "learning_rate": 3.0449737166590952e-05, "loss": 0.5313, "step": 12150 }, { "epoch": 1.179895206675723, "grad_norm": 1.2166009796658896, "learning_rate": 3.04335128820819e-05, "loss": 0.5771, "step": 12160 }, { "epoch": 1.1808655152338443, "grad_norm": 1.8319881864090852, "learning_rate": 3.0417288597572845e-05, "loss": 0.5341, "step": 12170 }, { "epoch": 1.1818358237919657, "grad_norm": 2.000590117325487, "learning_rate": 3.040106431306379e-05, "loss": 0.5434, "step": 12180 }, { "epoch": 1.1828061323500874, "grad_norm": 1.7597266341378504, "learning_rate": 3.038484002855474e-05, "loss": 0.4637, "step": 12190 }, { "epoch": 1.1837764409082088, "grad_norm": 1.7547988130290746, "learning_rate": 3.0368615744045688e-05, "loss": 0.5238, "step": 12200 }, { "epoch": 1.1847467494663304, "grad_norm": 2.072598486347679, "learning_rate": 3.0352391459536634e-05, "loss": 0.5198, "step": 12210 }, { "epoch": 1.1857170580244518, "grad_norm": 1.801682666637873, "learning_rate": 3.033616717502758e-05, "loss": 0.5549, "step": 12220 }, { "epoch": 1.1866873665825732, "grad_norm": 1.7660188156759438, "learning_rate": 3.0319942890518534e-05, "loss": 0.5415, "step": 12230 }, { "epoch": 1.1876576751406946, "grad_norm": 1.531355321026242, "learning_rate": 3.030371860600948e-05, "loss": 0.5125, "step": 12240 }, { "epoch": 1.1886279836988163, "grad_norm": 1.6289177265084471, "learning_rate": 3.0287494321500426e-05, "loss": 0.5131, "step": 12250 }, { "epoch": 1.1895982922569377, "grad_norm": 2.3228651312748423, "learning_rate": 3.0271270036991373e-05, "loss": 0.4859, "step": 12260 }, { "epoch": 1.190568600815059, "grad_norm": 2.076037584027454, "learning_rate": 3.025504575248232e-05, "loss": 0.5234, "step": 12270 }, { "epoch": 1.1915389093731807, "grad_norm": 1.4111596404765325, "learning_rate": 3.0238821467973266e-05, "loss": 0.5226, "step": 12280 }, { "epoch": 1.1925092179313022, "grad_norm": 1.976122066748625, "learning_rate": 3.0222597183464212e-05, "loss": 0.5753, "step": 12290 }, { "epoch": 1.1934795264894236, "grad_norm": 1.6905440972221555, "learning_rate": 3.0206372898955158e-05, "loss": 0.5559, "step": 12300 }, { "epoch": 1.1944498350475452, "grad_norm": 1.549005200001293, "learning_rate": 3.0190148614446108e-05, "loss": 0.5199, "step": 12310 }, { "epoch": 1.1954201436056666, "grad_norm": 1.7886858738633538, "learning_rate": 3.0173924329937054e-05, "loss": 0.5524, "step": 12320 }, { "epoch": 1.196390452163788, "grad_norm": 2.226393124620313, "learning_rate": 3.0157700045428e-05, "loss": 0.511, "step": 12330 }, { "epoch": 1.1973607607219097, "grad_norm": 1.84134054205334, "learning_rate": 3.0141475760918947e-05, "loss": 0.5537, "step": 12340 }, { "epoch": 1.198331069280031, "grad_norm": 1.938676760265039, "learning_rate": 3.0125251476409894e-05, "loss": 0.5782, "step": 12350 }, { "epoch": 1.1993013778381525, "grad_norm": 1.6671922582631973, "learning_rate": 3.010902719190084e-05, "loss": 0.5146, "step": 12360 }, { "epoch": 1.200271686396274, "grad_norm": 2.09828561481085, "learning_rate": 3.0092802907391786e-05, "loss": 0.5292, "step": 12370 }, { "epoch": 1.2012419949543955, "grad_norm": 1.782738185716262, "learning_rate": 3.0076578622882733e-05, "loss": 0.5797, "step": 12380 }, { "epoch": 1.202212303512517, "grad_norm": 1.7386393975624728, "learning_rate": 3.006035433837368e-05, "loss": 0.5459, "step": 12390 }, { "epoch": 1.2031826120706384, "grad_norm": 1.834619533516341, "learning_rate": 3.004413005386463e-05, "loss": 0.5711, "step": 12400 }, { "epoch": 1.20415292062876, "grad_norm": 1.6555949649402653, "learning_rate": 3.0027905769355575e-05, "loss": 0.5447, "step": 12410 }, { "epoch": 1.2051232291868814, "grad_norm": 1.997714330931175, "learning_rate": 3.001168148484652e-05, "loss": 0.5001, "step": 12420 }, { "epoch": 1.2060935377450028, "grad_norm": 1.943566532939107, "learning_rate": 2.9995457200337468e-05, "loss": 0.5612, "step": 12430 }, { "epoch": 1.2070638463031245, "grad_norm": 2.295630424579193, "learning_rate": 2.9979232915828414e-05, "loss": 0.5443, "step": 12440 }, { "epoch": 1.2080341548612459, "grad_norm": 1.6087261613477206, "learning_rate": 2.996300863131936e-05, "loss": 0.4929, "step": 12450 }, { "epoch": 1.2090044634193673, "grad_norm": 1.7507917723489945, "learning_rate": 2.9946784346810307e-05, "loss": 0.4915, "step": 12460 }, { "epoch": 1.209974771977489, "grad_norm": 1.712263283374138, "learning_rate": 2.9930560062301254e-05, "loss": 0.5184, "step": 12470 }, { "epoch": 1.2109450805356103, "grad_norm": 2.1091363117437427, "learning_rate": 2.9914335777792203e-05, "loss": 0.5123, "step": 12480 }, { "epoch": 1.2119153890937318, "grad_norm": 2.404178772933857, "learning_rate": 2.989811149328315e-05, "loss": 0.4568, "step": 12490 }, { "epoch": 1.2128856976518532, "grad_norm": 1.6898907246159178, "learning_rate": 2.9881887208774096e-05, "loss": 0.5114, "step": 12500 }, { "epoch": 1.2128856976518532, "eval_loss": 0.6506454348564148, "eval_runtime": 2464.983, "eval_samples_per_second": 0.727, "eval_steps_per_second": 0.363, "step": 12500 }, { "epoch": 1.2138560062099748, "grad_norm": 1.5632796322726878, "learning_rate": 2.9865662924265042e-05, "loss": 0.5381, "step": 12510 }, { "epoch": 1.2148263147680962, "grad_norm": 2.2060623757238482, "learning_rate": 2.984943863975599e-05, "loss": 0.6286, "step": 12520 }, { "epoch": 1.2157966233262179, "grad_norm": 1.6896138867780373, "learning_rate": 2.9833214355246935e-05, "loss": 0.4824, "step": 12530 }, { "epoch": 1.2167669318843393, "grad_norm": 1.6264014630619223, "learning_rate": 2.981699007073788e-05, "loss": 0.5231, "step": 12540 }, { "epoch": 1.2177372404424607, "grad_norm": 2.288555955501704, "learning_rate": 2.9800765786228828e-05, "loss": 0.5415, "step": 12550 }, { "epoch": 1.218707549000582, "grad_norm": 1.6328806432164462, "learning_rate": 2.9784541501719774e-05, "loss": 0.5282, "step": 12560 }, { "epoch": 1.2196778575587037, "grad_norm": 1.9940506922760688, "learning_rate": 2.9768317217210724e-05, "loss": 0.5283, "step": 12570 }, { "epoch": 1.2206481661168251, "grad_norm": 1.9369189438911159, "learning_rate": 2.975209293270167e-05, "loss": 0.4722, "step": 12580 }, { "epoch": 1.2216184746749466, "grad_norm": 1.150006209597975, "learning_rate": 2.9735868648192617e-05, "loss": 0.5184, "step": 12590 }, { "epoch": 1.2225887832330682, "grad_norm": 1.7210167191672803, "learning_rate": 2.9719644363683563e-05, "loss": 0.5031, "step": 12600 }, { "epoch": 1.2235590917911896, "grad_norm": 1.9388550988757736, "learning_rate": 2.970342007917451e-05, "loss": 0.5099, "step": 12610 }, { "epoch": 1.224529400349311, "grad_norm": 1.6524197083323393, "learning_rate": 2.9687195794665456e-05, "loss": 0.4447, "step": 12620 }, { "epoch": 1.2254997089074326, "grad_norm": 1.89736479001966, "learning_rate": 2.9670971510156402e-05, "loss": 0.4983, "step": 12630 }, { "epoch": 1.226470017465554, "grad_norm": 1.895097527141105, "learning_rate": 2.965474722564735e-05, "loss": 0.5039, "step": 12640 }, { "epoch": 1.2274403260236755, "grad_norm": 1.976305435076919, "learning_rate": 2.96385229411383e-05, "loss": 0.5337, "step": 12650 }, { "epoch": 1.2284106345817971, "grad_norm": 1.6796293491451193, "learning_rate": 2.9622298656629245e-05, "loss": 0.506, "step": 12660 }, { "epoch": 1.2293809431399185, "grad_norm": 1.6431597372306554, "learning_rate": 2.960607437212019e-05, "loss": 0.5797, "step": 12670 }, { "epoch": 1.23035125169804, "grad_norm": 1.8020203988119472, "learning_rate": 2.9589850087611138e-05, "loss": 0.4957, "step": 12680 }, { "epoch": 1.2313215602561614, "grad_norm": 2.0575623836232935, "learning_rate": 2.9573625803102084e-05, "loss": 0.5125, "step": 12690 }, { "epoch": 1.232291868814283, "grad_norm": 2.0440836316269544, "learning_rate": 2.955740151859303e-05, "loss": 0.5186, "step": 12700 }, { "epoch": 1.2332621773724044, "grad_norm": 1.6684531467277435, "learning_rate": 2.9541177234083977e-05, "loss": 0.5348, "step": 12710 }, { "epoch": 1.2342324859305258, "grad_norm": 1.6949248831996988, "learning_rate": 2.9524952949574923e-05, "loss": 0.5484, "step": 12720 }, { "epoch": 1.2352027944886474, "grad_norm": 1.773704436016878, "learning_rate": 2.950872866506587e-05, "loss": 0.5033, "step": 12730 }, { "epoch": 1.2361731030467689, "grad_norm": 1.9246733654165475, "learning_rate": 2.949250438055682e-05, "loss": 0.5164, "step": 12740 }, { "epoch": 1.2371434116048903, "grad_norm": 1.7869787657786207, "learning_rate": 2.9476280096047766e-05, "loss": 0.549, "step": 12750 }, { "epoch": 1.238113720163012, "grad_norm": 1.5427226807712424, "learning_rate": 2.9460055811538712e-05, "loss": 0.5453, "step": 12760 }, { "epoch": 1.2390840287211333, "grad_norm": 2.1243893484204706, "learning_rate": 2.944383152702966e-05, "loss": 0.5232, "step": 12770 }, { "epoch": 1.2400543372792547, "grad_norm": 1.7624693076719502, "learning_rate": 2.9427607242520605e-05, "loss": 0.4767, "step": 12780 }, { "epoch": 1.2410246458373764, "grad_norm": 1.8048923369800416, "learning_rate": 2.941138295801155e-05, "loss": 0.5786, "step": 12790 }, { "epoch": 1.2419949543954978, "grad_norm": 1.557577350338282, "learning_rate": 2.9395158673502498e-05, "loss": 0.5358, "step": 12800 }, { "epoch": 1.2429652629536192, "grad_norm": 1.7796545697030264, "learning_rate": 2.9378934388993444e-05, "loss": 0.5447, "step": 12810 }, { "epoch": 1.2439355715117406, "grad_norm": 1.5233346835390529, "learning_rate": 2.936271010448439e-05, "loss": 0.5434, "step": 12820 }, { "epoch": 1.2449058800698622, "grad_norm": 1.8281296123454516, "learning_rate": 2.934648581997534e-05, "loss": 0.5311, "step": 12830 }, { "epoch": 1.2458761886279837, "grad_norm": 1.8761299295652716, "learning_rate": 2.9330261535466287e-05, "loss": 0.561, "step": 12840 }, { "epoch": 1.2468464971861053, "grad_norm": 1.571060091128229, "learning_rate": 2.9314037250957233e-05, "loss": 0.5293, "step": 12850 }, { "epoch": 1.2478168057442267, "grad_norm": 1.3554175730214915, "learning_rate": 2.929781296644818e-05, "loss": 0.5259, "step": 12860 }, { "epoch": 1.2487871143023481, "grad_norm": 1.5048611859450334, "learning_rate": 2.9281588681939126e-05, "loss": 0.5526, "step": 12870 }, { "epoch": 1.2497574228604695, "grad_norm": 1.944003477785508, "learning_rate": 2.9265364397430072e-05, "loss": 0.5319, "step": 12880 }, { "epoch": 1.2507277314185912, "grad_norm": 1.6137183858737645, "learning_rate": 2.924914011292102e-05, "loss": 0.5219, "step": 12890 }, { "epoch": 1.2516980399767126, "grad_norm": 1.8792239391042322, "learning_rate": 2.9232915828411965e-05, "loss": 0.4996, "step": 12900 }, { "epoch": 1.252668348534834, "grad_norm": 1.6048948758745463, "learning_rate": 2.9216691543902915e-05, "loss": 0.5094, "step": 12910 }, { "epoch": 1.2536386570929556, "grad_norm": 2.06874670039208, "learning_rate": 2.920046725939386e-05, "loss": 0.5412, "step": 12920 }, { "epoch": 1.254608965651077, "grad_norm": 1.8299215479076065, "learning_rate": 2.9184242974884808e-05, "loss": 0.5215, "step": 12930 }, { "epoch": 1.2555792742091985, "grad_norm": 1.768175531798484, "learning_rate": 2.9168018690375754e-05, "loss": 0.5466, "step": 12940 }, { "epoch": 1.2565495827673199, "grad_norm": 2.19878862821484, "learning_rate": 2.91517944058667e-05, "loss": 0.5728, "step": 12950 }, { "epoch": 1.2575198913254415, "grad_norm": 1.5410059750444967, "learning_rate": 2.9135570121357647e-05, "loss": 0.5248, "step": 12960 }, { "epoch": 1.258490199883563, "grad_norm": 1.9319057652262193, "learning_rate": 2.9119345836848593e-05, "loss": 0.4906, "step": 12970 }, { "epoch": 1.2594605084416846, "grad_norm": 1.4285564772766584, "learning_rate": 2.910312155233954e-05, "loss": 0.4976, "step": 12980 }, { "epoch": 1.260430816999806, "grad_norm": 1.9098028563216323, "learning_rate": 2.9086897267830486e-05, "loss": 0.4841, "step": 12990 }, { "epoch": 1.2614011255579274, "grad_norm": 2.155481185125366, "learning_rate": 2.9070672983321436e-05, "loss": 0.5486, "step": 13000 }, { "epoch": 1.2614011255579274, "eval_loss": 0.6476565003395081, "eval_runtime": 2469.4767, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 13000 }, { "epoch": 1.2623714341160488, "grad_norm": 1.7583302051951548, "learning_rate": 2.9054448698812385e-05, "loss": 0.5287, "step": 13010 }, { "epoch": 1.2633417426741704, "grad_norm": 2.1364231232324777, "learning_rate": 2.9038224414303332e-05, "loss": 0.5338, "step": 13020 }, { "epoch": 1.2643120512322918, "grad_norm": 1.858948662905276, "learning_rate": 2.902200012979428e-05, "loss": 0.5371, "step": 13030 }, { "epoch": 1.2652823597904135, "grad_norm": 1.434526008901497, "learning_rate": 2.9005775845285228e-05, "loss": 0.5199, "step": 13040 }, { "epoch": 1.266252668348535, "grad_norm": 1.8326954509581932, "learning_rate": 2.8989551560776174e-05, "loss": 0.4626, "step": 13050 }, { "epoch": 1.2672229769066563, "grad_norm": 1.929734623290484, "learning_rate": 2.897332727626712e-05, "loss": 0.5095, "step": 13060 }, { "epoch": 1.2681932854647777, "grad_norm": 1.7342388298669522, "learning_rate": 2.8957102991758067e-05, "loss": 0.5334, "step": 13070 }, { "epoch": 1.2691635940228994, "grad_norm": 1.8419675684147427, "learning_rate": 2.8940878707249013e-05, "loss": 0.5215, "step": 13080 }, { "epoch": 1.2701339025810208, "grad_norm": 2.0003719776284434, "learning_rate": 2.892465442273996e-05, "loss": 0.5291, "step": 13090 }, { "epoch": 1.2711042111391422, "grad_norm": 1.5672070400575155, "learning_rate": 2.8908430138230906e-05, "loss": 0.5302, "step": 13100 }, { "epoch": 1.2720745196972638, "grad_norm": 1.6329476161915306, "learning_rate": 2.8892205853721856e-05, "loss": 0.5004, "step": 13110 }, { "epoch": 1.2730448282553852, "grad_norm": 1.3430643020518607, "learning_rate": 2.8875981569212802e-05, "loss": 0.5586, "step": 13120 }, { "epoch": 1.2740151368135066, "grad_norm": 1.865116300464639, "learning_rate": 2.885975728470375e-05, "loss": 0.5176, "step": 13130 }, { "epoch": 1.274985445371628, "grad_norm": 1.9537429368278099, "learning_rate": 2.8843533000194695e-05, "loss": 0.5167, "step": 13140 }, { "epoch": 1.2759557539297497, "grad_norm": 2.059620522389882, "learning_rate": 2.882730871568564e-05, "loss": 0.4654, "step": 13150 }, { "epoch": 1.276926062487871, "grad_norm": 2.1139159674960317, "learning_rate": 2.8811084431176588e-05, "loss": 0.5365, "step": 13160 }, { "epoch": 1.2778963710459927, "grad_norm": 1.9675564766085183, "learning_rate": 2.8794860146667534e-05, "loss": 0.5567, "step": 13170 }, { "epoch": 1.2788666796041142, "grad_norm": 1.4714857260607748, "learning_rate": 2.877863586215848e-05, "loss": 0.5543, "step": 13180 }, { "epoch": 1.2798369881622356, "grad_norm": 1.997029571976202, "learning_rate": 2.8762411577649427e-05, "loss": 0.5349, "step": 13190 }, { "epoch": 1.280807296720357, "grad_norm": 1.7739211918572755, "learning_rate": 2.8746187293140377e-05, "loss": 0.4893, "step": 13200 }, { "epoch": 1.2817776052784786, "grad_norm": 1.7069871948143287, "learning_rate": 2.8729963008631323e-05, "loss": 0.4907, "step": 13210 }, { "epoch": 1.2827479138366, "grad_norm": 1.9981954512910625, "learning_rate": 2.871373872412227e-05, "loss": 0.5672, "step": 13220 }, { "epoch": 1.2837182223947214, "grad_norm": 1.6613620495892372, "learning_rate": 2.8697514439613216e-05, "loss": 0.541, "step": 13230 }, { "epoch": 1.284688530952843, "grad_norm": 2.3272590952107675, "learning_rate": 2.8681290155104162e-05, "loss": 0.5411, "step": 13240 }, { "epoch": 1.2856588395109645, "grad_norm": 2.12223916886103, "learning_rate": 2.866506587059511e-05, "loss": 0.4957, "step": 13250 }, { "epoch": 1.286629148069086, "grad_norm": 1.4853001948712596, "learning_rate": 2.8648841586086055e-05, "loss": 0.5432, "step": 13260 }, { "epoch": 1.2875994566272073, "grad_norm": 2.095035318310476, "learning_rate": 2.8632617301577e-05, "loss": 0.5466, "step": 13270 }, { "epoch": 1.288569765185329, "grad_norm": 1.8562546228217478, "learning_rate": 2.8616393017067948e-05, "loss": 0.5393, "step": 13280 }, { "epoch": 1.2895400737434504, "grad_norm": 1.960721539695441, "learning_rate": 2.8600168732558898e-05, "loss": 0.5737, "step": 13290 }, { "epoch": 1.290510382301572, "grad_norm": 1.8998278241439668, "learning_rate": 2.8583944448049844e-05, "loss": 0.4965, "step": 13300 }, { "epoch": 1.2914806908596934, "grad_norm": 1.648594607244266, "learning_rate": 2.856772016354079e-05, "loss": 0.5691, "step": 13310 }, { "epoch": 1.2924509994178148, "grad_norm": 1.65007773003003, "learning_rate": 2.8551495879031737e-05, "loss": 0.5, "step": 13320 }, { "epoch": 1.2934213079759362, "grad_norm": 2.0573651261935293, "learning_rate": 2.8535271594522683e-05, "loss": 0.4926, "step": 13330 }, { "epoch": 1.2943916165340579, "grad_norm": 2.328992496419527, "learning_rate": 2.851904731001363e-05, "loss": 0.5221, "step": 13340 }, { "epoch": 1.2953619250921793, "grad_norm": 1.8509833991470588, "learning_rate": 2.8502823025504576e-05, "loss": 0.5018, "step": 13350 }, { "epoch": 1.296332233650301, "grad_norm": 1.895399492110669, "learning_rate": 2.8486598740995522e-05, "loss": 0.4724, "step": 13360 }, { "epoch": 1.2973025422084223, "grad_norm": 1.667924410333677, "learning_rate": 2.8470374456486472e-05, "loss": 0.5165, "step": 13370 }, { "epoch": 1.2982728507665438, "grad_norm": 2.1618438681975722, "learning_rate": 2.845415017197742e-05, "loss": 0.5208, "step": 13380 }, { "epoch": 1.2992431593246652, "grad_norm": 2.0324515358778434, "learning_rate": 2.8437925887468365e-05, "loss": 0.4944, "step": 13390 }, { "epoch": 1.3002134678827868, "grad_norm": 1.4374011148146266, "learning_rate": 2.842170160295931e-05, "loss": 0.556, "step": 13400 }, { "epoch": 1.3011837764409082, "grad_norm": 2.0625833924885075, "learning_rate": 2.8405477318450258e-05, "loss": 0.533, "step": 13410 }, { "epoch": 1.3021540849990296, "grad_norm": 1.7327381778160986, "learning_rate": 2.8389253033941204e-05, "loss": 0.5428, "step": 13420 }, { "epoch": 1.3031243935571513, "grad_norm": 1.5980081031800506, "learning_rate": 2.837302874943215e-05, "loss": 0.5169, "step": 13430 }, { "epoch": 1.3040947021152727, "grad_norm": 1.4133329538382045, "learning_rate": 2.8356804464923097e-05, "loss": 0.5365, "step": 13440 }, { "epoch": 1.305065010673394, "grad_norm": 1.6052634633267397, "learning_rate": 2.8340580180414043e-05, "loss": 0.5062, "step": 13450 }, { "epoch": 1.3060353192315155, "grad_norm": 1.8789186495289818, "learning_rate": 2.8324355895904993e-05, "loss": 0.5301, "step": 13460 }, { "epoch": 1.3070056277896371, "grad_norm": 1.7539318856514114, "learning_rate": 2.830813161139594e-05, "loss": 0.5191, "step": 13470 }, { "epoch": 1.3079759363477585, "grad_norm": 1.652081891063903, "learning_rate": 2.8291907326886886e-05, "loss": 0.5162, "step": 13480 }, { "epoch": 1.3089462449058802, "grad_norm": 1.9042372320107943, "learning_rate": 2.8275683042377832e-05, "loss": 0.5202, "step": 13490 }, { "epoch": 1.3099165534640016, "grad_norm": 1.9149924815733665, "learning_rate": 2.825945875786878e-05, "loss": 0.5785, "step": 13500 }, { "epoch": 1.3099165534640016, "eval_loss": 0.6458503007888794, "eval_runtime": 2472.3879, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 13500 }, { "epoch": 1.310886862022123, "grad_norm": 1.6405768771538807, "learning_rate": 2.8243234473359725e-05, "loss": 0.5037, "step": 13510 }, { "epoch": 1.3118571705802444, "grad_norm": 1.8018934132744455, "learning_rate": 2.822701018885067e-05, "loss": 0.5465, "step": 13520 }, { "epoch": 1.312827479138366, "grad_norm": 2.013531086384291, "learning_rate": 2.8210785904341618e-05, "loss": 0.5011, "step": 13530 }, { "epoch": 1.3137977876964875, "grad_norm": 2.022552703035907, "learning_rate": 2.8194561619832567e-05, "loss": 0.5657, "step": 13540 }, { "epoch": 1.3147680962546089, "grad_norm": 1.734149403661694, "learning_rate": 2.8178337335323514e-05, "loss": 0.5074, "step": 13550 }, { "epoch": 1.3157384048127305, "grad_norm": 2.1508604289357476, "learning_rate": 2.816211305081446e-05, "loss": 0.4873, "step": 13560 }, { "epoch": 1.316708713370852, "grad_norm": 1.9266405893568426, "learning_rate": 2.8145888766305406e-05, "loss": 0.5231, "step": 13570 }, { "epoch": 1.3176790219289733, "grad_norm": 1.8623995402610134, "learning_rate": 2.8129664481796353e-05, "loss": 0.5748, "step": 13580 }, { "epoch": 1.3186493304870948, "grad_norm": 1.5283427217297032, "learning_rate": 2.81134401972873e-05, "loss": 0.5322, "step": 13590 }, { "epoch": 1.3196196390452164, "grad_norm": 1.7148189442620572, "learning_rate": 2.8097215912778246e-05, "loss": 0.4896, "step": 13600 }, { "epoch": 1.3205899476033378, "grad_norm": 1.4473714452022712, "learning_rate": 2.8080991628269192e-05, "loss": 0.4974, "step": 13610 }, { "epoch": 1.3215602561614594, "grad_norm": 1.8643931328762333, "learning_rate": 2.806476734376014e-05, "loss": 0.5707, "step": 13620 }, { "epoch": 1.3225305647195809, "grad_norm": 2.1475243574365264, "learning_rate": 2.8048543059251088e-05, "loss": 0.5167, "step": 13630 }, { "epoch": 1.3235008732777023, "grad_norm": 1.6293038645225775, "learning_rate": 2.8032318774742035e-05, "loss": 0.5268, "step": 13640 }, { "epoch": 1.3244711818358237, "grad_norm": 1.8987021880259773, "learning_rate": 2.801609449023298e-05, "loss": 0.5432, "step": 13650 }, { "epoch": 1.3254414903939453, "grad_norm": 1.5753855821090716, "learning_rate": 2.7999870205723927e-05, "loss": 0.4827, "step": 13660 }, { "epoch": 1.3264117989520667, "grad_norm": 1.350257936732941, "learning_rate": 2.7983645921214874e-05, "loss": 0.5544, "step": 13670 }, { "epoch": 1.3273821075101884, "grad_norm": 1.9293055864878257, "learning_rate": 2.796742163670582e-05, "loss": 0.5441, "step": 13680 }, { "epoch": 1.3283524160683098, "grad_norm": 2.2701614440224813, "learning_rate": 2.7951197352196766e-05, "loss": 0.5237, "step": 13690 }, { "epoch": 1.3293227246264312, "grad_norm": 1.2019327117406085, "learning_rate": 2.7934973067687713e-05, "loss": 0.5434, "step": 13700 }, { "epoch": 1.3302930331845526, "grad_norm": 1.406257025731888, "learning_rate": 2.7918748783178663e-05, "loss": 0.5063, "step": 13710 }, { "epoch": 1.3312633417426742, "grad_norm": 1.666513597838276, "learning_rate": 2.790252449866961e-05, "loss": 0.5068, "step": 13720 }, { "epoch": 1.3322336503007957, "grad_norm": 1.7668183417692156, "learning_rate": 2.7886300214160555e-05, "loss": 0.5141, "step": 13730 }, { "epoch": 1.333203958858917, "grad_norm": 1.433593593379768, "learning_rate": 2.7870075929651502e-05, "loss": 0.4626, "step": 13740 }, { "epoch": 1.3341742674170387, "grad_norm": 2.009965700363568, "learning_rate": 2.7853851645142448e-05, "loss": 0.5618, "step": 13750 }, { "epoch": 1.3351445759751601, "grad_norm": 1.763668408586817, "learning_rate": 2.7837627360633395e-05, "loss": 0.5369, "step": 13760 }, { "epoch": 1.3361148845332815, "grad_norm": 2.1411324301354053, "learning_rate": 2.782140307612434e-05, "loss": 0.5084, "step": 13770 }, { "epoch": 1.337085193091403, "grad_norm": 1.436827138346562, "learning_rate": 2.7805178791615287e-05, "loss": 0.5032, "step": 13780 }, { "epoch": 1.3380555016495246, "grad_norm": 1.9606877958295938, "learning_rate": 2.778895450710624e-05, "loss": 0.5327, "step": 13790 }, { "epoch": 1.339025810207646, "grad_norm": 2.0522393469060396, "learning_rate": 2.7772730222597187e-05, "loss": 0.5398, "step": 13800 }, { "epoch": 1.3399961187657676, "grad_norm": 1.9786107551791654, "learning_rate": 2.7756505938088133e-05, "loss": 0.5009, "step": 13810 }, { "epoch": 1.340966427323889, "grad_norm": 2.090584551627846, "learning_rate": 2.774028165357908e-05, "loss": 0.5296, "step": 13820 }, { "epoch": 1.3419367358820105, "grad_norm": 2.1298950160365613, "learning_rate": 2.772405736907003e-05, "loss": 0.4865, "step": 13830 }, { "epoch": 1.3429070444401319, "grad_norm": 1.7629297128429193, "learning_rate": 2.7707833084560976e-05, "loss": 0.5324, "step": 13840 }, { "epoch": 1.3438773529982535, "grad_norm": 1.8397197756904764, "learning_rate": 2.7691608800051922e-05, "loss": 0.5732, "step": 13850 }, { "epoch": 1.344847661556375, "grad_norm": 1.8108747182833855, "learning_rate": 2.767538451554287e-05, "loss": 0.4947, "step": 13860 }, { "epoch": 1.3458179701144963, "grad_norm": 1.8199614245355178, "learning_rate": 2.7659160231033815e-05, "loss": 0.5557, "step": 13870 }, { "epoch": 1.346788278672618, "grad_norm": 1.3431248134057203, "learning_rate": 2.764293594652476e-05, "loss": 0.5404, "step": 13880 }, { "epoch": 1.3477585872307394, "grad_norm": 1.5574090523973907, "learning_rate": 2.7626711662015708e-05, "loss": 0.568, "step": 13890 }, { "epoch": 1.3487288957888608, "grad_norm": 1.4115013178507176, "learning_rate": 2.7610487377506654e-05, "loss": 0.4716, "step": 13900 }, { "epoch": 1.3496992043469822, "grad_norm": 2.1040010286866444, "learning_rate": 2.75942630929976e-05, "loss": 0.4934, "step": 13910 }, { "epoch": 1.3506695129051038, "grad_norm": 1.7498060957230301, "learning_rate": 2.757803880848855e-05, "loss": 0.5454, "step": 13920 }, { "epoch": 1.3516398214632253, "grad_norm": 1.8910220759501428, "learning_rate": 2.7561814523979497e-05, "loss": 0.5641, "step": 13930 }, { "epoch": 1.352610130021347, "grad_norm": 1.6170015266249442, "learning_rate": 2.7545590239470443e-05, "loss": 0.5378, "step": 13940 }, { "epoch": 1.3535804385794683, "grad_norm": 1.7666178388963019, "learning_rate": 2.752936595496139e-05, "loss": 0.5448, "step": 13950 }, { "epoch": 1.3545507471375897, "grad_norm": 2.052806059730038, "learning_rate": 2.7513141670452336e-05, "loss": 0.5158, "step": 13960 }, { "epoch": 1.3555210556957111, "grad_norm": 1.9222501285118103, "learning_rate": 2.7496917385943282e-05, "loss": 0.4982, "step": 13970 }, { "epoch": 1.3564913642538328, "grad_norm": 2.1370232742150903, "learning_rate": 2.748069310143423e-05, "loss": 0.5409, "step": 13980 }, { "epoch": 1.3574616728119542, "grad_norm": 1.7396628585118512, "learning_rate": 2.7464468816925175e-05, "loss": 0.5327, "step": 13990 }, { "epoch": 1.3584319813700758, "grad_norm": 1.990716927328341, "learning_rate": 2.7448244532416125e-05, "loss": 0.5291, "step": 14000 }, { "epoch": 1.3584319813700758, "eval_loss": 0.6447646021842957, "eval_runtime": 2468.8402, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 14000 }, { "epoch": 1.3594022899281972, "grad_norm": 1.7121198472067753, "learning_rate": 2.743202024790707e-05, "loss": 0.4758, "step": 14010 }, { "epoch": 1.3603725984863186, "grad_norm": 1.6674489776942332, "learning_rate": 2.7415795963398017e-05, "loss": 0.5484, "step": 14020 }, { "epoch": 1.36134290704444, "grad_norm": 1.458164078535125, "learning_rate": 2.7399571678888964e-05, "loss": 0.5523, "step": 14030 }, { "epoch": 1.3623132156025617, "grad_norm": 1.8775916401381276, "learning_rate": 2.738334739437991e-05, "loss": 0.5254, "step": 14040 }, { "epoch": 1.363283524160683, "grad_norm": 1.671470903392816, "learning_rate": 2.7367123109870857e-05, "loss": 0.5005, "step": 14050 }, { "epoch": 1.3642538327188045, "grad_norm": 1.9806763146120538, "learning_rate": 2.7350898825361803e-05, "loss": 0.5134, "step": 14060 }, { "epoch": 1.3652241412769262, "grad_norm": 1.514946401551448, "learning_rate": 2.733467454085275e-05, "loss": 0.5597, "step": 14070 }, { "epoch": 1.3661944498350476, "grad_norm": 1.6635049106626887, "learning_rate": 2.7318450256343696e-05, "loss": 0.5377, "step": 14080 }, { "epoch": 1.367164758393169, "grad_norm": 1.7564332843089718, "learning_rate": 2.7302225971834645e-05, "loss": 0.5027, "step": 14090 }, { "epoch": 1.3681350669512904, "grad_norm": 2.0454872453162682, "learning_rate": 2.7286001687325592e-05, "loss": 0.5056, "step": 14100 }, { "epoch": 1.369105375509412, "grad_norm": 2.0786210322749032, "learning_rate": 2.7269777402816538e-05, "loss": 0.5835, "step": 14110 }, { "epoch": 1.3700756840675334, "grad_norm": 2.1275215610783555, "learning_rate": 2.7253553118307485e-05, "loss": 0.5235, "step": 14120 }, { "epoch": 1.371045992625655, "grad_norm": 1.7013208723478948, "learning_rate": 2.723732883379843e-05, "loss": 0.5105, "step": 14130 }, { "epoch": 1.3720163011837765, "grad_norm": 1.586441401545233, "learning_rate": 2.7221104549289377e-05, "loss": 0.5221, "step": 14140 }, { "epoch": 1.372986609741898, "grad_norm": 1.533260185383275, "learning_rate": 2.7204880264780324e-05, "loss": 0.4714, "step": 14150 }, { "epoch": 1.3739569183000193, "grad_norm": 1.99495942930363, "learning_rate": 2.718865598027127e-05, "loss": 0.5127, "step": 14160 }, { "epoch": 1.374927226858141, "grad_norm": 1.5455163764317608, "learning_rate": 2.717243169576222e-05, "loss": 0.5331, "step": 14170 }, { "epoch": 1.3758975354162624, "grad_norm": 2.542301246782172, "learning_rate": 2.7156207411253166e-05, "loss": 0.5196, "step": 14180 }, { "epoch": 1.3768678439743838, "grad_norm": 2.367306381136639, "learning_rate": 2.7139983126744113e-05, "loss": 0.5173, "step": 14190 }, { "epoch": 1.3778381525325054, "grad_norm": 1.748922567904988, "learning_rate": 2.712375884223506e-05, "loss": 0.5301, "step": 14200 }, { "epoch": 1.3788084610906268, "grad_norm": 1.9185950864572987, "learning_rate": 2.7107534557726005e-05, "loss": 0.4242, "step": 14210 }, { "epoch": 1.3797787696487482, "grad_norm": 1.7876319295098282, "learning_rate": 2.7091310273216952e-05, "loss": 0.5251, "step": 14220 }, { "epoch": 1.3807490782068697, "grad_norm": 1.7638194028118903, "learning_rate": 2.7075085988707898e-05, "loss": 0.5087, "step": 14230 }, { "epoch": 1.3817193867649913, "grad_norm": 1.781232728742025, "learning_rate": 2.7058861704198845e-05, "loss": 0.46, "step": 14240 }, { "epoch": 1.3826896953231127, "grad_norm": 2.0454750065003564, "learning_rate": 2.704263741968979e-05, "loss": 0.542, "step": 14250 }, { "epoch": 1.3836600038812343, "grad_norm": 2.2368401504756, "learning_rate": 2.702641313518074e-05, "loss": 0.501, "step": 14260 }, { "epoch": 1.3846303124393557, "grad_norm": 1.4717190899560046, "learning_rate": 2.7010188850671687e-05, "loss": 0.5146, "step": 14270 }, { "epoch": 1.3856006209974772, "grad_norm": 1.8752670210283335, "learning_rate": 2.6993964566162634e-05, "loss": 0.4999, "step": 14280 }, { "epoch": 1.3865709295555986, "grad_norm": 2.065234795556922, "learning_rate": 2.697774028165358e-05, "loss": 0.4447, "step": 14290 }, { "epoch": 1.3875412381137202, "grad_norm": 2.125070528749461, "learning_rate": 2.6961515997144526e-05, "loss": 0.4797, "step": 14300 }, { "epoch": 1.3885115466718416, "grad_norm": 1.5859308448291956, "learning_rate": 2.6945291712635473e-05, "loss": 0.5336, "step": 14310 }, { "epoch": 1.3894818552299633, "grad_norm": 2.861319646114515, "learning_rate": 2.692906742812642e-05, "loss": 0.4812, "step": 14320 }, { "epoch": 1.3904521637880847, "grad_norm": 1.7597224976071428, "learning_rate": 2.6912843143617365e-05, "loss": 0.55, "step": 14330 }, { "epoch": 1.391422472346206, "grad_norm": 1.681866583110354, "learning_rate": 2.6896618859108312e-05, "loss": 0.5046, "step": 14340 }, { "epoch": 1.3923927809043275, "grad_norm": 1.5215374726522628, "learning_rate": 2.688039457459926e-05, "loss": 0.5579, "step": 14350 }, { "epoch": 1.3933630894624491, "grad_norm": 2.079812021813028, "learning_rate": 2.6864170290090208e-05, "loss": 0.5495, "step": 14360 }, { "epoch": 1.3943333980205705, "grad_norm": 1.617321032110613, "learning_rate": 2.6847946005581154e-05, "loss": 0.529, "step": 14370 }, { "epoch": 1.395303706578692, "grad_norm": 1.9418293727425169, "learning_rate": 2.68317217210721e-05, "loss": 0.5503, "step": 14380 }, { "epoch": 1.3962740151368136, "grad_norm": 1.6562011692834846, "learning_rate": 2.6815497436563047e-05, "loss": 0.5245, "step": 14390 }, { "epoch": 1.397244323694935, "grad_norm": 1.8991877294518344, "learning_rate": 2.6799273152053994e-05, "loss": 0.4976, "step": 14400 }, { "epoch": 1.3982146322530564, "grad_norm": 1.9301826202288486, "learning_rate": 2.678304886754494e-05, "loss": 0.4878, "step": 14410 }, { "epoch": 1.3991849408111778, "grad_norm": 2.019905342024004, "learning_rate": 2.6766824583035886e-05, "loss": 0.5093, "step": 14420 }, { "epoch": 1.4001552493692995, "grad_norm": 2.364843712084718, "learning_rate": 2.6750600298526836e-05, "loss": 0.5005, "step": 14430 }, { "epoch": 1.4011255579274209, "grad_norm": 1.9751607681902115, "learning_rate": 2.6734376014017782e-05, "loss": 0.4958, "step": 14440 }, { "epoch": 1.4020958664855425, "grad_norm": 1.9625121046714047, "learning_rate": 2.671815172950873e-05, "loss": 0.5251, "step": 14450 }, { "epoch": 1.403066175043664, "grad_norm": 1.4828612193168078, "learning_rate": 2.6701927444999675e-05, "loss": 0.5496, "step": 14460 }, { "epoch": 1.4040364836017853, "grad_norm": 1.5015720843906, "learning_rate": 2.668570316049062e-05, "loss": 0.535, "step": 14470 }, { "epoch": 1.4050067921599068, "grad_norm": 1.3851757562899687, "learning_rate": 2.6669478875981568e-05, "loss": 0.5628, "step": 14480 }, { "epoch": 1.4059771007180284, "grad_norm": 1.6822047277997916, "learning_rate": 2.6653254591472514e-05, "loss": 0.5605, "step": 14490 }, { "epoch": 1.4069474092761498, "grad_norm": 1.9002135762249894, "learning_rate": 2.663703030696346e-05, "loss": 0.5348, "step": 14500 }, { "epoch": 1.4069474092761498, "eval_loss": 0.6422961950302124, "eval_runtime": 2474.9423, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 14500 }, { "epoch": 1.4079177178342712, "grad_norm": 1.7272883282125664, "learning_rate": 2.6620806022454407e-05, "loss": 0.5518, "step": 14510 }, { "epoch": 1.4088880263923929, "grad_norm": 1.7836182665219666, "learning_rate": 2.6604581737945357e-05, "loss": 0.5195, "step": 14520 }, { "epoch": 1.4098583349505143, "grad_norm": 1.7860780128722327, "learning_rate": 2.6588357453436303e-05, "loss": 0.5061, "step": 14530 }, { "epoch": 1.4108286435086357, "grad_norm": 1.8487543077145125, "learning_rate": 2.657213316892725e-05, "loss": 0.5261, "step": 14540 }, { "epoch": 1.411798952066757, "grad_norm": 1.8960118987299062, "learning_rate": 2.6555908884418196e-05, "loss": 0.523, "step": 14550 }, { "epoch": 1.4127692606248787, "grad_norm": 1.3767948578941858, "learning_rate": 2.6539684599909142e-05, "loss": 0.4654, "step": 14560 }, { "epoch": 1.4137395691830001, "grad_norm": 1.7605168074790472, "learning_rate": 2.652346031540009e-05, "loss": 0.4711, "step": 14570 }, { "epoch": 1.4147098777411218, "grad_norm": 1.424973688569186, "learning_rate": 2.6507236030891042e-05, "loss": 0.5639, "step": 14580 }, { "epoch": 1.4156801862992432, "grad_norm": 1.9215824187230874, "learning_rate": 2.649101174638199e-05, "loss": 0.5137, "step": 14590 }, { "epoch": 1.4166504948573646, "grad_norm": 1.439592415452285, "learning_rate": 2.6474787461872935e-05, "loss": 0.488, "step": 14600 }, { "epoch": 1.417620803415486, "grad_norm": 1.6446340356434774, "learning_rate": 2.645856317736388e-05, "loss": 0.5324, "step": 14610 }, { "epoch": 1.4185911119736077, "grad_norm": 1.9264081435604268, "learning_rate": 2.6442338892854827e-05, "loss": 0.5297, "step": 14620 }, { "epoch": 1.419561420531729, "grad_norm": 1.5948112951701827, "learning_rate": 2.6426114608345777e-05, "loss": 0.5291, "step": 14630 }, { "epoch": 1.4205317290898507, "grad_norm": 1.8654562276618851, "learning_rate": 2.6409890323836724e-05, "loss": 0.5588, "step": 14640 }, { "epoch": 1.4215020376479721, "grad_norm": 1.8410472433514884, "learning_rate": 2.639366603932767e-05, "loss": 0.5167, "step": 14650 }, { "epoch": 1.4224723462060935, "grad_norm": 1.9663683443045321, "learning_rate": 2.6377441754818616e-05, "loss": 0.5166, "step": 14660 }, { "epoch": 1.423442654764215, "grad_norm": 1.77295818801796, "learning_rate": 2.6361217470309563e-05, "loss": 0.5557, "step": 14670 }, { "epoch": 1.4244129633223366, "grad_norm": 1.542816761347417, "learning_rate": 2.634499318580051e-05, "loss": 0.5147, "step": 14680 }, { "epoch": 1.425383271880458, "grad_norm": 2.13234320369828, "learning_rate": 2.6328768901291456e-05, "loss": 0.5421, "step": 14690 }, { "epoch": 1.4263535804385794, "grad_norm": 1.4543036251195376, "learning_rate": 2.6312544616782402e-05, "loss": 0.488, "step": 14700 }, { "epoch": 1.427323888996701, "grad_norm": 1.5267685947307574, "learning_rate": 2.6296320332273348e-05, "loss": 0.5352, "step": 14710 }, { "epoch": 1.4282941975548225, "grad_norm": 2.05420183759504, "learning_rate": 2.6280096047764298e-05, "loss": 0.5489, "step": 14720 }, { "epoch": 1.4292645061129439, "grad_norm": 1.5141763719248076, "learning_rate": 2.6263871763255244e-05, "loss": 0.5627, "step": 14730 }, { "epoch": 1.4302348146710653, "grad_norm": 1.7887916433402153, "learning_rate": 2.624764747874619e-05, "loss": 0.5129, "step": 14740 }, { "epoch": 1.431205123229187, "grad_norm": 1.9817836633125971, "learning_rate": 2.6231423194237137e-05, "loss": 0.5129, "step": 14750 }, { "epoch": 1.4321754317873083, "grad_norm": 2.358539950484514, "learning_rate": 2.6215198909728084e-05, "loss": 0.482, "step": 14760 }, { "epoch": 1.43314574034543, "grad_norm": 1.7444853271955691, "learning_rate": 2.619897462521903e-05, "loss": 0.4948, "step": 14770 }, { "epoch": 1.4341160489035514, "grad_norm": 1.573301988778271, "learning_rate": 2.6182750340709976e-05, "loss": 0.4998, "step": 14780 }, { "epoch": 1.4350863574616728, "grad_norm": 1.878570997397064, "learning_rate": 2.6166526056200923e-05, "loss": 0.489, "step": 14790 }, { "epoch": 1.4360566660197942, "grad_norm": 2.0345633471458444, "learning_rate": 2.615030177169187e-05, "loss": 0.5395, "step": 14800 }, { "epoch": 1.4370269745779158, "grad_norm": 1.7616258348677174, "learning_rate": 2.613407748718282e-05, "loss": 0.5575, "step": 14810 }, { "epoch": 1.4379972831360373, "grad_norm": 2.202827489308336, "learning_rate": 2.6117853202673765e-05, "loss": 0.5378, "step": 14820 }, { "epoch": 1.4389675916941587, "grad_norm": 2.0172801246892496, "learning_rate": 2.610162891816471e-05, "loss": 0.565, "step": 14830 }, { "epoch": 1.4399379002522803, "grad_norm": 1.460733632748616, "learning_rate": 2.6085404633655658e-05, "loss": 0.4866, "step": 14840 }, { "epoch": 1.4409082088104017, "grad_norm": 1.5702235924410088, "learning_rate": 2.6069180349146604e-05, "loss": 0.4814, "step": 14850 }, { "epoch": 1.4418785173685231, "grad_norm": 2.3366982495878013, "learning_rate": 2.605295606463755e-05, "loss": 0.5204, "step": 14860 }, { "epoch": 1.4428488259266445, "grad_norm": 2.0441313559069805, "learning_rate": 2.6036731780128497e-05, "loss": 0.4861, "step": 14870 }, { "epoch": 1.4438191344847662, "grad_norm": 2.2095151787420417, "learning_rate": 2.6020507495619444e-05, "loss": 0.5376, "step": 14880 }, { "epoch": 1.4447894430428876, "grad_norm": 1.9134281253559753, "learning_rate": 2.6004283211110393e-05, "loss": 0.5102, "step": 14890 }, { "epoch": 1.4457597516010092, "grad_norm": 1.7832517565992747, "learning_rate": 2.598805892660134e-05, "loss": 0.5303, "step": 14900 }, { "epoch": 1.4467300601591306, "grad_norm": 1.7558924047130664, "learning_rate": 2.5971834642092286e-05, "loss": 0.4994, "step": 14910 }, { "epoch": 1.447700368717252, "grad_norm": 1.7883915957856114, "learning_rate": 2.5955610357583233e-05, "loss": 0.4677, "step": 14920 }, { "epoch": 1.4486706772753735, "grad_norm": 2.077257189743679, "learning_rate": 2.593938607307418e-05, "loss": 0.515, "step": 14930 }, { "epoch": 1.449640985833495, "grad_norm": 1.9045010104323963, "learning_rate": 2.5923161788565125e-05, "loss": 0.5024, "step": 14940 }, { "epoch": 1.4506112943916165, "grad_norm": 2.106289689638874, "learning_rate": 2.590693750405607e-05, "loss": 0.5242, "step": 14950 }, { "epoch": 1.4515816029497381, "grad_norm": 1.9221754156642648, "learning_rate": 2.5890713219547018e-05, "loss": 0.4712, "step": 14960 }, { "epoch": 1.4525519115078596, "grad_norm": 1.8046920974227167, "learning_rate": 2.5874488935037964e-05, "loss": 0.5017, "step": 14970 }, { "epoch": 1.453522220065981, "grad_norm": 1.855646189779827, "learning_rate": 2.5858264650528914e-05, "loss": 0.5146, "step": 14980 }, { "epoch": 1.4544925286241024, "grad_norm": 1.8308672850602437, "learning_rate": 2.584204036601986e-05, "loss": 0.5293, "step": 14990 }, { "epoch": 1.455462837182224, "grad_norm": 1.6235231403062087, "learning_rate": 2.5825816081510807e-05, "loss": 0.4968, "step": 15000 }, { "epoch": 1.455462837182224, "eval_loss": 0.6395026445388794, "eval_runtime": 2472.4459, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 15000 }, { "epoch": 1.4564331457403454, "grad_norm": 1.9674866651938567, "learning_rate": 2.5809591797001753e-05, "loss": 0.5402, "step": 15010 }, { "epoch": 1.4574034542984668, "grad_norm": 1.7667679072452773, "learning_rate": 2.57933675124927e-05, "loss": 0.5598, "step": 15020 }, { "epoch": 1.4583737628565885, "grad_norm": 1.6236668551346622, "learning_rate": 2.5777143227983646e-05, "loss": 0.4819, "step": 15030 }, { "epoch": 1.45934407141471, "grad_norm": 1.5162521396018838, "learning_rate": 2.5760918943474592e-05, "loss": 0.5609, "step": 15040 }, { "epoch": 1.4603143799728313, "grad_norm": 2.248488105603888, "learning_rate": 2.574469465896554e-05, "loss": 0.4987, "step": 15050 }, { "epoch": 1.4612846885309527, "grad_norm": 1.5876501247710053, "learning_rate": 2.572847037445649e-05, "loss": 0.4794, "step": 15060 }, { "epoch": 1.4622549970890744, "grad_norm": 1.9484672186526921, "learning_rate": 2.5712246089947435e-05, "loss": 0.5115, "step": 15070 }, { "epoch": 1.4632253056471958, "grad_norm": 1.9931906967691606, "learning_rate": 2.569602180543838e-05, "loss": 0.4848, "step": 15080 }, { "epoch": 1.4641956142053174, "grad_norm": 1.9798564517490436, "learning_rate": 2.5679797520929328e-05, "loss": 0.5136, "step": 15090 }, { "epoch": 1.4651659227634388, "grad_norm": 1.7543081259876927, "learning_rate": 2.5663573236420274e-05, "loss": 0.5179, "step": 15100 }, { "epoch": 1.4661362313215602, "grad_norm": 1.797610941181892, "learning_rate": 2.564734895191122e-05, "loss": 0.5134, "step": 15110 }, { "epoch": 1.4671065398796816, "grad_norm": 1.6254067495518276, "learning_rate": 2.5631124667402167e-05, "loss": 0.497, "step": 15120 }, { "epoch": 1.4680768484378033, "grad_norm": 2.0916697354749743, "learning_rate": 2.5614900382893113e-05, "loss": 0.5035, "step": 15130 }, { "epoch": 1.4690471569959247, "grad_norm": 1.7408478320862355, "learning_rate": 2.559867609838406e-05, "loss": 0.5165, "step": 15140 }, { "epoch": 1.470017465554046, "grad_norm": 1.6971971097300078, "learning_rate": 2.558245181387501e-05, "loss": 0.5416, "step": 15150 }, { "epoch": 1.4709877741121677, "grad_norm": 1.7615798102754638, "learning_rate": 2.5566227529365956e-05, "loss": 0.5028, "step": 15160 }, { "epoch": 1.4719580826702892, "grad_norm": 2.126034120697344, "learning_rate": 2.5550003244856902e-05, "loss": 0.4983, "step": 15170 }, { "epoch": 1.4729283912284106, "grad_norm": 1.7930301180931063, "learning_rate": 2.553377896034785e-05, "loss": 0.4836, "step": 15180 }, { "epoch": 1.473898699786532, "grad_norm": 1.945479896374108, "learning_rate": 2.5517554675838795e-05, "loss": 0.4874, "step": 15190 }, { "epoch": 1.4748690083446536, "grad_norm": 1.659537976782885, "learning_rate": 2.550133039132974e-05, "loss": 0.485, "step": 15200 }, { "epoch": 1.475839316902775, "grad_norm": 1.7852824594767274, "learning_rate": 2.5485106106820688e-05, "loss": 0.5314, "step": 15210 }, { "epoch": 1.4768096254608967, "grad_norm": 1.7757412059616349, "learning_rate": 2.5468881822311634e-05, "loss": 0.4906, "step": 15220 }, { "epoch": 1.477779934019018, "grad_norm": 1.7940563485432668, "learning_rate": 2.5452657537802584e-05, "loss": 0.4973, "step": 15230 }, { "epoch": 1.4787502425771395, "grad_norm": 1.8223131394278327, "learning_rate": 2.543643325329353e-05, "loss": 0.5122, "step": 15240 }, { "epoch": 1.479720551135261, "grad_norm": 1.9217304666232693, "learning_rate": 2.5420208968784477e-05, "loss": 0.5497, "step": 15250 }, { "epoch": 1.4806908596933825, "grad_norm": 1.6084194486566938, "learning_rate": 2.5403984684275423e-05, "loss": 0.4971, "step": 15260 }, { "epoch": 1.481661168251504, "grad_norm": 2.1056710345080827, "learning_rate": 2.538776039976637e-05, "loss": 0.5379, "step": 15270 }, { "epoch": 1.4826314768096256, "grad_norm": 2.2545586744739015, "learning_rate": 2.5371536115257316e-05, "loss": 0.4239, "step": 15280 }, { "epoch": 1.483601785367747, "grad_norm": 2.0015642103213063, "learning_rate": 2.5355311830748262e-05, "loss": 0.5283, "step": 15290 }, { "epoch": 1.4845720939258684, "grad_norm": 2.0206628423435387, "learning_rate": 2.533908754623921e-05, "loss": 0.4533, "step": 15300 }, { "epoch": 1.4855424024839898, "grad_norm": 2.0392064010865263, "learning_rate": 2.5322863261730155e-05, "loss": 0.516, "step": 15310 }, { "epoch": 1.4865127110421115, "grad_norm": 1.6397938987534753, "learning_rate": 2.5306638977221105e-05, "loss": 0.4915, "step": 15320 }, { "epoch": 1.4874830196002329, "grad_norm": 2.0519457829568615, "learning_rate": 2.529041469271205e-05, "loss": 0.4938, "step": 15330 }, { "epoch": 1.4884533281583543, "grad_norm": 1.883771979267065, "learning_rate": 2.5274190408202998e-05, "loss": 0.5454, "step": 15340 }, { "epoch": 1.489423636716476, "grad_norm": 1.8963017429804823, "learning_rate": 2.5257966123693944e-05, "loss": 0.4733, "step": 15350 }, { "epoch": 1.4903939452745973, "grad_norm": 2.005144587549119, "learning_rate": 2.5241741839184897e-05, "loss": 0.4967, "step": 15360 }, { "epoch": 1.4913642538327188, "grad_norm": 2.151826141060965, "learning_rate": 2.5225517554675843e-05, "loss": 0.5466, "step": 15370 }, { "epoch": 1.4923345623908402, "grad_norm": 1.6751197267270117, "learning_rate": 2.520929327016679e-05, "loss": 0.5456, "step": 15380 }, { "epoch": 1.4933048709489618, "grad_norm": 1.9009493553059222, "learning_rate": 2.5193068985657736e-05, "loss": 0.483, "step": 15390 }, { "epoch": 1.4942751795070832, "grad_norm": 2.1457921969425757, "learning_rate": 2.5176844701148683e-05, "loss": 0.5458, "step": 15400 }, { "epoch": 1.4952454880652049, "grad_norm": 1.9369152546010822, "learning_rate": 2.516062041663963e-05, "loss": 0.5477, "step": 15410 }, { "epoch": 1.4962157966233263, "grad_norm": 1.9226350314538543, "learning_rate": 2.5144396132130575e-05, "loss": 0.5469, "step": 15420 }, { "epoch": 1.4971861051814477, "grad_norm": 1.768970891771466, "learning_rate": 2.5128171847621522e-05, "loss": 0.4789, "step": 15430 }, { "epoch": 1.498156413739569, "grad_norm": 1.6324753131013463, "learning_rate": 2.511194756311247e-05, "loss": 0.4984, "step": 15440 }, { "epoch": 1.4991267222976907, "grad_norm": 1.5500314116241656, "learning_rate": 2.5095723278603418e-05, "loss": 0.5313, "step": 15450 }, { "epoch": 1.5000970308558121, "grad_norm": 1.8222773979858036, "learning_rate": 2.5079498994094364e-05, "loss": 0.4935, "step": 15460 }, { "epoch": 1.5010673394139338, "grad_norm": 1.6457715681286798, "learning_rate": 2.506327470958531e-05, "loss": 0.4739, "step": 15470 }, { "epoch": 1.5020376479720552, "grad_norm": 1.6810719922194195, "learning_rate": 2.5047050425076257e-05, "loss": 0.478, "step": 15480 }, { "epoch": 1.5030079565301766, "grad_norm": 1.9891968136548206, "learning_rate": 2.5030826140567203e-05, "loss": 0.5365, "step": 15490 }, { "epoch": 1.503978265088298, "grad_norm": 1.836679974736217, "learning_rate": 2.501460185605815e-05, "loss": 0.5497, "step": 15500 }, { "epoch": 1.503978265088298, "eval_loss": 0.6403182148933411, "eval_runtime": 2470.3613, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 15500 }, { "epoch": 1.5049485736464194, "grad_norm": 1.9808212956686209, "learning_rate": 2.4998377571549096e-05, "loss": 0.5195, "step": 15510 }, { "epoch": 1.505918882204541, "grad_norm": 1.696799638403568, "learning_rate": 2.4982153287040043e-05, "loss": 0.481, "step": 15520 }, { "epoch": 1.5068891907626625, "grad_norm": 1.559487018818546, "learning_rate": 2.496592900253099e-05, "loss": 0.4849, "step": 15530 }, { "epoch": 1.5078594993207841, "grad_norm": 2.0106819741019635, "learning_rate": 2.4949704718021935e-05, "loss": 0.5068, "step": 15540 }, { "epoch": 1.5088298078789055, "grad_norm": 1.554558550656832, "learning_rate": 2.4933480433512885e-05, "loss": 0.5542, "step": 15550 }, { "epoch": 1.509800116437027, "grad_norm": 2.1570334538409597, "learning_rate": 2.491725614900383e-05, "loss": 0.5537, "step": 15560 }, { "epoch": 1.5107704249951484, "grad_norm": 2.082418119992958, "learning_rate": 2.4901031864494778e-05, "loss": 0.4495, "step": 15570 }, { "epoch": 1.51174073355327, "grad_norm": 1.7903747855746062, "learning_rate": 2.4884807579985724e-05, "loss": 0.4755, "step": 15580 }, { "epoch": 1.5127110421113914, "grad_norm": 1.8907675365997436, "learning_rate": 2.486858329547667e-05, "loss": 0.5377, "step": 15590 }, { "epoch": 1.513681350669513, "grad_norm": 1.6823616833089232, "learning_rate": 2.4852359010967617e-05, "loss": 0.4747, "step": 15600 }, { "epoch": 1.5146516592276345, "grad_norm": 1.8263050453987495, "learning_rate": 2.4836134726458567e-05, "loss": 0.4962, "step": 15610 }, { "epoch": 1.5156219677857559, "grad_norm": 1.855857638521949, "learning_rate": 2.4819910441949513e-05, "loss": 0.5306, "step": 15620 }, { "epoch": 1.5165922763438773, "grad_norm": 1.7196056977172254, "learning_rate": 2.480368615744046e-05, "loss": 0.507, "step": 15630 }, { "epoch": 1.5175625849019987, "grad_norm": 2.2274627334622354, "learning_rate": 2.4787461872931406e-05, "loss": 0.4741, "step": 15640 }, { "epoch": 1.5185328934601203, "grad_norm": 1.7506693514893716, "learning_rate": 2.4771237588422352e-05, "loss": 0.4512, "step": 15650 }, { "epoch": 1.519503202018242, "grad_norm": 1.5102515980183955, "learning_rate": 2.47550133039133e-05, "loss": 0.5656, "step": 15660 }, { "epoch": 1.5204735105763634, "grad_norm": 1.7638756809733325, "learning_rate": 2.4738789019404245e-05, "loss": 0.5209, "step": 15670 }, { "epoch": 1.5214438191344848, "grad_norm": 1.67804925991046, "learning_rate": 2.472256473489519e-05, "loss": 0.5442, "step": 15680 }, { "epoch": 1.5224141276926062, "grad_norm": 1.6503421070503612, "learning_rate": 2.470634045038614e-05, "loss": 0.5516, "step": 15690 }, { "epoch": 1.5233844362507276, "grad_norm": 1.7388522657721959, "learning_rate": 2.4690116165877088e-05, "loss": 0.4894, "step": 15700 }, { "epoch": 1.5243547448088492, "grad_norm": 1.955414988998346, "learning_rate": 2.4673891881368034e-05, "loss": 0.5493, "step": 15710 }, { "epoch": 1.5253250533669707, "grad_norm": 2.1089020897683324, "learning_rate": 2.465766759685898e-05, "loss": 0.5173, "step": 15720 }, { "epoch": 1.5262953619250923, "grad_norm": 2.1627868553616416, "learning_rate": 2.4641443312349927e-05, "loss": 0.5165, "step": 15730 }, { "epoch": 1.5272656704832137, "grad_norm": 1.9351405181934673, "learning_rate": 2.4625219027840873e-05, "loss": 0.4873, "step": 15740 }, { "epoch": 1.5282359790413351, "grad_norm": 1.8488349390401109, "learning_rate": 2.460899474333182e-05, "loss": 0.5121, "step": 15750 }, { "epoch": 1.5292062875994565, "grad_norm": 1.9814673588646732, "learning_rate": 2.4592770458822766e-05, "loss": 0.4942, "step": 15760 }, { "epoch": 1.530176596157578, "grad_norm": 1.8985936905786858, "learning_rate": 2.4576546174313712e-05, "loss": 0.5227, "step": 15770 }, { "epoch": 1.5311469047156996, "grad_norm": 2.144939705295627, "learning_rate": 2.4560321889804662e-05, "loss": 0.5584, "step": 15780 }, { "epoch": 1.5321172132738212, "grad_norm": 2.196125539672085, "learning_rate": 2.454409760529561e-05, "loss": 0.4592, "step": 15790 }, { "epoch": 1.5330875218319426, "grad_norm": 1.5896290702039733, "learning_rate": 2.4527873320786555e-05, "loss": 0.5183, "step": 15800 }, { "epoch": 1.534057830390064, "grad_norm": 1.3926787086096604, "learning_rate": 2.45116490362775e-05, "loss": 0.4929, "step": 15810 }, { "epoch": 1.5350281389481855, "grad_norm": 1.8947619224673768, "learning_rate": 2.4495424751768448e-05, "loss": 0.487, "step": 15820 }, { "epoch": 1.5359984475063069, "grad_norm": 1.9366715293465946, "learning_rate": 2.4479200467259394e-05, "loss": 0.5126, "step": 15830 }, { "epoch": 1.5369687560644285, "grad_norm": 1.8854258468503662, "learning_rate": 2.446297618275034e-05, "loss": 0.4732, "step": 15840 }, { "epoch": 1.53793906462255, "grad_norm": 1.7897154496692322, "learning_rate": 2.4446751898241287e-05, "loss": 0.4869, "step": 15850 }, { "epoch": 1.5389093731806716, "grad_norm": 1.9691769621461568, "learning_rate": 2.4430527613732233e-05, "loss": 0.5204, "step": 15860 }, { "epoch": 1.539879681738793, "grad_norm": 1.5159729039901195, "learning_rate": 2.4414303329223183e-05, "loss": 0.5067, "step": 15870 }, { "epoch": 1.5408499902969144, "grad_norm": 2.0085743327171364, "learning_rate": 2.439807904471413e-05, "loss": 0.5855, "step": 15880 }, { "epoch": 1.5418202988550358, "grad_norm": 1.808826518921202, "learning_rate": 2.4381854760205076e-05, "loss": 0.511, "step": 15890 }, { "epoch": 1.5427906074131574, "grad_norm": 2.144838816182226, "learning_rate": 2.4365630475696022e-05, "loss": 0.5151, "step": 15900 }, { "epoch": 1.5437609159712788, "grad_norm": 1.9134282869959454, "learning_rate": 2.434940619118697e-05, "loss": 0.547, "step": 15910 }, { "epoch": 1.5447312245294005, "grad_norm": 1.4718585956659067, "learning_rate": 2.4333181906677915e-05, "loss": 0.5035, "step": 15920 }, { "epoch": 1.545701533087522, "grad_norm": 1.9846910792449015, "learning_rate": 2.431695762216886e-05, "loss": 0.4603, "step": 15930 }, { "epoch": 1.5466718416456433, "grad_norm": 2.237689406521008, "learning_rate": 2.430073333765981e-05, "loss": 0.4835, "step": 15940 }, { "epoch": 1.5476421502037647, "grad_norm": 1.8521877806585876, "learning_rate": 2.4284509053150757e-05, "loss": 0.486, "step": 15950 }, { "epoch": 1.5486124587618861, "grad_norm": 1.6410601205984716, "learning_rate": 2.4268284768641704e-05, "loss": 0.5362, "step": 15960 }, { "epoch": 1.5495827673200078, "grad_norm": 2.0223928767474524, "learning_rate": 2.4252060484132653e-05, "loss": 0.5954, "step": 15970 }, { "epoch": 1.5505530758781294, "grad_norm": 2.531432809831189, "learning_rate": 2.42358361996236e-05, "loss": 0.5618, "step": 15980 }, { "epoch": 1.5515233844362508, "grad_norm": 2.0740258540735788, "learning_rate": 2.4219611915114546e-05, "loss": 0.4733, "step": 15990 }, { "epoch": 1.5524936929943722, "grad_norm": 1.7587497399634056, "learning_rate": 2.4203387630605493e-05, "loss": 0.5099, "step": 16000 }, { "epoch": 1.5524936929943722, "eval_loss": 0.635991096496582, "eval_runtime": 2468.0382, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 16000 }, { "epoch": 1.5534640015524936, "grad_norm": 1.670336846718715, "learning_rate": 2.418716334609644e-05, "loss": 0.4553, "step": 16010 }, { "epoch": 1.554434310110615, "grad_norm": 2.0177796015243237, "learning_rate": 2.4170939061587385e-05, "loss": 0.5115, "step": 16020 }, { "epoch": 1.5554046186687367, "grad_norm": 1.8858883624862002, "learning_rate": 2.4154714777078332e-05, "loss": 0.5172, "step": 16030 }, { "epoch": 1.556374927226858, "grad_norm": 2.0359679981766647, "learning_rate": 2.4138490492569278e-05, "loss": 0.4589, "step": 16040 }, { "epoch": 1.5573452357849797, "grad_norm": 1.2444792958823963, "learning_rate": 2.4122266208060228e-05, "loss": 0.5134, "step": 16050 }, { "epoch": 1.5583155443431012, "grad_norm": 1.5062153000531946, "learning_rate": 2.4106041923551174e-05, "loss": 0.4869, "step": 16060 }, { "epoch": 1.5592858529012226, "grad_norm": 2.2139374575219364, "learning_rate": 2.408981763904212e-05, "loss": 0.4811, "step": 16070 }, { "epoch": 1.560256161459344, "grad_norm": 1.7542756535220294, "learning_rate": 2.4073593354533067e-05, "loss": 0.4895, "step": 16080 }, { "epoch": 1.5612264700174654, "grad_norm": 1.9663056975292839, "learning_rate": 2.4057369070024013e-05, "loss": 0.5046, "step": 16090 }, { "epoch": 1.562196778575587, "grad_norm": 1.760446721510282, "learning_rate": 2.404114478551496e-05, "loss": 0.495, "step": 16100 }, { "epoch": 1.5631670871337087, "grad_norm": 1.6132518828061082, "learning_rate": 2.4024920501005906e-05, "loss": 0.4985, "step": 16110 }, { "epoch": 1.56413739569183, "grad_norm": 1.7640461059330637, "learning_rate": 2.4008696216496853e-05, "loss": 0.5186, "step": 16120 }, { "epoch": 1.5651077042499515, "grad_norm": 1.769814216957158, "learning_rate": 2.39924719319878e-05, "loss": 0.5413, "step": 16130 }, { "epoch": 1.566078012808073, "grad_norm": 2.293241964432802, "learning_rate": 2.397624764747875e-05, "loss": 0.4809, "step": 16140 }, { "epoch": 1.5670483213661943, "grad_norm": 1.8759559371438301, "learning_rate": 2.3960023362969695e-05, "loss": 0.5262, "step": 16150 }, { "epoch": 1.568018629924316, "grad_norm": 2.3087090615314114, "learning_rate": 2.394379907846064e-05, "loss": 0.4841, "step": 16160 }, { "epoch": 1.5689889384824374, "grad_norm": 1.6108810314362396, "learning_rate": 2.3927574793951588e-05, "loss": 0.5188, "step": 16170 }, { "epoch": 1.569959247040559, "grad_norm": 2.241869048530712, "learning_rate": 2.3911350509442534e-05, "loss": 0.5005, "step": 16180 }, { "epoch": 1.5709295555986804, "grad_norm": 2.1127428890150215, "learning_rate": 2.389512622493348e-05, "loss": 0.5063, "step": 16190 }, { "epoch": 1.5718998641568018, "grad_norm": 1.7046555426171013, "learning_rate": 2.3878901940424427e-05, "loss": 0.5154, "step": 16200 }, { "epoch": 1.5728701727149232, "grad_norm": 1.6974188480153705, "learning_rate": 2.3862677655915373e-05, "loss": 0.4775, "step": 16210 }, { "epoch": 1.5738404812730449, "grad_norm": 1.9773772891509165, "learning_rate": 2.384645337140632e-05, "loss": 0.489, "step": 16220 }, { "epoch": 1.5748107898311663, "grad_norm": 2.3323506244488645, "learning_rate": 2.383022908689727e-05, "loss": 0.5252, "step": 16230 }, { "epoch": 1.575781098389288, "grad_norm": 1.4580240462073564, "learning_rate": 2.3814004802388216e-05, "loss": 0.4809, "step": 16240 }, { "epoch": 1.5767514069474093, "grad_norm": 1.9799244985970428, "learning_rate": 2.3797780517879162e-05, "loss": 0.4938, "step": 16250 }, { "epoch": 1.5777217155055308, "grad_norm": 2.2967773637028315, "learning_rate": 2.378155623337011e-05, "loss": 0.48, "step": 16260 }, { "epoch": 1.5786920240636522, "grad_norm": 1.6715177397086511, "learning_rate": 2.3765331948861055e-05, "loss": 0.5112, "step": 16270 }, { "epoch": 1.5796623326217736, "grad_norm": 1.7543224867437088, "learning_rate": 2.3749107664352e-05, "loss": 0.5355, "step": 16280 }, { "epoch": 1.5806326411798952, "grad_norm": 1.7059823329477148, "learning_rate": 2.3732883379842948e-05, "loss": 0.4885, "step": 16290 }, { "epoch": 1.5816029497380169, "grad_norm": 1.808906067511985, "learning_rate": 2.3716659095333894e-05, "loss": 0.4883, "step": 16300 }, { "epoch": 1.5825732582961383, "grad_norm": 1.456280506138187, "learning_rate": 2.3700434810824844e-05, "loss": 0.4986, "step": 16310 }, { "epoch": 1.5835435668542597, "grad_norm": 1.7844142401171645, "learning_rate": 2.368421052631579e-05, "loss": 0.5726, "step": 16320 }, { "epoch": 1.584513875412381, "grad_norm": 1.8511812344459693, "learning_rate": 2.3667986241806737e-05, "loss": 0.5418, "step": 16330 }, { "epoch": 1.5854841839705025, "grad_norm": 1.6886781249735945, "learning_rate": 2.3651761957297687e-05, "loss": 0.4871, "step": 16340 }, { "epoch": 1.5864544925286241, "grad_norm": 1.6386080299672316, "learning_rate": 2.3635537672788633e-05, "loss": 0.5502, "step": 16350 }, { "epoch": 1.5874248010867456, "grad_norm": 1.6152930914041828, "learning_rate": 2.361931338827958e-05, "loss": 0.4602, "step": 16360 }, { "epoch": 1.5883951096448672, "grad_norm": 1.9896290423806842, "learning_rate": 2.3603089103770526e-05, "loss": 0.5517, "step": 16370 }, { "epoch": 1.5893654182029886, "grad_norm": 1.785358677833909, "learning_rate": 2.3586864819261472e-05, "loss": 0.5108, "step": 16380 }, { "epoch": 1.59033572676111, "grad_norm": 1.8919452155428813, "learning_rate": 2.357064053475242e-05, "loss": 0.5586, "step": 16390 }, { "epoch": 1.5913060353192314, "grad_norm": 2.1765937296152376, "learning_rate": 2.3554416250243365e-05, "loss": 0.4978, "step": 16400 }, { "epoch": 1.5922763438773528, "grad_norm": 1.9463613536174769, "learning_rate": 2.3538191965734315e-05, "loss": 0.534, "step": 16410 }, { "epoch": 1.5932466524354745, "grad_norm": 1.483428953982187, "learning_rate": 2.352196768122526e-05, "loss": 0.4481, "step": 16420 }, { "epoch": 1.594216960993596, "grad_norm": 1.8946130151442728, "learning_rate": 2.3505743396716207e-05, "loss": 0.4499, "step": 16430 }, { "epoch": 1.5951872695517175, "grad_norm": 1.496172632655443, "learning_rate": 2.3489519112207154e-05, "loss": 0.4719, "step": 16440 }, { "epoch": 1.596157578109839, "grad_norm": 1.5582678654830617, "learning_rate": 2.34732948276981e-05, "loss": 0.5023, "step": 16450 }, { "epoch": 1.5971278866679604, "grad_norm": 1.9101490466966036, "learning_rate": 2.3457070543189047e-05, "loss": 0.4442, "step": 16460 }, { "epoch": 1.5980981952260818, "grad_norm": 2.5712850986928757, "learning_rate": 2.3440846258679993e-05, "loss": 0.5008, "step": 16470 }, { "epoch": 1.5990685037842034, "grad_norm": 2.1177587573552805, "learning_rate": 2.342462197417094e-05, "loss": 0.5014, "step": 16480 }, { "epoch": 1.6000388123423248, "grad_norm": 2.4990786716780455, "learning_rate": 2.3408397689661886e-05, "loss": 0.4583, "step": 16490 }, { "epoch": 1.6010091209004464, "grad_norm": 1.91427095412827, "learning_rate": 2.3392173405152835e-05, "loss": 0.5037, "step": 16500 }, { "epoch": 1.6010091209004464, "eval_loss": 0.6359612345695496, "eval_runtime": 2467.4852, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 16500 }, { "epoch": 1.6019794294585679, "grad_norm": 1.8370649227416649, "learning_rate": 2.3375949120643782e-05, "loss": 0.5399, "step": 16510 }, { "epoch": 1.6029497380166893, "grad_norm": 1.6932084838539212, "learning_rate": 2.3359724836134728e-05, "loss": 0.5212, "step": 16520 }, { "epoch": 1.6039200465748107, "grad_norm": 1.68790954641985, "learning_rate": 2.3343500551625675e-05, "loss": 0.4837, "step": 16530 }, { "epoch": 1.6048903551329323, "grad_norm": 1.5773573029174093, "learning_rate": 2.332727626711662e-05, "loss": 0.5125, "step": 16540 }, { "epoch": 1.6058606636910537, "grad_norm": 1.7640635185794997, "learning_rate": 2.3311051982607567e-05, "loss": 0.5013, "step": 16550 }, { "epoch": 1.6068309722491754, "grad_norm": 1.7477056146726457, "learning_rate": 2.3294827698098514e-05, "loss": 0.5095, "step": 16560 }, { "epoch": 1.6078012808072968, "grad_norm": 1.0733428522302542, "learning_rate": 2.327860341358946e-05, "loss": 0.5, "step": 16570 }, { "epoch": 1.6087715893654182, "grad_norm": 1.9564292094193194, "learning_rate": 2.326237912908041e-05, "loss": 0.5058, "step": 16580 }, { "epoch": 1.6097418979235396, "grad_norm": 2.1045720941333466, "learning_rate": 2.3246154844571356e-05, "loss": 0.5101, "step": 16590 }, { "epoch": 1.610712206481661, "grad_norm": 2.171187966701804, "learning_rate": 2.3229930560062303e-05, "loss": 0.5069, "step": 16600 }, { "epoch": 1.6116825150397827, "grad_norm": 1.9610313890988438, "learning_rate": 2.321370627555325e-05, "loss": 0.4904, "step": 16610 }, { "epoch": 1.6126528235979043, "grad_norm": 1.941252888287077, "learning_rate": 2.3197481991044195e-05, "loss": 0.4865, "step": 16620 }, { "epoch": 1.6136231321560257, "grad_norm": 1.897218836962928, "learning_rate": 2.3181257706535142e-05, "loss": 0.4966, "step": 16630 }, { "epoch": 1.6145934407141471, "grad_norm": 2.201820668610594, "learning_rate": 2.3165033422026088e-05, "loss": 0.4895, "step": 16640 }, { "epoch": 1.6155637492722685, "grad_norm": 1.5720192215380684, "learning_rate": 2.3148809137517035e-05, "loss": 0.4918, "step": 16650 }, { "epoch": 1.61653405783039, "grad_norm": 2.227704713223705, "learning_rate": 2.313258485300798e-05, "loss": 0.5251, "step": 16660 }, { "epoch": 1.6175043663885116, "grad_norm": 2.0170271312798707, "learning_rate": 2.311636056849893e-05, "loss": 0.4965, "step": 16670 }, { "epoch": 1.618474674946633, "grad_norm": 1.9688156858912216, "learning_rate": 2.3100136283989877e-05, "loss": 0.4685, "step": 16680 }, { "epoch": 1.6194449835047546, "grad_norm": 2.0958524562959675, "learning_rate": 2.3083911999480824e-05, "loss": 0.4884, "step": 16690 }, { "epoch": 1.620415292062876, "grad_norm": 2.0973339157843447, "learning_rate": 2.306768771497177e-05, "loss": 0.4394, "step": 16700 }, { "epoch": 1.6213856006209975, "grad_norm": 1.5852203386480506, "learning_rate": 2.3051463430462716e-05, "loss": 0.5068, "step": 16710 }, { "epoch": 1.6223559091791189, "grad_norm": 1.907182215766834, "learning_rate": 2.3035239145953663e-05, "loss": 0.461, "step": 16720 }, { "epoch": 1.6233262177372403, "grad_norm": 1.9903511844679715, "learning_rate": 2.3019014861444612e-05, "loss": 0.4861, "step": 16730 }, { "epoch": 1.624296526295362, "grad_norm": 1.8734845445192632, "learning_rate": 2.300279057693556e-05, "loss": 0.5203, "step": 16740 }, { "epoch": 1.6252668348534836, "grad_norm": 1.4830887784438085, "learning_rate": 2.2986566292426505e-05, "loss": 0.5504, "step": 16750 }, { "epoch": 1.626237143411605, "grad_norm": 1.9852817046238187, "learning_rate": 2.297034200791745e-05, "loss": 0.5727, "step": 16760 }, { "epoch": 1.6272074519697264, "grad_norm": 1.9574779506866022, "learning_rate": 2.29541177234084e-05, "loss": 0.4932, "step": 16770 }, { "epoch": 1.6281777605278478, "grad_norm": 1.8045639100087034, "learning_rate": 2.2937893438899348e-05, "loss": 0.4874, "step": 16780 }, { "epoch": 1.6291480690859692, "grad_norm": 1.9467881794111201, "learning_rate": 2.2921669154390294e-05, "loss": 0.4231, "step": 16790 }, { "epoch": 1.6301183776440908, "grad_norm": 2.115958262042766, "learning_rate": 2.290544486988124e-05, "loss": 0.5085, "step": 16800 }, { "epoch": 1.6310886862022123, "grad_norm": 1.9242055352600025, "learning_rate": 2.2889220585372187e-05, "loss": 0.4961, "step": 16810 }, { "epoch": 1.632058994760334, "grad_norm": 1.7473969264432643, "learning_rate": 2.2872996300863133e-05, "loss": 0.4896, "step": 16820 }, { "epoch": 1.6330293033184553, "grad_norm": 1.9086620383723776, "learning_rate": 2.285677201635408e-05, "loss": 0.5793, "step": 16830 }, { "epoch": 1.6339996118765767, "grad_norm": 1.8179786231819581, "learning_rate": 2.2840547731845026e-05, "loss": 0.47, "step": 16840 }, { "epoch": 1.6349699204346981, "grad_norm": 2.3742959935176993, "learning_rate": 2.2824323447335972e-05, "loss": 0.4601, "step": 16850 }, { "epoch": 1.6359402289928198, "grad_norm": 1.5831090068828182, "learning_rate": 2.2808099162826922e-05, "loss": 0.527, "step": 16860 }, { "epoch": 1.6369105375509412, "grad_norm": 2.429466466556868, "learning_rate": 2.279187487831787e-05, "loss": 0.5041, "step": 16870 }, { "epoch": 1.6378808461090628, "grad_norm": 2.0666827078950676, "learning_rate": 2.2775650593808815e-05, "loss": 0.5093, "step": 16880 }, { "epoch": 1.6388511546671842, "grad_norm": 1.7376049636619024, "learning_rate": 2.275942630929976e-05, "loss": 0.4899, "step": 16890 }, { "epoch": 1.6398214632253056, "grad_norm": 1.9743952968077716, "learning_rate": 2.2743202024790708e-05, "loss": 0.4483, "step": 16900 }, { "epoch": 1.640791771783427, "grad_norm": 1.7020982614990035, "learning_rate": 2.2726977740281654e-05, "loss": 0.4746, "step": 16910 }, { "epoch": 1.6417620803415485, "grad_norm": 1.5288620918579647, "learning_rate": 2.27107534557726e-05, "loss": 0.5386, "step": 16920 }, { "epoch": 1.64273238889967, "grad_norm": 2.2418459177194725, "learning_rate": 2.2694529171263547e-05, "loss": 0.5221, "step": 16930 }, { "epoch": 1.6437026974577917, "grad_norm": 1.7801442444217204, "learning_rate": 2.2678304886754497e-05, "loss": 0.502, "step": 16940 }, { "epoch": 1.6446730060159132, "grad_norm": 2.750428317407271, "learning_rate": 2.2662080602245443e-05, "loss": 0.4947, "step": 16950 }, { "epoch": 1.6456433145740346, "grad_norm": 1.5310655899660852, "learning_rate": 2.264585631773639e-05, "loss": 0.5228, "step": 16960 }, { "epoch": 1.646613623132156, "grad_norm": 1.7325660800102027, "learning_rate": 2.2629632033227336e-05, "loss": 0.4724, "step": 16970 }, { "epoch": 1.6475839316902774, "grad_norm": 3.028000821794418, "learning_rate": 2.2613407748718282e-05, "loss": 0.5444, "step": 16980 }, { "epoch": 1.648554240248399, "grad_norm": 1.8847091529095503, "learning_rate": 2.259718346420923e-05, "loss": 0.5095, "step": 16990 }, { "epoch": 1.6495245488065204, "grad_norm": 1.584479445600283, "learning_rate": 2.2580959179700175e-05, "loss": 0.5471, "step": 17000 }, { "epoch": 1.6495245488065204, "eval_loss": 0.6354050636291504, "eval_runtime": 2468.5352, "eval_samples_per_second": 0.726, "eval_steps_per_second": 0.363, "step": 17000 }, { "epoch": 1.650494857364642, "grad_norm": 1.668904824652418, "learning_rate": 2.256473489519112e-05, "loss": 0.4838, "step": 17010 }, { "epoch": 1.6514651659227635, "grad_norm": 1.7782439469811153, "learning_rate": 2.2548510610682068e-05, "loss": 0.5225, "step": 17020 }, { "epoch": 1.652435474480885, "grad_norm": 2.1175538832822576, "learning_rate": 2.2532286326173017e-05, "loss": 0.5438, "step": 17030 }, { "epoch": 1.6534057830390063, "grad_norm": 1.9031650594744414, "learning_rate": 2.2516062041663964e-05, "loss": 0.4456, "step": 17040 }, { "epoch": 1.6543760915971277, "grad_norm": 1.8418596497226953, "learning_rate": 2.249983775715491e-05, "loss": 0.5148, "step": 17050 }, { "epoch": 1.6553464001552494, "grad_norm": 2.001058155829245, "learning_rate": 2.2483613472645857e-05, "loss": 0.5381, "step": 17060 }, { "epoch": 1.656316708713371, "grad_norm": 1.6070553409553883, "learning_rate": 2.2467389188136803e-05, "loss": 0.5199, "step": 17070 }, { "epoch": 1.6572870172714924, "grad_norm": 1.8977480591408646, "learning_rate": 2.245116490362775e-05, "loss": 0.5132, "step": 17080 }, { "epoch": 1.6582573258296138, "grad_norm": 1.8011894993916608, "learning_rate": 2.2434940619118696e-05, "loss": 0.5448, "step": 17090 }, { "epoch": 1.6592276343877352, "grad_norm": 2.0284255015217565, "learning_rate": 2.2418716334609642e-05, "loss": 0.4717, "step": 17100 }, { "epoch": 1.6601979429458567, "grad_norm": 1.885999888526123, "learning_rate": 2.2402492050100592e-05, "loss": 0.5183, "step": 17110 }, { "epoch": 1.6611682515039783, "grad_norm": 1.9056970812089797, "learning_rate": 2.2386267765591538e-05, "loss": 0.4393, "step": 17120 }, { "epoch": 1.6621385600620997, "grad_norm": 2.267841170403769, "learning_rate": 2.2370043481082488e-05, "loss": 0.4882, "step": 17130 }, { "epoch": 1.6631088686202213, "grad_norm": 1.5488822989395437, "learning_rate": 2.2353819196573434e-05, "loss": 0.5294, "step": 17140 }, { "epoch": 1.6640791771783427, "grad_norm": 1.992034372217963, "learning_rate": 2.233759491206438e-05, "loss": 0.4925, "step": 17150 }, { "epoch": 1.6650494857364642, "grad_norm": 1.7050665334433257, "learning_rate": 2.2321370627555327e-05, "loss": 0.5237, "step": 17160 }, { "epoch": 1.6660197942945856, "grad_norm": 1.8164813364072325, "learning_rate": 2.2305146343046274e-05, "loss": 0.5128, "step": 17170 }, { "epoch": 1.6669901028527072, "grad_norm": 1.7407964429381908, "learning_rate": 2.228892205853722e-05, "loss": 0.4812, "step": 17180 }, { "epoch": 1.6679604114108286, "grad_norm": 2.220803029174025, "learning_rate": 2.2272697774028166e-05, "loss": 0.4936, "step": 17190 }, { "epoch": 1.6689307199689503, "grad_norm": 1.7697338691239146, "learning_rate": 2.2256473489519113e-05, "loss": 0.5208, "step": 17200 }, { "epoch": 1.6699010285270717, "grad_norm": 2.2487670732225085, "learning_rate": 2.224024920501006e-05, "loss": 0.4833, "step": 17210 }, { "epoch": 1.670871337085193, "grad_norm": 1.888912866564453, "learning_rate": 2.222402492050101e-05, "loss": 0.5287, "step": 17220 }, { "epoch": 1.6718416456433145, "grad_norm": 1.7902985235611324, "learning_rate": 2.2207800635991955e-05, "loss": 0.5271, "step": 17230 }, { "epoch": 1.672811954201436, "grad_norm": 2.2626922106136607, "learning_rate": 2.21915763514829e-05, "loss": 0.5062, "step": 17240 }, { "epoch": 1.6737822627595575, "grad_norm": 2.0509489741448923, "learning_rate": 2.2175352066973848e-05, "loss": 0.4586, "step": 17250 }, { "epoch": 1.6747525713176792, "grad_norm": 1.7028309180471788, "learning_rate": 2.2159127782464794e-05, "loss": 0.5591, "step": 17260 }, { "epoch": 1.6757228798758006, "grad_norm": 1.8148060135409945, "learning_rate": 2.214290349795574e-05, "loss": 0.4708, "step": 17270 }, { "epoch": 1.676693188433922, "grad_norm": 1.7415357367009103, "learning_rate": 2.2126679213446687e-05, "loss": 0.4877, "step": 17280 }, { "epoch": 1.6776634969920434, "grad_norm": 1.6879714610876217, "learning_rate": 2.2110454928937634e-05, "loss": 0.4838, "step": 17290 }, { "epoch": 1.6786338055501648, "grad_norm": 1.3869597913004987, "learning_rate": 2.2094230644428583e-05, "loss": 0.496, "step": 17300 }, { "epoch": 1.6796041141082865, "grad_norm": 1.4730335934961858, "learning_rate": 2.207800635991953e-05, "loss": 0.4926, "step": 17310 }, { "epoch": 1.6805744226664079, "grad_norm": 1.771999456693497, "learning_rate": 2.2061782075410476e-05, "loss": 0.4993, "step": 17320 }, { "epoch": 1.6815447312245295, "grad_norm": 1.6689193723022826, "learning_rate": 2.2045557790901423e-05, "loss": 0.4815, "step": 17330 }, { "epoch": 1.682515039782651, "grad_norm": 2.1543699729043717, "learning_rate": 2.202933350639237e-05, "loss": 0.4772, "step": 17340 }, { "epoch": 1.6834853483407723, "grad_norm": 1.7964447035669882, "learning_rate": 2.2013109221883315e-05, "loss": 0.4872, "step": 17350 }, { "epoch": 1.6844556568988938, "grad_norm": 1.5499702179216124, "learning_rate": 2.199688493737426e-05, "loss": 0.5115, "step": 17360 }, { "epoch": 1.6854259654570152, "grad_norm": 1.736034409254174, "learning_rate": 2.1980660652865208e-05, "loss": 0.5004, "step": 17370 }, { "epoch": 1.6863962740151368, "grad_norm": 1.6670883800573928, "learning_rate": 2.1964436368356154e-05, "loss": 0.5172, "step": 17380 }, { "epoch": 1.6873665825732584, "grad_norm": 1.8976878178656118, "learning_rate": 2.1948212083847104e-05, "loss": 0.5056, "step": 17390 }, { "epoch": 1.6883368911313799, "grad_norm": 1.7825783485749558, "learning_rate": 2.193198779933805e-05, "loss": 0.5079, "step": 17400 }, { "epoch": 1.6893071996895013, "grad_norm": 1.963783922685446, "learning_rate": 2.1915763514828997e-05, "loss": 0.5595, "step": 17410 }, { "epoch": 1.6902775082476227, "grad_norm": 1.577100524669314, "learning_rate": 2.1899539230319943e-05, "loss": 0.5122, "step": 17420 }, { "epoch": 1.691247816805744, "grad_norm": 1.9224227001223828, "learning_rate": 2.188331494581089e-05, "loss": 0.5179, "step": 17430 }, { "epoch": 1.6922181253638657, "grad_norm": 1.968807335226898, "learning_rate": 2.1867090661301836e-05, "loss": 0.4811, "step": 17440 }, { "epoch": 1.6931884339219871, "grad_norm": 1.8656524316794496, "learning_rate": 2.1850866376792782e-05, "loss": 0.5562, "step": 17450 }, { "epoch": 1.6941587424801088, "grad_norm": 1.936995738098722, "learning_rate": 2.183464209228373e-05, "loss": 0.5128, "step": 17460 }, { "epoch": 1.6951290510382302, "grad_norm": 1.716335988732862, "learning_rate": 2.181841780777468e-05, "loss": 0.4878, "step": 17470 }, { "epoch": 1.6960993595963516, "grad_norm": 1.8671035370037838, "learning_rate": 2.1802193523265625e-05, "loss": 0.5631, "step": 17480 }, { "epoch": 1.697069668154473, "grad_norm": 2.10031146751523, "learning_rate": 2.178596923875657e-05, "loss": 0.4911, "step": 17490 }, { "epoch": 1.6980399767125947, "grad_norm": 1.4651955164387034, "learning_rate": 2.1769744954247518e-05, "loss": 0.4841, "step": 17500 }, { "epoch": 1.6980399767125947, "eval_loss": 0.6339951157569885, "eval_runtime": 2473.3585, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 17500 }, { "epoch": 1.699010285270716, "grad_norm": 1.9647331919080824, "learning_rate": 2.1753520669738468e-05, "loss": 0.4788, "step": 17510 }, { "epoch": 1.6999805938288377, "grad_norm": 1.9100412693293238, "learning_rate": 2.1737296385229414e-05, "loss": 0.475, "step": 17520 }, { "epoch": 1.7009509023869591, "grad_norm": 1.7020406495492606, "learning_rate": 2.172107210072036e-05, "loss": 0.4908, "step": 17530 }, { "epoch": 1.7019212109450805, "grad_norm": 1.4911580170253027, "learning_rate": 2.1704847816211307e-05, "loss": 0.4832, "step": 17540 }, { "epoch": 1.702891519503202, "grad_norm": 2.086292759031188, "learning_rate": 2.1688623531702253e-05, "loss": 0.5129, "step": 17550 }, { "epoch": 1.7038618280613234, "grad_norm": 1.5359738880009668, "learning_rate": 2.16723992471932e-05, "loss": 0.4512, "step": 17560 }, { "epoch": 1.704832136619445, "grad_norm": 1.850242190602371, "learning_rate": 2.165617496268415e-05, "loss": 0.5224, "step": 17570 }, { "epoch": 1.7058024451775666, "grad_norm": 1.2439323678887082, "learning_rate": 2.1639950678175096e-05, "loss": 0.4625, "step": 17580 }, { "epoch": 1.706772753735688, "grad_norm": 1.5186947478659967, "learning_rate": 2.1623726393666042e-05, "loss": 0.5527, "step": 17590 }, { "epoch": 1.7077430622938095, "grad_norm": 2.1869553971014035, "learning_rate": 2.160750210915699e-05, "loss": 0.499, "step": 17600 }, { "epoch": 1.7087133708519309, "grad_norm": 2.1896368464295968, "learning_rate": 2.1591277824647935e-05, "loss": 0.5182, "step": 17610 }, { "epoch": 1.7096836794100523, "grad_norm": 2.5508627899244853, "learning_rate": 2.157505354013888e-05, "loss": 0.529, "step": 17620 }, { "epoch": 1.710653987968174, "grad_norm": 2.0451418684052527, "learning_rate": 2.1558829255629828e-05, "loss": 0.4789, "step": 17630 }, { "epoch": 1.7116242965262953, "grad_norm": 2.358852269881603, "learning_rate": 2.1542604971120774e-05, "loss": 0.4701, "step": 17640 }, { "epoch": 1.712594605084417, "grad_norm": 1.65876380363299, "learning_rate": 2.152638068661172e-05, "loss": 0.5078, "step": 17650 }, { "epoch": 1.7135649136425384, "grad_norm": 1.819260137292149, "learning_rate": 2.151015640210267e-05, "loss": 0.5115, "step": 17660 }, { "epoch": 1.7145352222006598, "grad_norm": 1.9278786735506124, "learning_rate": 2.1493932117593616e-05, "loss": 0.526, "step": 17670 }, { "epoch": 1.7155055307587812, "grad_norm": 1.647908501988862, "learning_rate": 2.1477707833084563e-05, "loss": 0.4455, "step": 17680 }, { "epoch": 1.7164758393169026, "grad_norm": 2.211325829098572, "learning_rate": 2.146148354857551e-05, "loss": 0.5251, "step": 17690 }, { "epoch": 1.7174461478750243, "grad_norm": 2.068602886260178, "learning_rate": 2.1445259264066456e-05, "loss": 0.5173, "step": 17700 }, { "epoch": 1.718416456433146, "grad_norm": 1.7976369940318933, "learning_rate": 2.1429034979557402e-05, "loss": 0.4997, "step": 17710 }, { "epoch": 1.7193867649912673, "grad_norm": 1.8518665259593716, "learning_rate": 2.141281069504835e-05, "loss": 0.5221, "step": 17720 }, { "epoch": 1.7203570735493887, "grad_norm": 2.162010082710502, "learning_rate": 2.1396586410539295e-05, "loss": 0.5096, "step": 17730 }, { "epoch": 1.7213273821075101, "grad_norm": 2.3175061118466704, "learning_rate": 2.138036212603024e-05, "loss": 0.5203, "step": 17740 }, { "epoch": 1.7222976906656315, "grad_norm": 2.0036851029306364, "learning_rate": 2.136413784152119e-05, "loss": 0.5312, "step": 17750 }, { "epoch": 1.7232679992237532, "grad_norm": 2.232799745735125, "learning_rate": 2.1347913557012137e-05, "loss": 0.477, "step": 17760 }, { "epoch": 1.7242383077818746, "grad_norm": 2.0579250461700225, "learning_rate": 2.1331689272503084e-05, "loss": 0.5104, "step": 17770 }, { "epoch": 1.7252086163399962, "grad_norm": 1.9067412132990194, "learning_rate": 2.131546498799403e-05, "loss": 0.4663, "step": 17780 }, { "epoch": 1.7261789248981176, "grad_norm": 1.6166777364493794, "learning_rate": 2.1299240703484976e-05, "loss": 0.5091, "step": 17790 }, { "epoch": 1.727149233456239, "grad_norm": 1.8400065423176315, "learning_rate": 2.1283016418975923e-05, "loss": 0.4577, "step": 17800 }, { "epoch": 1.7281195420143605, "grad_norm": 1.8343636819724598, "learning_rate": 2.126679213446687e-05, "loss": 0.5171, "step": 17810 }, { "epoch": 1.729089850572482, "grad_norm": 2.035066789255769, "learning_rate": 2.1250567849957816e-05, "loss": 0.4809, "step": 17820 }, { "epoch": 1.7300601591306035, "grad_norm": 1.9750470965118032, "learning_rate": 2.1234343565448765e-05, "loss": 0.4681, "step": 17830 }, { "epoch": 1.7310304676887251, "grad_norm": 1.707449728088738, "learning_rate": 2.1218119280939712e-05, "loss": 0.4768, "step": 17840 }, { "epoch": 1.7320007762468466, "grad_norm": 2.091683187172791, "learning_rate": 2.1201894996430658e-05, "loss": 0.4967, "step": 17850 }, { "epoch": 1.732971084804968, "grad_norm": 1.8317230891625513, "learning_rate": 2.1185670711921604e-05, "loss": 0.4617, "step": 17860 }, { "epoch": 1.7339413933630894, "grad_norm": 1.8581590049450867, "learning_rate": 2.116944642741255e-05, "loss": 0.453, "step": 17870 }, { "epoch": 1.7349117019212108, "grad_norm": 2.394138871900483, "learning_rate": 2.1153222142903497e-05, "loss": 0.5231, "step": 17880 }, { "epoch": 1.7358820104793324, "grad_norm": 1.8303455092887013, "learning_rate": 2.1136997858394444e-05, "loss": 0.5045, "step": 17890 }, { "epoch": 1.736852319037454, "grad_norm": 1.7040025873966649, "learning_rate": 2.1120773573885393e-05, "loss": 0.5185, "step": 17900 }, { "epoch": 1.7378226275955755, "grad_norm": 1.721795865054807, "learning_rate": 2.110454928937634e-05, "loss": 0.5153, "step": 17910 }, { "epoch": 1.738792936153697, "grad_norm": 2.195542603436347, "learning_rate": 2.1088325004867286e-05, "loss": 0.515, "step": 17920 }, { "epoch": 1.7397632447118183, "grad_norm": 1.9475497865392444, "learning_rate": 2.1072100720358236e-05, "loss": 0.6134, "step": 17930 }, { "epoch": 1.7407335532699397, "grad_norm": 2.007827581925297, "learning_rate": 2.1055876435849182e-05, "loss": 0.4483, "step": 17940 }, { "epoch": 1.7417038618280614, "grad_norm": 2.427289180607889, "learning_rate": 2.103965215134013e-05, "loss": 0.4791, "step": 17950 }, { "epoch": 1.7426741703861828, "grad_norm": 2.090548009394418, "learning_rate": 2.1023427866831075e-05, "loss": 0.4485, "step": 17960 }, { "epoch": 1.7436444789443044, "grad_norm": 1.4231148878110762, "learning_rate": 2.100720358232202e-05, "loss": 0.4819, "step": 17970 }, { "epoch": 1.7446147875024258, "grad_norm": 1.952695177483555, "learning_rate": 2.0990979297812968e-05, "loss": 0.4615, "step": 17980 }, { "epoch": 1.7455850960605472, "grad_norm": 1.5076062020543768, "learning_rate": 2.0974755013303914e-05, "loss": 0.5145, "step": 17990 }, { "epoch": 1.7465554046186686, "grad_norm": 2.385317508868708, "learning_rate": 2.095853072879486e-05, "loss": 0.4667, "step": 18000 }, { "epoch": 1.7465554046186686, "eval_loss": 0.6351094841957092, "eval_runtime": 2471.0101, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 18000 }, { "epoch": 1.74752571317679, "grad_norm": 1.5204879350818516, "learning_rate": 2.0942306444285807e-05, "loss": 0.4808, "step": 18010 }, { "epoch": 1.7484960217349117, "grad_norm": 2.363404313519829, "learning_rate": 2.0926082159776757e-05, "loss": 0.4829, "step": 18020 }, { "epoch": 1.7494663302930333, "grad_norm": 1.771660432749116, "learning_rate": 2.0909857875267703e-05, "loss": 0.4838, "step": 18030 }, { "epoch": 1.7504366388511547, "grad_norm": 1.754309450094576, "learning_rate": 2.089363359075865e-05, "loss": 0.5289, "step": 18040 }, { "epoch": 1.7514069474092762, "grad_norm": 1.7394710721825013, "learning_rate": 2.0877409306249596e-05, "loss": 0.4681, "step": 18050 }, { "epoch": 1.7523772559673976, "grad_norm": 1.903002291309086, "learning_rate": 2.0861185021740542e-05, "loss": 0.4352, "step": 18060 }, { "epoch": 1.753347564525519, "grad_norm": 1.8599137833426156, "learning_rate": 2.084496073723149e-05, "loss": 0.5158, "step": 18070 }, { "epoch": 1.7543178730836406, "grad_norm": 2.416977234001108, "learning_rate": 2.0828736452722435e-05, "loss": 0.458, "step": 18080 }, { "epoch": 1.755288181641762, "grad_norm": 2.0471286021548876, "learning_rate": 2.081251216821338e-05, "loss": 0.5093, "step": 18090 }, { "epoch": 1.7562584901998837, "grad_norm": 1.676028789821819, "learning_rate": 2.079628788370433e-05, "loss": 0.4682, "step": 18100 }, { "epoch": 1.757228798758005, "grad_norm": 2.27723774043769, "learning_rate": 2.0780063599195278e-05, "loss": 0.4844, "step": 18110 }, { "epoch": 1.7581991073161265, "grad_norm": 1.9065062110447546, "learning_rate": 2.0763839314686224e-05, "loss": 0.4913, "step": 18120 }, { "epoch": 1.759169415874248, "grad_norm": 1.499218120899005, "learning_rate": 2.074761503017717e-05, "loss": 0.4764, "step": 18130 }, { "epoch": 1.7601397244323695, "grad_norm": 1.7111231535151605, "learning_rate": 2.0731390745668117e-05, "loss": 0.4376, "step": 18140 }, { "epoch": 1.761110032990491, "grad_norm": 2.1731054208801406, "learning_rate": 2.0715166461159063e-05, "loss": 0.5115, "step": 18150 }, { "epoch": 1.7620803415486126, "grad_norm": 2.537816428144228, "learning_rate": 2.069894217665001e-05, "loss": 0.4872, "step": 18160 }, { "epoch": 1.763050650106734, "grad_norm": 1.8860270703852153, "learning_rate": 2.0682717892140956e-05, "loss": 0.544, "step": 18170 }, { "epoch": 1.7640209586648554, "grad_norm": 2.021054656113362, "learning_rate": 2.0666493607631902e-05, "loss": 0.4595, "step": 18180 }, { "epoch": 1.7649912672229768, "grad_norm": 1.7916097460233942, "learning_rate": 2.0650269323122852e-05, "loss": 0.5098, "step": 18190 }, { "epoch": 1.7659615757810982, "grad_norm": 2.3100480138537, "learning_rate": 2.06340450386138e-05, "loss": 0.4801, "step": 18200 }, { "epoch": 1.7669318843392199, "grad_norm": 2.4377132473167413, "learning_rate": 2.0617820754104745e-05, "loss": 0.4507, "step": 18210 }, { "epoch": 1.7679021928973415, "grad_norm": 2.303644526731782, "learning_rate": 2.060159646959569e-05, "loss": 0.4557, "step": 18220 }, { "epoch": 1.768872501455463, "grad_norm": 2.104570073246246, "learning_rate": 2.0585372185086638e-05, "loss": 0.4657, "step": 18230 }, { "epoch": 1.7698428100135843, "grad_norm": 1.902037691736978, "learning_rate": 2.0569147900577584e-05, "loss": 0.4885, "step": 18240 }, { "epoch": 1.7708131185717058, "grad_norm": 1.8321649978057852, "learning_rate": 2.055292361606853e-05, "loss": 0.4953, "step": 18250 }, { "epoch": 1.7717834271298272, "grad_norm": 1.7318686323178791, "learning_rate": 2.0536699331559477e-05, "loss": 0.4993, "step": 18260 }, { "epoch": 1.7727537356879488, "grad_norm": 1.6339592524860076, "learning_rate": 2.0520475047050423e-05, "loss": 0.5311, "step": 18270 }, { "epoch": 1.7737240442460702, "grad_norm": 2.0377735402561448, "learning_rate": 2.0504250762541373e-05, "loss": 0.4785, "step": 18280 }, { "epoch": 1.7746943528041919, "grad_norm": 1.4282663698707199, "learning_rate": 2.0488026478032323e-05, "loss": 0.5077, "step": 18290 }, { "epoch": 1.7756646613623133, "grad_norm": 1.9149293437143238, "learning_rate": 2.047180219352327e-05, "loss": 0.5319, "step": 18300 }, { "epoch": 1.7766349699204347, "grad_norm": 2.1578236484875077, "learning_rate": 2.0455577909014215e-05, "loss": 0.4928, "step": 18310 }, { "epoch": 1.777605278478556, "grad_norm": 2.107005500307039, "learning_rate": 2.0439353624505162e-05, "loss": 0.4969, "step": 18320 }, { "epoch": 1.7785755870366775, "grad_norm": 1.9793180118959242, "learning_rate": 2.0423129339996108e-05, "loss": 0.4585, "step": 18330 }, { "epoch": 1.7795458955947991, "grad_norm": 1.8333976168598185, "learning_rate": 2.0406905055487055e-05, "loss": 0.4982, "step": 18340 }, { "epoch": 1.7805162041529208, "grad_norm": 1.9049818949759387, "learning_rate": 2.0390680770978e-05, "loss": 0.494, "step": 18350 }, { "epoch": 1.7814865127110422, "grad_norm": 1.9993776566406598, "learning_rate": 2.0374456486468947e-05, "loss": 0.4918, "step": 18360 }, { "epoch": 1.7824568212691636, "grad_norm": 1.901331422597336, "learning_rate": 2.0358232201959894e-05, "loss": 0.5138, "step": 18370 }, { "epoch": 1.783427129827285, "grad_norm": 1.6972755513639741, "learning_rate": 2.0342007917450843e-05, "loss": 0.4715, "step": 18380 }, { "epoch": 1.7843974383854064, "grad_norm": 1.7683176039213586, "learning_rate": 2.032578363294179e-05, "loss": 0.4917, "step": 18390 }, { "epoch": 1.785367746943528, "grad_norm": 2.473174371442726, "learning_rate": 2.0309559348432736e-05, "loss": 0.5202, "step": 18400 }, { "epoch": 1.7863380555016495, "grad_norm": 1.5926803956510256, "learning_rate": 2.0293335063923683e-05, "loss": 0.4988, "step": 18410 }, { "epoch": 1.7873083640597711, "grad_norm": 2.291647967076523, "learning_rate": 2.027711077941463e-05, "loss": 0.4824, "step": 18420 }, { "epoch": 1.7882786726178925, "grad_norm": 1.8562307329324135, "learning_rate": 2.0260886494905575e-05, "loss": 0.4563, "step": 18430 }, { "epoch": 1.789248981176014, "grad_norm": 2.0376982504169225, "learning_rate": 2.0244662210396522e-05, "loss": 0.5306, "step": 18440 }, { "epoch": 1.7902192897341354, "grad_norm": 2.2229061869152287, "learning_rate": 2.0228437925887468e-05, "loss": 0.4992, "step": 18450 }, { "epoch": 1.791189598292257, "grad_norm": 1.9381177105445806, "learning_rate": 2.0212213641378418e-05, "loss": 0.5163, "step": 18460 }, { "epoch": 1.7921599068503784, "grad_norm": 2.2038889339934276, "learning_rate": 2.0195989356869364e-05, "loss": 0.4872, "step": 18470 }, { "epoch": 1.7931302154085, "grad_norm": 1.7467002610372642, "learning_rate": 2.017976507236031e-05, "loss": 0.4876, "step": 18480 }, { "epoch": 1.7941005239666215, "grad_norm": 1.80516559021005, "learning_rate": 2.0163540787851257e-05, "loss": 0.5155, "step": 18490 }, { "epoch": 1.7950708325247429, "grad_norm": 1.6841944257951464, "learning_rate": 2.0147316503342203e-05, "loss": 0.485, "step": 18500 }, { "epoch": 1.7950708325247429, "eval_loss": 0.634519636631012, "eval_runtime": 2476.7893, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 18500 }, { "epoch": 1.7960411410828643, "grad_norm": 1.9439021855874303, "learning_rate": 2.013109221883315e-05, "loss": 0.452, "step": 18510 }, { "epoch": 1.7970114496409857, "grad_norm": 2.2513317080100226, "learning_rate": 2.0114867934324096e-05, "loss": 0.5325, "step": 18520 }, { "epoch": 1.7979817581991073, "grad_norm": 2.08111292912013, "learning_rate": 2.0098643649815043e-05, "loss": 0.5473, "step": 18530 }, { "epoch": 1.798952066757229, "grad_norm": 1.5733834399311872, "learning_rate": 2.008241936530599e-05, "loss": 0.477, "step": 18540 }, { "epoch": 1.7999223753153504, "grad_norm": 2.8502682562631954, "learning_rate": 2.006619508079694e-05, "loss": 0.5206, "step": 18550 }, { "epoch": 1.8008926838734718, "grad_norm": 1.753275957528235, "learning_rate": 2.0049970796287885e-05, "loss": 0.4547, "step": 18560 }, { "epoch": 1.8018629924315932, "grad_norm": 2.1314672834115473, "learning_rate": 2.003374651177883e-05, "loss": 0.5419, "step": 18570 }, { "epoch": 1.8028333009897146, "grad_norm": 1.9602167097784182, "learning_rate": 2.0017522227269778e-05, "loss": 0.5155, "step": 18580 }, { "epoch": 1.8038036095478363, "grad_norm": 1.938753313860514, "learning_rate": 2.0001297942760724e-05, "loss": 0.5068, "step": 18590 }, { "epoch": 1.8047739181059577, "grad_norm": 2.5245908920622924, "learning_rate": 1.998507365825167e-05, "loss": 0.4823, "step": 18600 }, { "epoch": 1.8057442266640793, "grad_norm": 1.8472353172834934, "learning_rate": 1.9968849373742617e-05, "loss": 0.5307, "step": 18610 }, { "epoch": 1.8067145352222007, "grad_norm": 2.119846040052731, "learning_rate": 1.9952625089233563e-05, "loss": 0.5497, "step": 18620 }, { "epoch": 1.8076848437803221, "grad_norm": 1.8254352104689588, "learning_rate": 1.9936400804724513e-05, "loss": 0.5343, "step": 18630 }, { "epoch": 1.8086551523384435, "grad_norm": 2.286926975318207, "learning_rate": 1.992017652021546e-05, "loss": 0.5079, "step": 18640 }, { "epoch": 1.809625460896565, "grad_norm": 2.145456846814811, "learning_rate": 1.9903952235706406e-05, "loss": 0.4792, "step": 18650 }, { "epoch": 1.8105957694546866, "grad_norm": 2.1237990360529615, "learning_rate": 1.9887727951197352e-05, "loss": 0.5221, "step": 18660 }, { "epoch": 1.8115660780128082, "grad_norm": 1.9365349043963929, "learning_rate": 1.98715036666883e-05, "loss": 0.4994, "step": 18670 }, { "epoch": 1.8125363865709296, "grad_norm": 2.1477510328366543, "learning_rate": 1.985527938217925e-05, "loss": 0.5051, "step": 18680 }, { "epoch": 1.813506695129051, "grad_norm": 2.24009021245378, "learning_rate": 1.9839055097670195e-05, "loss": 0.4411, "step": 18690 }, { "epoch": 1.8144770036871725, "grad_norm": 2.161147622755333, "learning_rate": 1.982283081316114e-05, "loss": 0.4947, "step": 18700 }, { "epoch": 1.8154473122452939, "grad_norm": 2.5777752480302296, "learning_rate": 1.9806606528652088e-05, "loss": 0.5027, "step": 18710 }, { "epoch": 1.8164176208034155, "grad_norm": 1.5289437881489423, "learning_rate": 1.9790382244143034e-05, "loss": 0.5276, "step": 18720 }, { "epoch": 1.817387929361537, "grad_norm": 1.808692912848697, "learning_rate": 1.977415795963398e-05, "loss": 0.5155, "step": 18730 }, { "epoch": 1.8183582379196586, "grad_norm": 1.7568825751279589, "learning_rate": 1.975793367512493e-05, "loss": 0.4658, "step": 18740 }, { "epoch": 1.81932854647778, "grad_norm": 1.5226619183552335, "learning_rate": 1.9741709390615877e-05, "loss": 0.4764, "step": 18750 }, { "epoch": 1.8202988550359014, "grad_norm": 1.5664856097734037, "learning_rate": 1.9725485106106823e-05, "loss": 0.4503, "step": 18760 }, { "epoch": 1.8212691635940228, "grad_norm": 1.7127846075920363, "learning_rate": 1.970926082159777e-05, "loss": 0.4558, "step": 18770 }, { "epoch": 1.8222394721521444, "grad_norm": 2.1030557687163007, "learning_rate": 1.9693036537088716e-05, "loss": 0.5578, "step": 18780 }, { "epoch": 1.8232097807102658, "grad_norm": 2.0907960296944474, "learning_rate": 1.9676812252579662e-05, "loss": 0.4974, "step": 18790 }, { "epoch": 1.8241800892683875, "grad_norm": 1.4612012638589904, "learning_rate": 1.966058796807061e-05, "loss": 0.4931, "step": 18800 }, { "epoch": 1.825150397826509, "grad_norm": 1.6970216305431254, "learning_rate": 1.9644363683561555e-05, "loss": 0.4691, "step": 18810 }, { "epoch": 1.8261207063846303, "grad_norm": 2.4415425125364125, "learning_rate": 1.9628139399052505e-05, "loss": 0.5249, "step": 18820 }, { "epoch": 1.8270910149427517, "grad_norm": 1.7771990898290009, "learning_rate": 1.961191511454345e-05, "loss": 0.5019, "step": 18830 }, { "epoch": 1.8280613235008731, "grad_norm": 1.7839717190667712, "learning_rate": 1.9595690830034397e-05, "loss": 0.5052, "step": 18840 }, { "epoch": 1.8290316320589948, "grad_norm": 2.209198913639542, "learning_rate": 1.9579466545525344e-05, "loss": 0.5235, "step": 18850 }, { "epoch": 1.8300019406171164, "grad_norm": 2.200681320097298, "learning_rate": 1.956324226101629e-05, "loss": 0.5302, "step": 18860 }, { "epoch": 1.8309722491752378, "grad_norm": 2.000921134970765, "learning_rate": 1.9547017976507237e-05, "loss": 0.4648, "step": 18870 }, { "epoch": 1.8319425577333592, "grad_norm": 2.623565257277865, "learning_rate": 1.9530793691998183e-05, "loss": 0.4855, "step": 18880 }, { "epoch": 1.8329128662914806, "grad_norm": 1.8550961862060569, "learning_rate": 1.951456940748913e-05, "loss": 0.5135, "step": 18890 }, { "epoch": 1.833883174849602, "grad_norm": 2.0777083761268718, "learning_rate": 1.9498345122980076e-05, "loss": 0.5236, "step": 18900 }, { "epoch": 1.8348534834077237, "grad_norm": 2.2339320492583337, "learning_rate": 1.9482120838471025e-05, "loss": 0.5336, "step": 18910 }, { "epoch": 1.835823791965845, "grad_norm": 2.2130179741978173, "learning_rate": 1.9465896553961972e-05, "loss": 0.4468, "step": 18920 }, { "epoch": 1.8367941005239667, "grad_norm": 1.9136609274036684, "learning_rate": 1.9449672269452918e-05, "loss": 0.516, "step": 18930 }, { "epoch": 1.8377644090820882, "grad_norm": 1.9319944175941055, "learning_rate": 1.9433447984943865e-05, "loss": 0.4837, "step": 18940 }, { "epoch": 1.8387347176402096, "grad_norm": 1.8144243996167448, "learning_rate": 1.941722370043481e-05, "loss": 0.5388, "step": 18950 }, { "epoch": 1.839705026198331, "grad_norm": 1.9871922674209272, "learning_rate": 1.9400999415925757e-05, "loss": 0.5543, "step": 18960 }, { "epoch": 1.8406753347564524, "grad_norm": 2.1436846647503707, "learning_rate": 1.9384775131416704e-05, "loss": 0.4787, "step": 18970 }, { "epoch": 1.841645643314574, "grad_norm": 1.926685455696982, "learning_rate": 1.936855084690765e-05, "loss": 0.5096, "step": 18980 }, { "epoch": 1.8426159518726957, "grad_norm": 2.342596116742962, "learning_rate": 1.93523265623986e-05, "loss": 0.522, "step": 18990 }, { "epoch": 1.843586260430817, "grad_norm": 2.255501885540124, "learning_rate": 1.9336102277889546e-05, "loss": 0.5377, "step": 19000 }, { "epoch": 1.843586260430817, "eval_loss": 0.6318312883377075, "eval_runtime": 2473.7549, "eval_samples_per_second": 0.724, "eval_steps_per_second": 0.362, "step": 19000 }, { "epoch": 1.8445565689889385, "grad_norm": 2.1596583204757884, "learning_rate": 1.9319877993380493e-05, "loss": 0.5629, "step": 19010 }, { "epoch": 1.84552687754706, "grad_norm": 1.8190379524057834, "learning_rate": 1.930365370887144e-05, "loss": 0.4796, "step": 19020 }, { "epoch": 1.8464971861051813, "grad_norm": 1.019855687196459, "learning_rate": 1.9287429424362385e-05, "loss": 0.471, "step": 19030 }, { "epoch": 1.847467494663303, "grad_norm": 1.9569552509020551, "learning_rate": 1.9271205139853332e-05, "loss": 0.5126, "step": 19040 }, { "epoch": 1.8484378032214244, "grad_norm": 2.1870238918367524, "learning_rate": 1.9254980855344278e-05, "loss": 0.4865, "step": 19050 }, { "epoch": 1.849408111779546, "grad_norm": 1.594729244733264, "learning_rate": 1.9238756570835225e-05, "loss": 0.4802, "step": 19060 }, { "epoch": 1.8503784203376674, "grad_norm": 2.1426009139912843, "learning_rate": 1.9222532286326174e-05, "loss": 0.5303, "step": 19070 }, { "epoch": 1.8513487288957888, "grad_norm": 1.7760248540972265, "learning_rate": 1.920630800181712e-05, "loss": 0.4948, "step": 19080 }, { "epoch": 1.8523190374539102, "grad_norm": 1.6725671761260958, "learning_rate": 1.919008371730807e-05, "loss": 0.4922, "step": 19090 }, { "epoch": 1.8532893460120319, "grad_norm": 1.9565774055440965, "learning_rate": 1.9173859432799017e-05, "loss": 0.5658, "step": 19100 }, { "epoch": 1.8542596545701533, "grad_norm": 2.231350831039019, "learning_rate": 1.9157635148289963e-05, "loss": 0.4906, "step": 19110 }, { "epoch": 1.855229963128275, "grad_norm": 1.5993699848453529, "learning_rate": 1.914141086378091e-05, "loss": 0.4796, "step": 19120 }, { "epoch": 1.8562002716863963, "grad_norm": 2.0780226359466387, "learning_rate": 1.9125186579271856e-05, "loss": 0.4773, "step": 19130 }, { "epoch": 1.8571705802445178, "grad_norm": 2.01763881138038, "learning_rate": 1.9108962294762802e-05, "loss": 0.4984, "step": 19140 }, { "epoch": 1.8581408888026392, "grad_norm": 2.240723917017317, "learning_rate": 1.909273801025375e-05, "loss": 0.4559, "step": 19150 }, { "epoch": 1.8591111973607606, "grad_norm": 1.849249605812625, "learning_rate": 1.9076513725744695e-05, "loss": 0.5445, "step": 19160 }, { "epoch": 1.8600815059188822, "grad_norm": 1.7656331962779384, "learning_rate": 1.906028944123564e-05, "loss": 0.466, "step": 19170 }, { "epoch": 1.8610518144770039, "grad_norm": 1.9723478771428924, "learning_rate": 1.904406515672659e-05, "loss": 0.4951, "step": 19180 }, { "epoch": 1.8620221230351253, "grad_norm": 2.133741060165629, "learning_rate": 1.9027840872217538e-05, "loss": 0.5064, "step": 19190 }, { "epoch": 1.8629924315932467, "grad_norm": 2.267317304504692, "learning_rate": 1.9011616587708484e-05, "loss": 0.4774, "step": 19200 }, { "epoch": 1.863962740151368, "grad_norm": 1.6590470147649787, "learning_rate": 1.899539230319943e-05, "loss": 0.4684, "step": 19210 }, { "epoch": 1.8649330487094895, "grad_norm": 1.7428570168415043, "learning_rate": 1.8979168018690377e-05, "loss": 0.494, "step": 19220 }, { "epoch": 1.8659033572676111, "grad_norm": 1.6212375134091286, "learning_rate": 1.8962943734181323e-05, "loss": 0.481, "step": 19230 }, { "epoch": 1.8668736658257326, "grad_norm": 2.0261745446913717, "learning_rate": 1.894671944967227e-05, "loss": 0.5107, "step": 19240 }, { "epoch": 1.8678439743838542, "grad_norm": 1.7751731877262686, "learning_rate": 1.8930495165163216e-05, "loss": 0.4615, "step": 19250 }, { "epoch": 1.8688142829419756, "grad_norm": 1.5566178962662132, "learning_rate": 1.8914270880654162e-05, "loss": 0.5061, "step": 19260 }, { "epoch": 1.869784591500097, "grad_norm": 2.0648415585072137, "learning_rate": 1.8898046596145112e-05, "loss": 0.4721, "step": 19270 }, { "epoch": 1.8707549000582184, "grad_norm": 1.8255664078089486, "learning_rate": 1.888182231163606e-05, "loss": 0.4652, "step": 19280 }, { "epoch": 1.8717252086163398, "grad_norm": 2.144758151637292, "learning_rate": 1.8865598027127005e-05, "loss": 0.4391, "step": 19290 }, { "epoch": 1.8726955171744615, "grad_norm": 1.803874603824675, "learning_rate": 1.884937374261795e-05, "loss": 0.492, "step": 19300 }, { "epoch": 1.8736658257325831, "grad_norm": 2.1603319792398685, "learning_rate": 1.8833149458108898e-05, "loss": 0.4602, "step": 19310 }, { "epoch": 1.8746361342907045, "grad_norm": 1.931469165672542, "learning_rate": 1.8816925173599844e-05, "loss": 0.479, "step": 19320 }, { "epoch": 1.875606442848826, "grad_norm": 2.289215850770512, "learning_rate": 1.880070088909079e-05, "loss": 0.475, "step": 19330 }, { "epoch": 1.8765767514069474, "grad_norm": 2.0548701793141384, "learning_rate": 1.8784476604581737e-05, "loss": 0.4523, "step": 19340 }, { "epoch": 1.8775470599650688, "grad_norm": 1.6304292402973444, "learning_rate": 1.8768252320072687e-05, "loss": 0.4704, "step": 19350 }, { "epoch": 1.8785173685231904, "grad_norm": 1.3760981254681826, "learning_rate": 1.8752028035563633e-05, "loss": 0.509, "step": 19360 }, { "epoch": 1.8794876770813118, "grad_norm": 2.1389308936606364, "learning_rate": 1.873580375105458e-05, "loss": 0.4791, "step": 19370 }, { "epoch": 1.8804579856394334, "grad_norm": 2.0673076161036663, "learning_rate": 1.8719579466545526e-05, "loss": 0.4902, "step": 19380 }, { "epoch": 1.8814282941975549, "grad_norm": 1.861431210462014, "learning_rate": 1.8703355182036472e-05, "loss": 0.4719, "step": 19390 }, { "epoch": 1.8823986027556763, "grad_norm": 1.9788908151587594, "learning_rate": 1.868713089752742e-05, "loss": 0.5127, "step": 19400 }, { "epoch": 1.8833689113137977, "grad_norm": 1.9487695948736958, "learning_rate": 1.8670906613018365e-05, "loss": 0.4469, "step": 19410 }, { "epoch": 1.8843392198719193, "grad_norm": 1.8048413444236069, "learning_rate": 1.865468232850931e-05, "loss": 0.5345, "step": 19420 }, { "epoch": 1.8853095284300407, "grad_norm": 1.8168395167881652, "learning_rate": 1.8638458044000258e-05, "loss": 0.4957, "step": 19430 }, { "epoch": 1.8862798369881624, "grad_norm": 1.9900462235084369, "learning_rate": 1.8622233759491207e-05, "loss": 0.4928, "step": 19440 }, { "epoch": 1.8872501455462838, "grad_norm": 1.9978639659195843, "learning_rate": 1.8606009474982154e-05, "loss": 0.5085, "step": 19450 }, { "epoch": 1.8882204541044052, "grad_norm": 1.9870398133754268, "learning_rate": 1.8589785190473104e-05, "loss": 0.4857, "step": 19460 }, { "epoch": 1.8891907626625266, "grad_norm": 1.9723654086733247, "learning_rate": 1.857356090596405e-05, "loss": 0.4658, "step": 19470 }, { "epoch": 1.890161071220648, "grad_norm": 1.8141064421371964, "learning_rate": 1.8557336621454996e-05, "loss": 0.4881, "step": 19480 }, { "epoch": 1.8911313797787697, "grad_norm": 1.6779273277493256, "learning_rate": 1.8541112336945943e-05, "loss": 0.4798, "step": 19490 }, { "epoch": 1.8921016883368913, "grad_norm": 1.901625964641795, "learning_rate": 1.852488805243689e-05, "loss": 0.5199, "step": 19500 }, { "epoch": 1.8921016883368913, "eval_loss": 0.631912350654602, "eval_runtime": 2473.0661, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.362, "step": 19500 }, { "epoch": 1.8930719968950127, "grad_norm": 1.7019367377384935, "learning_rate": 1.8508663767927836e-05, "loss": 0.4693, "step": 19510 }, { "epoch": 1.8940423054531341, "grad_norm": 1.9777898503801572, "learning_rate": 1.8492439483418782e-05, "loss": 0.4907, "step": 19520 }, { "epoch": 1.8950126140112555, "grad_norm": 2.613311730337197, "learning_rate": 1.8476215198909728e-05, "loss": 0.5179, "step": 19530 }, { "epoch": 1.895982922569377, "grad_norm": 1.3976847929675515, "learning_rate": 1.8459990914400678e-05, "loss": 0.4316, "step": 19540 }, { "epoch": 1.8969532311274986, "grad_norm": 2.1428783623843715, "learning_rate": 1.8443766629891624e-05, "loss": 0.5275, "step": 19550 }, { "epoch": 1.89792353968562, "grad_norm": 1.6878966830886573, "learning_rate": 1.842754234538257e-05, "loss": 0.5061, "step": 19560 }, { "epoch": 1.8988938482437416, "grad_norm": 1.6660923395832288, "learning_rate": 1.8411318060873517e-05, "loss": 0.4846, "step": 19570 }, { "epoch": 1.899864156801863, "grad_norm": 1.88350077160719, "learning_rate": 1.8395093776364464e-05, "loss": 0.4558, "step": 19580 }, { "epoch": 1.9008344653599845, "grad_norm": 1.8180585161160603, "learning_rate": 1.837886949185541e-05, "loss": 0.4462, "step": 19590 }, { "epoch": 1.9018047739181059, "grad_norm": 2.17218483201875, "learning_rate": 1.8362645207346356e-05, "loss": 0.4732, "step": 19600 }, { "epoch": 1.9027750824762273, "grad_norm": 2.150072249394045, "learning_rate": 1.8346420922837303e-05, "loss": 0.4807, "step": 19610 }, { "epoch": 1.903745391034349, "grad_norm": 2.186590190383392, "learning_rate": 1.8330196638328253e-05, "loss": 0.5064, "step": 19620 }, { "epoch": 1.9047156995924706, "grad_norm": 1.9773135303028604, "learning_rate": 1.83139723538192e-05, "loss": 0.4987, "step": 19630 }, { "epoch": 1.905686008150592, "grad_norm": 1.7722699958849875, "learning_rate": 1.8297748069310145e-05, "loss": 0.493, "step": 19640 }, { "epoch": 1.9066563167087134, "grad_norm": 2.398748653045921, "learning_rate": 1.828152378480109e-05, "loss": 0.4632, "step": 19650 }, { "epoch": 1.9076266252668348, "grad_norm": 1.9381638418073313, "learning_rate": 1.8265299500292038e-05, "loss": 0.4879, "step": 19660 }, { "epoch": 1.9085969338249562, "grad_norm": 1.806142449327403, "learning_rate": 1.8249075215782984e-05, "loss": 0.4426, "step": 19670 }, { "epoch": 1.9095672423830778, "grad_norm": 1.8375555769454368, "learning_rate": 1.823285093127393e-05, "loss": 0.5133, "step": 19680 }, { "epoch": 1.9105375509411993, "grad_norm": 2.1900711143018703, "learning_rate": 1.8216626646764877e-05, "loss": 0.4783, "step": 19690 }, { "epoch": 1.911507859499321, "grad_norm": 1.7860565394856096, "learning_rate": 1.8200402362255824e-05, "loss": 0.5432, "step": 19700 }, { "epoch": 1.9124781680574423, "grad_norm": 1.5287991043658957, "learning_rate": 1.8184178077746773e-05, "loss": 0.4818, "step": 19710 }, { "epoch": 1.9134484766155637, "grad_norm": 1.9162344859036078, "learning_rate": 1.816795379323772e-05, "loss": 0.5134, "step": 19720 }, { "epoch": 1.9144187851736851, "grad_norm": 1.9361948714776511, "learning_rate": 1.8151729508728666e-05, "loss": 0.4705, "step": 19730 }, { "epoch": 1.9153890937318068, "grad_norm": 1.7948070446854587, "learning_rate": 1.8135505224219613e-05, "loss": 0.4713, "step": 19740 }, { "epoch": 1.9163594022899282, "grad_norm": 1.94331048872498, "learning_rate": 1.811928093971056e-05, "loss": 0.4636, "step": 19750 }, { "epoch": 1.9173297108480498, "grad_norm": 1.9083792067669565, "learning_rate": 1.8103056655201505e-05, "loss": 0.4985, "step": 19760 }, { "epoch": 1.9183000194061712, "grad_norm": 1.7889544788537133, "learning_rate": 1.808683237069245e-05, "loss": 0.465, "step": 19770 }, { "epoch": 1.9192703279642926, "grad_norm": 2.1133403229015606, "learning_rate": 1.8070608086183398e-05, "loss": 0.4962, "step": 19780 }, { "epoch": 1.920240636522414, "grad_norm": 2.119511574698095, "learning_rate": 1.8054383801674344e-05, "loss": 0.5265, "step": 19790 }, { "epoch": 1.9212109450805355, "grad_norm": 1.8530048672687303, "learning_rate": 1.8038159517165294e-05, "loss": 0.4674, "step": 19800 }, { "epoch": 1.922181253638657, "grad_norm": 2.060163549018839, "learning_rate": 1.802193523265624e-05, "loss": 0.5232, "step": 19810 }, { "epoch": 1.9231515621967787, "grad_norm": 1.582662573057051, "learning_rate": 1.8005710948147187e-05, "loss": 0.5146, "step": 19820 }, { "epoch": 1.9241218707549002, "grad_norm": 1.9973478575975316, "learning_rate": 1.7989486663638133e-05, "loss": 0.5294, "step": 19830 }, { "epoch": 1.9250921793130216, "grad_norm": 2.2324903918022088, "learning_rate": 1.797326237912908e-05, "loss": 0.5039, "step": 19840 }, { "epoch": 1.926062487871143, "grad_norm": 2.24550841489829, "learning_rate": 1.7957038094620026e-05, "loss": 0.4763, "step": 19850 }, { "epoch": 1.9270327964292644, "grad_norm": 1.9055133702740896, "learning_rate": 1.7940813810110976e-05, "loss": 0.4978, "step": 19860 }, { "epoch": 1.928003104987386, "grad_norm": 1.6202922943730298, "learning_rate": 1.7924589525601922e-05, "loss": 0.4467, "step": 19870 }, { "epoch": 1.9289734135455074, "grad_norm": 1.7946217889300042, "learning_rate": 1.790836524109287e-05, "loss": 0.4975, "step": 19880 }, { "epoch": 1.929943722103629, "grad_norm": 2.0837037474907776, "learning_rate": 1.7892140956583815e-05, "loss": 0.4488, "step": 19890 }, { "epoch": 1.9309140306617505, "grad_norm": 2.284301097020319, "learning_rate": 1.7875916672074765e-05, "loss": 0.5315, "step": 19900 }, { "epoch": 1.931884339219872, "grad_norm": 1.6410720035809963, "learning_rate": 1.785969238756571e-05, "loss": 0.5064, "step": 19910 }, { "epoch": 1.9328546477779933, "grad_norm": 1.913707413180516, "learning_rate": 1.7843468103056658e-05, "loss": 0.5226, "step": 19920 }, { "epoch": 1.9338249563361147, "grad_norm": 1.972749697853565, "learning_rate": 1.7827243818547604e-05, "loss": 0.4794, "step": 19930 }, { "epoch": 1.9347952648942364, "grad_norm": 2.4245035874963663, "learning_rate": 1.781101953403855e-05, "loss": 0.472, "step": 19940 }, { "epoch": 1.935765573452358, "grad_norm": 1.6532981118899497, "learning_rate": 1.7794795249529497e-05, "loss": 0.4926, "step": 19950 }, { "epoch": 1.9367358820104794, "grad_norm": 2.301368908039538, "learning_rate": 1.7778570965020443e-05, "loss": 0.4847, "step": 19960 }, { "epoch": 1.9377061905686008, "grad_norm": 1.9798901797296686, "learning_rate": 1.776234668051139e-05, "loss": 0.5041, "step": 19970 }, { "epoch": 1.9386764991267222, "grad_norm": 2.0914880764103825, "learning_rate": 1.774612239600234e-05, "loss": 0.4543, "step": 19980 }, { "epoch": 1.9396468076848437, "grad_norm": 1.1710098992705869, "learning_rate": 1.7729898111493286e-05, "loss": 0.4568, "step": 19990 }, { "epoch": 1.9406171162429653, "grad_norm": 2.475821372597333, "learning_rate": 1.7713673826984232e-05, "loss": 0.503, "step": 20000 }, { "epoch": 1.9406171162429653, "eval_loss": 0.6306876540184021, "eval_runtime": 2470.8815, "eval_samples_per_second": 0.725, "eval_steps_per_second": 0.363, "step": 20000 }, { "epoch": 1.9415874248010867, "grad_norm": 1.8135605973580828, "learning_rate": 1.769744954247518e-05, "loss": 0.5225, "step": 20010 }, { "epoch": 1.9425577333592083, "grad_norm": 1.8818540999249187, "learning_rate": 1.7681225257966125e-05, "loss": 0.4411, "step": 20020 }, { "epoch": 1.9435280419173298, "grad_norm": 2.3455403519185767, "learning_rate": 1.766500097345707e-05, "loss": 0.4618, "step": 20030 }, { "epoch": 1.9444983504754512, "grad_norm": 1.8781392474090728, "learning_rate": 1.7648776688948018e-05, "loss": 0.4757, "step": 20040 }, { "epoch": 1.9454686590335726, "grad_norm": 2.4837262255821004, "learning_rate": 1.7632552404438964e-05, "loss": 0.4983, "step": 20050 }, { "epoch": 1.9464389675916942, "grad_norm": 2.0371146667611963, "learning_rate": 1.761632811992991e-05, "loss": 0.4684, "step": 20060 }, { "epoch": 1.9474092761498156, "grad_norm": 1.9070841836100068, "learning_rate": 1.760010383542086e-05, "loss": 0.5148, "step": 20070 }, { "epoch": 1.9483795847079373, "grad_norm": 1.6764022613616618, "learning_rate": 1.7583879550911806e-05, "loss": 0.5129, "step": 20080 }, { "epoch": 1.9493498932660587, "grad_norm": 1.801575932413747, "learning_rate": 1.7567655266402753e-05, "loss": 0.4924, "step": 20090 }, { "epoch": 1.95032020182418, "grad_norm": 1.417352324977605, "learning_rate": 1.75514309818937e-05, "loss": 0.4871, "step": 20100 }, { "epoch": 1.9512905103823015, "grad_norm": 1.7430495978746263, "learning_rate": 1.7535206697384646e-05, "loss": 0.4906, "step": 20110 }, { "epoch": 1.952260818940423, "grad_norm": 1.9378606596579797, "learning_rate": 1.7518982412875592e-05, "loss": 0.4441, "step": 20120 }, { "epoch": 1.9532311274985446, "grad_norm": 2.024032521983931, "learning_rate": 1.750275812836654e-05, "loss": 0.4574, "step": 20130 }, { "epoch": 1.9542014360566662, "grad_norm": 1.9748616447898866, "learning_rate": 1.7486533843857485e-05, "loss": 0.4955, "step": 20140 }, { "epoch": 1.9551717446147876, "grad_norm": 2.0789037317300876, "learning_rate": 1.7470309559348435e-05, "loss": 0.5176, "step": 20150 }, { "epoch": 1.956142053172909, "grad_norm": 1.7669158731638315, "learning_rate": 1.745408527483938e-05, "loss": 0.4242, "step": 20160 }, { "epoch": 1.9571123617310304, "grad_norm": 2.15917406316717, "learning_rate": 1.7437860990330327e-05, "loss": 0.457, "step": 20170 }, { "epoch": 1.9580826702891518, "grad_norm": 1.7798687601178151, "learning_rate": 1.7421636705821274e-05, "loss": 0.467, "step": 20180 }, { "epoch": 1.9590529788472735, "grad_norm": 2.2050151044492154, "learning_rate": 1.740541242131222e-05, "loss": 0.4562, "step": 20190 }, { "epoch": 1.9600232874053949, "grad_norm": 1.7571600556165854, "learning_rate": 1.7389188136803166e-05, "loss": 0.4951, "step": 20200 }, { "epoch": 1.9609935959635165, "grad_norm": 2.370912740726444, "learning_rate": 1.7372963852294113e-05, "loss": 0.5177, "step": 20210 }, { "epoch": 1.961963904521638, "grad_norm": 1.9598530660426143, "learning_rate": 1.735673956778506e-05, "loss": 0.4898, "step": 20220 }, { "epoch": 1.9629342130797593, "grad_norm": 1.8458335418058476, "learning_rate": 1.7340515283276006e-05, "loss": 0.4685, "step": 20230 }, { "epoch": 1.9639045216378808, "grad_norm": 2.086603579490736, "learning_rate": 1.7324290998766955e-05, "loss": 0.4655, "step": 20240 }, { "epoch": 1.9648748301960022, "grad_norm": 1.8861003799697185, "learning_rate": 1.7308066714257902e-05, "loss": 0.5012, "step": 20250 }, { "epoch": 1.9658451387541238, "grad_norm": 2.0410888085121655, "learning_rate": 1.729184242974885e-05, "loss": 0.4686, "step": 20260 }, { "epoch": 1.9668154473122454, "grad_norm": 2.0486563420463293, "learning_rate": 1.7275618145239798e-05, "loss": 0.4971, "step": 20270 }, { "epoch": 1.9677857558703669, "grad_norm": 2.053946424813733, "learning_rate": 1.7259393860730744e-05, "loss": 0.5002, "step": 20280 }, { "epoch": 1.9687560644284883, "grad_norm": 1.933061743356704, "learning_rate": 1.724316957622169e-05, "loss": 0.5072, "step": 20290 }, { "epoch": 1.9697263729866097, "grad_norm": 2.066749720982141, "learning_rate": 1.7226945291712637e-05, "loss": 0.4928, "step": 20300 }, { "epoch": 1.970696681544731, "grad_norm": 1.55984280015245, "learning_rate": 1.7210721007203583e-05, "loss": 0.4681, "step": 20310 }, { "epoch": 1.9716669901028527, "grad_norm": 2.0658531156309254, "learning_rate": 1.719449672269453e-05, "loss": 0.4451, "step": 20320 }, { "epoch": 1.9726372986609741, "grad_norm": 1.56362708496437, "learning_rate": 1.7178272438185476e-05, "loss": 0.5221, "step": 20330 }, { "epoch": 1.9736076072190958, "grad_norm": 2.161774232597424, "learning_rate": 1.7162048153676426e-05, "loss": 0.5527, "step": 20340 }, { "epoch": 1.9745779157772172, "grad_norm": 2.411930244708304, "learning_rate": 1.7145823869167372e-05, "loss": 0.4742, "step": 20350 }, { "epoch": 1.9755482243353386, "grad_norm": 2.0274695839961803, "learning_rate": 1.712959958465832e-05, "loss": 0.5055, "step": 20360 }, { "epoch": 1.97651853289346, "grad_norm": 1.9046814027988292, "learning_rate": 1.7113375300149265e-05, "loss": 0.5018, "step": 20370 }, { "epoch": 1.9774888414515817, "grad_norm": 2.6745574351193437, "learning_rate": 1.709715101564021e-05, "loss": 0.5013, "step": 20380 }, { "epoch": 1.978459150009703, "grad_norm": 1.983809422276857, "learning_rate": 1.7080926731131158e-05, "loss": 0.506, "step": 20390 }, { "epoch": 1.9794294585678247, "grad_norm": 1.8313417501684919, "learning_rate": 1.7064702446622104e-05, "loss": 0.4913, "step": 20400 }, { "epoch": 1.9803997671259461, "grad_norm": 1.9488632706180251, "learning_rate": 1.704847816211305e-05, "loss": 0.4753, "step": 20410 }, { "epoch": 1.9813700756840675, "grad_norm": 2.1235067779704386, "learning_rate": 1.7032253877603997e-05, "loss": 0.5076, "step": 20420 }, { "epoch": 1.982340384242189, "grad_norm": 1.9628606937438253, "learning_rate": 1.7016029593094947e-05, "loss": 0.4488, "step": 20430 }, { "epoch": 1.9833106928003104, "grad_norm": 2.053405187246396, "learning_rate": 1.6999805308585893e-05, "loss": 0.4492, "step": 20440 }, { "epoch": 1.984281001358432, "grad_norm": 2.027877392955927, "learning_rate": 1.698358102407684e-05, "loss": 0.4481, "step": 20450 }, { "epoch": 1.9852513099165536, "grad_norm": 2.6424653212507585, "learning_rate": 1.6967356739567786e-05, "loss": 0.4866, "step": 20460 }, { "epoch": 1.986221618474675, "grad_norm": 1.798153398364296, "learning_rate": 1.6951132455058732e-05, "loss": 0.4901, "step": 20470 }, { "epoch": 1.9871919270327965, "grad_norm": 1.9375715199504966, "learning_rate": 1.693490817054968e-05, "loss": 0.5145, "step": 20480 }, { "epoch": 1.9881622355909179, "grad_norm": 2.227384783436997, "learning_rate": 1.6918683886040625e-05, "loss": 0.5234, "step": 20490 }, { "epoch": 1.9891325441490393, "grad_norm": 1.6406328006727198, "learning_rate": 1.690245960153157e-05, "loss": 0.4938, "step": 20500 }, { "epoch": 1.9891325441490393, "eval_loss": 0.6277603507041931, "eval_runtime": 3080.184, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.291, "step": 20500 }, { "epoch": 1.990102852707161, "grad_norm": 2.189175771584087, "learning_rate": 1.688623531702252e-05, "loss": 0.4815, "step": 20510 }, { "epoch": 1.9910731612652823, "grad_norm": 1.9037268932948403, "learning_rate": 1.6870011032513468e-05, "loss": 0.4675, "step": 20520 }, { "epoch": 1.992043469823404, "grad_norm": 2.1370328259985096, "learning_rate": 1.6853786748004414e-05, "loss": 0.4696, "step": 20530 }, { "epoch": 1.9930137783815254, "grad_norm": 2.407631343670518, "learning_rate": 1.683756246349536e-05, "loss": 0.4904, "step": 20540 }, { "epoch": 1.9939840869396468, "grad_norm": 2.113566676023369, "learning_rate": 1.6821338178986307e-05, "loss": 0.4713, "step": 20550 }, { "epoch": 1.9949543954977682, "grad_norm": 1.7998412815701594, "learning_rate": 1.6805113894477253e-05, "loss": 0.4638, "step": 20560 }, { "epoch": 1.9959247040558896, "grad_norm": 2.129222373307167, "learning_rate": 1.67888896099682e-05, "loss": 0.4567, "step": 20570 }, { "epoch": 1.9968950126140113, "grad_norm": 2.0439893320145197, "learning_rate": 1.6772665325459146e-05, "loss": 0.499, "step": 20580 }, { "epoch": 1.997865321172133, "grad_norm": 1.996718548398611, "learning_rate": 1.6756441040950092e-05, "loss": 0.4657, "step": 20590 }, { "epoch": 1.9988356297302543, "grad_norm": 1.8047941252901814, "learning_rate": 1.6740216756441042e-05, "loss": 0.4723, "step": 20600 }, { "epoch": 1.9998059382883757, "grad_norm": 1.8452676210065997, "learning_rate": 1.672399247193199e-05, "loss": 0.4802, "step": 20610 }, { "epoch": 2.000776246846497, "grad_norm": 2.2446601261564254, "learning_rate": 1.6707768187422935e-05, "loss": 0.501, "step": 20620 }, { "epoch": 2.0017465554046185, "grad_norm": 1.6471605415361736, "learning_rate": 1.669154390291388e-05, "loss": 0.4735, "step": 20630 }, { "epoch": 2.00271686396274, "grad_norm": 2.038536155314213, "learning_rate": 1.667531961840483e-05, "loss": 0.4575, "step": 20640 }, { "epoch": 2.003687172520862, "grad_norm": 2.1416733149321834, "learning_rate": 1.6659095333895777e-05, "loss": 0.5152, "step": 20650 }, { "epoch": 2.0046574810789832, "grad_norm": 1.9639254031456526, "learning_rate": 1.6642871049386724e-05, "loss": 0.4424, "step": 20660 }, { "epoch": 2.0056277896371046, "grad_norm": 2.1201945824283777, "learning_rate": 1.662664676487767e-05, "loss": 0.5194, "step": 20670 }, { "epoch": 2.006598098195226, "grad_norm": 2.0259029186156616, "learning_rate": 1.6610422480368617e-05, "loss": 0.5085, "step": 20680 }, { "epoch": 2.0075684067533475, "grad_norm": 1.5035201214590004, "learning_rate": 1.6594198195859563e-05, "loss": 0.445, "step": 20690 }, { "epoch": 2.008538715311469, "grad_norm": 1.655200868630425, "learning_rate": 1.6577973911350513e-05, "loss": 0.4247, "step": 20700 }, { "epoch": 2.0095090238695907, "grad_norm": 2.1707069905375764, "learning_rate": 1.656174962684146e-05, "loss": 0.4783, "step": 20710 }, { "epoch": 2.010479332427712, "grad_norm": 1.8212536925620644, "learning_rate": 1.6545525342332405e-05, "loss": 0.4972, "step": 20720 }, { "epoch": 2.0114496409858336, "grad_norm": 2.126405409204561, "learning_rate": 1.6529301057823352e-05, "loss": 0.4878, "step": 20730 }, { "epoch": 2.012419949543955, "grad_norm": 2.077712603387953, "learning_rate": 1.6513076773314298e-05, "loss": 0.4571, "step": 20740 }, { "epoch": 2.0133902581020764, "grad_norm": 2.1605056660181345, "learning_rate": 1.6496852488805245e-05, "loss": 0.5012, "step": 20750 }, { "epoch": 2.014360566660198, "grad_norm": 1.8021397618450556, "learning_rate": 1.648062820429619e-05, "loss": 0.5049, "step": 20760 }, { "epoch": 2.015330875218319, "grad_norm": 1.8857630619437384, "learning_rate": 1.6464403919787137e-05, "loss": 0.4726, "step": 20770 }, { "epoch": 2.016301183776441, "grad_norm": 1.7074209121094917, "learning_rate": 1.6448179635278084e-05, "loss": 0.4914, "step": 20780 }, { "epoch": 2.0172714923345625, "grad_norm": 2.6019622648688614, "learning_rate": 1.6431955350769033e-05, "loss": 0.4239, "step": 20790 }, { "epoch": 2.018241800892684, "grad_norm": 1.6879446237580993, "learning_rate": 1.641573106625998e-05, "loss": 0.4657, "step": 20800 }, { "epoch": 2.0192121094508053, "grad_norm": 1.4603799114693456, "learning_rate": 1.6399506781750926e-05, "loss": 0.4503, "step": 20810 }, { "epoch": 2.0201824180089267, "grad_norm": 1.948639651180534, "learning_rate": 1.6383282497241873e-05, "loss": 0.4544, "step": 20820 }, { "epoch": 2.021152726567048, "grad_norm": 2.1458744835535617, "learning_rate": 1.636705821273282e-05, "loss": 0.4632, "step": 20830 }, { "epoch": 2.02212303512517, "grad_norm": 2.3393805927580997, "learning_rate": 1.6350833928223765e-05, "loss": 0.4686, "step": 20840 }, { "epoch": 2.0230933436832914, "grad_norm": 1.8274491944101148, "learning_rate": 1.6334609643714712e-05, "loss": 0.4509, "step": 20850 }, { "epoch": 2.024063652241413, "grad_norm": 2.5507345485151767, "learning_rate": 1.6318385359205658e-05, "loss": 0.5145, "step": 20860 }, { "epoch": 2.0250339607995342, "grad_norm": 1.878444628312939, "learning_rate": 1.6302161074696608e-05, "loss": 0.449, "step": 20870 }, { "epoch": 2.0260042693576557, "grad_norm": 2.1096395212213235, "learning_rate": 1.6285936790187554e-05, "loss": 0.409, "step": 20880 }, { "epoch": 2.026974577915777, "grad_norm": 1.9597465473748303, "learning_rate": 1.62697125056785e-05, "loss": 0.4909, "step": 20890 }, { "epoch": 2.0279448864738985, "grad_norm": 1.4262456717146317, "learning_rate": 1.6253488221169447e-05, "loss": 0.4933, "step": 20900 }, { "epoch": 2.0289151950320203, "grad_norm": 1.7254635997166883, "learning_rate": 1.6237263936660393e-05, "loss": 0.4581, "step": 20910 }, { "epoch": 2.0298855035901417, "grad_norm": 1.8255091897283775, "learning_rate": 1.622103965215134e-05, "loss": 0.462, "step": 20920 }, { "epoch": 2.030855812148263, "grad_norm": 1.592461465195459, "learning_rate": 1.6204815367642286e-05, "loss": 0.5343, "step": 20930 }, { "epoch": 2.0318261207063846, "grad_norm": 2.4124278045088445, "learning_rate": 1.6188591083133233e-05, "loss": 0.4738, "step": 20940 }, { "epoch": 2.032796429264506, "grad_norm": 1.4707790381425363, "learning_rate": 1.617236679862418e-05, "loss": 0.4183, "step": 20950 }, { "epoch": 2.0337667378226274, "grad_norm": 1.7383346761795593, "learning_rate": 1.615614251411513e-05, "loss": 0.4351, "step": 20960 }, { "epoch": 2.0347370463807493, "grad_norm": 2.292481354882268, "learning_rate": 1.6139918229606075e-05, "loss": 0.4294, "step": 20970 }, { "epoch": 2.0357073549388707, "grad_norm": 2.158168642341278, "learning_rate": 1.612369394509702e-05, "loss": 0.4703, "step": 20980 }, { "epoch": 2.036677663496992, "grad_norm": 2.283161751864384, "learning_rate": 1.6107469660587968e-05, "loss": 0.5128, "step": 20990 }, { "epoch": 2.0376479720551135, "grad_norm": 2.125582882132147, "learning_rate": 1.6091245376078914e-05, "loss": 0.4652, "step": 21000 }, { "epoch": 2.0376479720551135, "eval_loss": 0.6309866905212402, "eval_runtime": 3136.9547, "eval_samples_per_second": 0.571, "eval_steps_per_second": 0.286, "step": 21000 }, { "epoch": 2.038618280613235, "grad_norm": 1.9414092633891158, "learning_rate": 1.607502109156986e-05, "loss": 0.4451, "step": 21010 }, { "epoch": 2.0395885891713563, "grad_norm": 2.0603594802374863, "learning_rate": 1.6058796807060807e-05, "loss": 0.4569, "step": 21020 }, { "epoch": 2.040558897729478, "grad_norm": 1.6323777472844119, "learning_rate": 1.6042572522551757e-05, "loss": 0.4764, "step": 21030 }, { "epoch": 2.0415292062875996, "grad_norm": 1.8941882942793062, "learning_rate": 1.6026348238042703e-05, "loss": 0.4514, "step": 21040 }, { "epoch": 2.042499514845721, "grad_norm": 1.905361739266791, "learning_rate": 1.601012395353365e-05, "loss": 0.4681, "step": 21050 }, { "epoch": 2.0434698234038424, "grad_norm": 2.357513508114439, "learning_rate": 1.59938996690246e-05, "loss": 0.5029, "step": 21060 }, { "epoch": 2.044440131961964, "grad_norm": 2.045526641253892, "learning_rate": 1.5977675384515546e-05, "loss": 0.5031, "step": 21070 }, { "epoch": 2.0454104405200852, "grad_norm": 1.8018214238085957, "learning_rate": 1.5961451100006492e-05, "loss": 0.475, "step": 21080 }, { "epoch": 2.0463807490782067, "grad_norm": 2.157846458503379, "learning_rate": 1.594522681549744e-05, "loss": 0.4922, "step": 21090 }, { "epoch": 2.0473510576363285, "grad_norm": 2.2821214882561773, "learning_rate": 1.5929002530988385e-05, "loss": 0.4361, "step": 21100 }, { "epoch": 2.04832136619445, "grad_norm": 2.4410635323771395, "learning_rate": 1.591277824647933e-05, "loss": 0.4426, "step": 21110 }, { "epoch": 2.0492916747525713, "grad_norm": 1.847883527949176, "learning_rate": 1.5896553961970278e-05, "loss": 0.4998, "step": 21120 }, { "epoch": 2.0502619833106928, "grad_norm": 2.086929164640455, "learning_rate": 1.5880329677461224e-05, "loss": 0.5218, "step": 21130 }, { "epoch": 2.051232291868814, "grad_norm": 1.8955619202400775, "learning_rate": 1.5864105392952174e-05, "loss": 0.519, "step": 21140 }, { "epoch": 2.0522026004269356, "grad_norm": 2.43252621811095, "learning_rate": 1.584788110844312e-05, "loss": 0.4439, "step": 21150 }, { "epoch": 2.0531729089850574, "grad_norm": 2.0827457082492464, "learning_rate": 1.5831656823934067e-05, "loss": 0.5047, "step": 21160 }, { "epoch": 2.054143217543179, "grad_norm": 1.9212860192837349, "learning_rate": 1.5815432539425013e-05, "loss": 0.4655, "step": 21170 }, { "epoch": 2.0551135261013003, "grad_norm": 2.090169452444841, "learning_rate": 1.579920825491596e-05, "loss": 0.4421, "step": 21180 }, { "epoch": 2.0560838346594217, "grad_norm": 1.9503749019739738, "learning_rate": 1.5782983970406906e-05, "loss": 0.442, "step": 21190 }, { "epoch": 2.057054143217543, "grad_norm": 1.724201210329928, "learning_rate": 1.5766759685897852e-05, "loss": 0.3953, "step": 21200 }, { "epoch": 2.0580244517756645, "grad_norm": 2.081302027118628, "learning_rate": 1.57505354013888e-05, "loss": 0.4846, "step": 21210 }, { "epoch": 2.0589947603337864, "grad_norm": 2.5592097491782546, "learning_rate": 1.5734311116879745e-05, "loss": 0.4428, "step": 21220 }, { "epoch": 2.059965068891908, "grad_norm": 1.9807395474177856, "learning_rate": 1.5718086832370695e-05, "loss": 0.4843, "step": 21230 }, { "epoch": 2.060935377450029, "grad_norm": 1.946678515294379, "learning_rate": 1.570186254786164e-05, "loss": 0.5123, "step": 21240 }, { "epoch": 2.0619056860081506, "grad_norm": 1.704583779463954, "learning_rate": 1.5685638263352587e-05, "loss": 0.5018, "step": 21250 }, { "epoch": 2.062875994566272, "grad_norm": 1.9509206749604142, "learning_rate": 1.5669413978843534e-05, "loss": 0.4156, "step": 21260 }, { "epoch": 2.0638463031243934, "grad_norm": 2.0838442571499365, "learning_rate": 1.565318969433448e-05, "loss": 0.4632, "step": 21270 }, { "epoch": 2.064816611682515, "grad_norm": 1.9349794015053616, "learning_rate": 1.5636965409825427e-05, "loss": 0.4493, "step": 21280 }, { "epoch": 2.0657869202406367, "grad_norm": 2.188378390390559, "learning_rate": 1.5620741125316373e-05, "loss": 0.5279, "step": 21290 }, { "epoch": 2.066757228798758, "grad_norm": 1.8328420572385127, "learning_rate": 1.560451684080732e-05, "loss": 0.4611, "step": 21300 }, { "epoch": 2.0677275373568795, "grad_norm": 1.563376875838364, "learning_rate": 1.5588292556298266e-05, "loss": 0.4449, "step": 21310 }, { "epoch": 2.068697845915001, "grad_norm": 2.231697202375159, "learning_rate": 1.5572068271789215e-05, "loss": 0.4763, "step": 21320 }, { "epoch": 2.0696681544731224, "grad_norm": 2.116198927861082, "learning_rate": 1.5555843987280162e-05, "loss": 0.4313, "step": 21330 }, { "epoch": 2.0706384630312438, "grad_norm": 2.3801041085422763, "learning_rate": 1.5539619702771108e-05, "loss": 0.4643, "step": 21340 }, { "epoch": 2.0716087715893656, "grad_norm": 1.8571501672365118, "learning_rate": 1.5523395418262055e-05, "loss": 0.4423, "step": 21350 }, { "epoch": 2.072579080147487, "grad_norm": 1.6559131513123913, "learning_rate": 1.5507171133753e-05, "loss": 0.4437, "step": 21360 }, { "epoch": 2.0735493887056085, "grad_norm": 2.137893390544345, "learning_rate": 1.5490946849243947e-05, "loss": 0.4732, "step": 21370 }, { "epoch": 2.07451969726373, "grad_norm": 1.4742723623250382, "learning_rate": 1.5474722564734894e-05, "loss": 0.412, "step": 21380 }, { "epoch": 2.0754900058218513, "grad_norm": 2.1318829799319543, "learning_rate": 1.545849828022584e-05, "loss": 0.4916, "step": 21390 }, { "epoch": 2.0764603143799727, "grad_norm": 2.307110089522299, "learning_rate": 1.544227399571679e-05, "loss": 0.4124, "step": 21400 }, { "epoch": 2.077430622938094, "grad_norm": 2.3097275184207637, "learning_rate": 1.5426049711207736e-05, "loss": 0.4799, "step": 21410 }, { "epoch": 2.078400931496216, "grad_norm": 1.5854013780070428, "learning_rate": 1.5409825426698686e-05, "loss": 0.4759, "step": 21420 }, { "epoch": 2.0793712400543374, "grad_norm": 2.008961619138051, "learning_rate": 1.5393601142189632e-05, "loss": 0.4844, "step": 21430 }, { "epoch": 2.080341548612459, "grad_norm": 1.848635541993877, "learning_rate": 1.537737685768058e-05, "loss": 0.4558, "step": 21440 }, { "epoch": 2.08131185717058, "grad_norm": 2.3862164923195217, "learning_rate": 1.5361152573171525e-05, "loss": 0.4514, "step": 21450 }, { "epoch": 2.0822821657287016, "grad_norm": 2.322912088388782, "learning_rate": 1.534492828866247e-05, "loss": 0.4507, "step": 21460 }, { "epoch": 2.083252474286823, "grad_norm": 2.1765100761424074, "learning_rate": 1.5328704004153418e-05, "loss": 0.4114, "step": 21470 }, { "epoch": 2.084222782844945, "grad_norm": 1.915448768034906, "learning_rate": 1.5312479719644364e-05, "loss": 0.4794, "step": 21480 }, { "epoch": 2.0851930914030663, "grad_norm": 1.955422403883371, "learning_rate": 1.529625543513531e-05, "loss": 0.485, "step": 21490 }, { "epoch": 2.0861633999611877, "grad_norm": 1.5866416015413762, "learning_rate": 1.528003115062626e-05, "loss": 0.4961, "step": 21500 }, { "epoch": 2.0861633999611877, "eval_loss": 0.6304420232772827, "eval_runtime": 3075.3205, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.291, "step": 21500 }, { "epoch": 2.087133708519309, "grad_norm": 1.8961317659006691, "learning_rate": 1.5263806866117207e-05, "loss": 0.4803, "step": 21510 }, { "epoch": 2.0881040170774305, "grad_norm": 2.1574061097476633, "learning_rate": 1.5247582581608153e-05, "loss": 0.4897, "step": 21520 }, { "epoch": 2.089074325635552, "grad_norm": 2.080433640436673, "learning_rate": 1.52313582970991e-05, "loss": 0.4623, "step": 21530 }, { "epoch": 2.0900446341936734, "grad_norm": 2.274765795002327, "learning_rate": 1.5215134012590046e-05, "loss": 0.5149, "step": 21540 }, { "epoch": 2.0910149427517952, "grad_norm": 2.514598169769852, "learning_rate": 1.5198909728080992e-05, "loss": 0.4028, "step": 21550 }, { "epoch": 2.0919852513099166, "grad_norm": 2.1324330512224057, "learning_rate": 1.5182685443571939e-05, "loss": 0.5213, "step": 21560 }, { "epoch": 2.092955559868038, "grad_norm": 2.0829108139421106, "learning_rate": 1.5166461159062887e-05, "loss": 0.4338, "step": 21570 }, { "epoch": 2.0939258684261595, "grad_norm": 1.9606921161193192, "learning_rate": 1.5150236874553833e-05, "loss": 0.5347, "step": 21580 }, { "epoch": 2.094896176984281, "grad_norm": 1.9007868794459526, "learning_rate": 1.513401259004478e-05, "loss": 0.4698, "step": 21590 }, { "epoch": 2.0958664855424023, "grad_norm": 2.2070560808992385, "learning_rate": 1.5117788305535726e-05, "loss": 0.4992, "step": 21600 }, { "epoch": 2.096836794100524, "grad_norm": 1.7052502823407865, "learning_rate": 1.5101564021026674e-05, "loss": 0.4448, "step": 21610 }, { "epoch": 2.0978071026586456, "grad_norm": 1.9838818965293705, "learning_rate": 1.508533973651762e-05, "loss": 0.452, "step": 21620 }, { "epoch": 2.098777411216767, "grad_norm": 2.361870038528201, "learning_rate": 1.5069115452008567e-05, "loss": 0.4772, "step": 21630 }, { "epoch": 2.0997477197748884, "grad_norm": 2.3190708510226767, "learning_rate": 1.5052891167499513e-05, "loss": 0.4732, "step": 21640 }, { "epoch": 2.10071802833301, "grad_norm": 2.0341040427547967, "learning_rate": 1.5036666882990461e-05, "loss": 0.5013, "step": 21650 }, { "epoch": 2.101688336891131, "grad_norm": 2.2854031285851337, "learning_rate": 1.5020442598481408e-05, "loss": 0.4587, "step": 21660 }, { "epoch": 2.102658645449253, "grad_norm": 2.171534197336386, "learning_rate": 1.5004218313972354e-05, "loss": 0.4685, "step": 21670 }, { "epoch": 2.1036289540073745, "grad_norm": 2.237731272885117, "learning_rate": 1.49879940294633e-05, "loss": 0.4714, "step": 21680 }, { "epoch": 2.104599262565496, "grad_norm": 1.83625835750722, "learning_rate": 1.4971769744954247e-05, "loss": 0.4858, "step": 21690 }, { "epoch": 2.1055695711236173, "grad_norm": 2.1226910702010935, "learning_rate": 1.4955545460445195e-05, "loss": 0.4735, "step": 21700 }, { "epoch": 2.1065398796817387, "grad_norm": 2.4412848671448764, "learning_rate": 1.4939321175936141e-05, "loss": 0.4955, "step": 21710 }, { "epoch": 2.10751018823986, "grad_norm": 2.171686191103959, "learning_rate": 1.4923096891427088e-05, "loss": 0.4882, "step": 21720 }, { "epoch": 2.1084804967979816, "grad_norm": 1.9028000906788562, "learning_rate": 1.4906872606918034e-05, "loss": 0.5008, "step": 21730 }, { "epoch": 2.1094508053561034, "grad_norm": 2.032206148469331, "learning_rate": 1.4890648322408982e-05, "loss": 0.4688, "step": 21740 }, { "epoch": 2.110421113914225, "grad_norm": 2.2254255720592457, "learning_rate": 1.4874424037899929e-05, "loss": 0.434, "step": 21750 }, { "epoch": 2.1113914224723462, "grad_norm": 2.013615294714695, "learning_rate": 1.4858199753390875e-05, "loss": 0.4429, "step": 21760 }, { "epoch": 2.1123617310304676, "grad_norm": 1.625353905024779, "learning_rate": 1.4841975468881821e-05, "loss": 0.4197, "step": 21770 }, { "epoch": 2.113332039588589, "grad_norm": 2.6956426206670074, "learning_rate": 1.482575118437277e-05, "loss": 0.4412, "step": 21780 }, { "epoch": 2.1143023481467105, "grad_norm": 1.642670039289611, "learning_rate": 1.4809526899863716e-05, "loss": 0.4365, "step": 21790 }, { "epoch": 2.1152726567048323, "grad_norm": 2.047556877474702, "learning_rate": 1.4793302615354662e-05, "loss": 0.4882, "step": 21800 }, { "epoch": 2.1162429652629537, "grad_norm": 2.666095406948268, "learning_rate": 1.4777078330845612e-05, "loss": 0.4885, "step": 21810 }, { "epoch": 2.117213273821075, "grad_norm": 2.0148741202390736, "learning_rate": 1.4760854046336558e-05, "loss": 0.5092, "step": 21820 }, { "epoch": 2.1181835823791966, "grad_norm": 1.5220212421711388, "learning_rate": 1.4744629761827505e-05, "loss": 0.5007, "step": 21830 }, { "epoch": 2.119153890937318, "grad_norm": 2.2566622563684446, "learning_rate": 1.4728405477318453e-05, "loss": 0.4386, "step": 21840 }, { "epoch": 2.1201241994954394, "grad_norm": 2.13286935915719, "learning_rate": 1.47121811928094e-05, "loss": 0.4559, "step": 21850 }, { "epoch": 2.1210945080535613, "grad_norm": 1.8220197546440986, "learning_rate": 1.4695956908300346e-05, "loss": 0.451, "step": 21860 }, { "epoch": 2.1220648166116827, "grad_norm": 2.2468599066846653, "learning_rate": 1.4679732623791292e-05, "loss": 0.446, "step": 21870 }, { "epoch": 2.123035125169804, "grad_norm": 1.8918874944552218, "learning_rate": 1.466350833928224e-05, "loss": 0.459, "step": 21880 }, { "epoch": 2.1240054337279255, "grad_norm": 2.247255412788798, "learning_rate": 1.4647284054773186e-05, "loss": 0.5151, "step": 21890 }, { "epoch": 2.124975742286047, "grad_norm": 1.8291853119382993, "learning_rate": 1.4631059770264133e-05, "loss": 0.4557, "step": 21900 }, { "epoch": 2.1259460508441683, "grad_norm": 2.2000080164154654, "learning_rate": 1.461483548575508e-05, "loss": 0.5106, "step": 21910 }, { "epoch": 2.1269163594022897, "grad_norm": 1.608928034740487, "learning_rate": 1.4598611201246026e-05, "loss": 0.4696, "step": 21920 }, { "epoch": 2.1278866679604116, "grad_norm": 1.7796242187219558, "learning_rate": 1.4582386916736974e-05, "loss": 0.4372, "step": 21930 }, { "epoch": 2.128856976518533, "grad_norm": 1.9355986409124974, "learning_rate": 1.456616263222792e-05, "loss": 0.4601, "step": 21940 }, { "epoch": 2.1298272850766544, "grad_norm": 2.5044478412060776, "learning_rate": 1.4549938347718866e-05, "loss": 0.5153, "step": 21950 }, { "epoch": 2.130797593634776, "grad_norm": 2.1644310902185264, "learning_rate": 1.4533714063209813e-05, "loss": 0.4755, "step": 21960 }, { "epoch": 2.1317679021928972, "grad_norm": 1.9759401921373174, "learning_rate": 1.451748977870076e-05, "loss": 0.5306, "step": 21970 }, { "epoch": 2.1327382107510187, "grad_norm": 2.5662041065947827, "learning_rate": 1.4501265494191707e-05, "loss": 0.4936, "step": 21980 }, { "epoch": 2.13370851930914, "grad_norm": 1.834252286116916, "learning_rate": 1.4485041209682654e-05, "loss": 0.4248, "step": 21990 }, { "epoch": 2.134678827867262, "grad_norm": 2.048099727707521, "learning_rate": 1.44688169251736e-05, "loss": 0.4573, "step": 22000 }, { "epoch": 2.134678827867262, "eval_loss": 0.6303107142448425, "eval_runtime": 3417.8488, "eval_samples_per_second": 0.524, "eval_steps_per_second": 0.262, "step": 22000 }, { "epoch": 2.1356491364253833, "grad_norm": 2.032462630747061, "learning_rate": 1.4452592640664548e-05, "loss": 0.4496, "step": 22010 }, { "epoch": 2.1366194449835048, "grad_norm": 2.0538977423462548, "learning_rate": 1.4436368356155494e-05, "loss": 0.4512, "step": 22020 }, { "epoch": 2.137589753541626, "grad_norm": 2.0559328662146403, "learning_rate": 1.442014407164644e-05, "loss": 0.4469, "step": 22030 }, { "epoch": 2.1385600620997476, "grad_norm": 1.8412343311132753, "learning_rate": 1.4403919787137387e-05, "loss": 0.449, "step": 22040 }, { "epoch": 2.139530370657869, "grad_norm": 2.0513694288656765, "learning_rate": 1.4387695502628334e-05, "loss": 0.4709, "step": 22050 }, { "epoch": 2.140500679215991, "grad_norm": 1.7890079195029862, "learning_rate": 1.4371471218119282e-05, "loss": 0.4419, "step": 22060 }, { "epoch": 2.1414709877741123, "grad_norm": 2.535357630446327, "learning_rate": 1.4355246933610228e-05, "loss": 0.4305, "step": 22070 }, { "epoch": 2.1424412963322337, "grad_norm": 2.2977476166849957, "learning_rate": 1.4339022649101174e-05, "loss": 0.4664, "step": 22080 }, { "epoch": 2.143411604890355, "grad_norm": 1.8778176122292005, "learning_rate": 1.432279836459212e-05, "loss": 0.4321, "step": 22090 }, { "epoch": 2.1443819134484765, "grad_norm": 2.2740974899219526, "learning_rate": 1.4306574080083069e-05, "loss": 0.4582, "step": 22100 }, { "epoch": 2.145352222006598, "grad_norm": 1.7411166330088808, "learning_rate": 1.4290349795574015e-05, "loss": 0.4703, "step": 22110 }, { "epoch": 2.1463225305647198, "grad_norm": 2.270025807621109, "learning_rate": 1.4274125511064962e-05, "loss": 0.462, "step": 22120 }, { "epoch": 2.147292839122841, "grad_norm": 2.047404098376008, "learning_rate": 1.4257901226555908e-05, "loss": 0.4572, "step": 22130 }, { "epoch": 2.1482631476809626, "grad_norm": 1.9025461949345281, "learning_rate": 1.4241676942046856e-05, "loss": 0.4523, "step": 22140 }, { "epoch": 2.149233456239084, "grad_norm": 2.41657182633304, "learning_rate": 1.4225452657537803e-05, "loss": 0.4485, "step": 22150 }, { "epoch": 2.1502037647972054, "grad_norm": 2.1984178815028246, "learning_rate": 1.4209228373028749e-05, "loss": 0.4744, "step": 22160 }, { "epoch": 2.151174073355327, "grad_norm": 2.1671082483900044, "learning_rate": 1.4193004088519695e-05, "loss": 0.4277, "step": 22170 }, { "epoch": 2.1521443819134483, "grad_norm": 2.2526638629347118, "learning_rate": 1.4176779804010643e-05, "loss": 0.4078, "step": 22180 }, { "epoch": 2.15311469047157, "grad_norm": 2.4484821608494665, "learning_rate": 1.416055551950159e-05, "loss": 0.4726, "step": 22190 }, { "epoch": 2.1540849990296915, "grad_norm": 2.145545596752741, "learning_rate": 1.414433123499254e-05, "loss": 0.5181, "step": 22200 }, { "epoch": 2.155055307587813, "grad_norm": 2.2593836123374, "learning_rate": 1.4128106950483486e-05, "loss": 0.4986, "step": 22210 }, { "epoch": 2.1560256161459344, "grad_norm": 1.8007881267775863, "learning_rate": 1.4111882665974432e-05, "loss": 0.4483, "step": 22220 }, { "epoch": 2.1569959247040558, "grad_norm": 1.348362904445229, "learning_rate": 1.4095658381465379e-05, "loss": 0.4637, "step": 22230 }, { "epoch": 2.157966233262177, "grad_norm": 2.1469770203677445, "learning_rate": 1.4079434096956327e-05, "loss": 0.494, "step": 22240 }, { "epoch": 2.158936541820299, "grad_norm": 2.337402150331418, "learning_rate": 1.4063209812447273e-05, "loss": 0.4761, "step": 22250 }, { "epoch": 2.1599068503784205, "grad_norm": 1.7961706912599362, "learning_rate": 1.404698552793822e-05, "loss": 0.4683, "step": 22260 }, { "epoch": 2.160877158936542, "grad_norm": 1.6086385155901042, "learning_rate": 1.4030761243429166e-05, "loss": 0.4305, "step": 22270 }, { "epoch": 2.1618474674946633, "grad_norm": 1.853121432266295, "learning_rate": 1.4014536958920112e-05, "loss": 0.4808, "step": 22280 }, { "epoch": 2.1628177760527847, "grad_norm": 2.1594671768519094, "learning_rate": 1.399831267441106e-05, "loss": 0.4279, "step": 22290 }, { "epoch": 2.163788084610906, "grad_norm": 1.8445913886566945, "learning_rate": 1.3982088389902007e-05, "loss": 0.4905, "step": 22300 }, { "epoch": 2.164758393169028, "grad_norm": 1.8975990263034965, "learning_rate": 1.3965864105392953e-05, "loss": 0.4888, "step": 22310 }, { "epoch": 2.1657287017271494, "grad_norm": 2.015470924856379, "learning_rate": 1.39496398208839e-05, "loss": 0.5232, "step": 22320 }, { "epoch": 2.166699010285271, "grad_norm": 2.4943071641031698, "learning_rate": 1.3933415536374848e-05, "loss": 0.4827, "step": 22330 }, { "epoch": 2.167669318843392, "grad_norm": 1.5900299707664105, "learning_rate": 1.3917191251865794e-05, "loss": 0.4963, "step": 22340 }, { "epoch": 2.1686396274015136, "grad_norm": 1.8705765102756569, "learning_rate": 1.390096696735674e-05, "loss": 0.4389, "step": 22350 }, { "epoch": 2.169609935959635, "grad_norm": 1.9703821664953534, "learning_rate": 1.3884742682847687e-05, "loss": 0.3945, "step": 22360 }, { "epoch": 2.1705802445177564, "grad_norm": 1.6647445519639283, "learning_rate": 1.3868518398338635e-05, "loss": 0.4163, "step": 22370 }, { "epoch": 2.1715505530758783, "grad_norm": 1.617333797152806, "learning_rate": 1.3852294113829581e-05, "loss": 0.4943, "step": 22380 }, { "epoch": 2.1725208616339997, "grad_norm": 2.1150619610025796, "learning_rate": 1.3836069829320528e-05, "loss": 0.4941, "step": 22390 }, { "epoch": 2.173491170192121, "grad_norm": 2.70908196165092, "learning_rate": 1.3819845544811474e-05, "loss": 0.448, "step": 22400 }, { "epoch": 2.1744614787502425, "grad_norm": 2.195631441394506, "learning_rate": 1.3803621260302422e-05, "loss": 0.4611, "step": 22410 }, { "epoch": 2.175431787308364, "grad_norm": 1.602301760768748, "learning_rate": 1.3787396975793368e-05, "loss": 0.4837, "step": 22420 }, { "epoch": 2.1764020958664854, "grad_norm": 1.7676765995955546, "learning_rate": 1.3771172691284315e-05, "loss": 0.4798, "step": 22430 }, { "epoch": 2.177372404424607, "grad_norm": 2.211793623579749, "learning_rate": 1.3754948406775261e-05, "loss": 0.4607, "step": 22440 }, { "epoch": 2.1783427129827286, "grad_norm": 1.9526273269630037, "learning_rate": 1.3738724122266208e-05, "loss": 0.4638, "step": 22450 }, { "epoch": 2.17931302154085, "grad_norm": 2.063947829078263, "learning_rate": 1.3722499837757156e-05, "loss": 0.4548, "step": 22460 }, { "epoch": 2.1802833300989715, "grad_norm": 2.127208435257803, "learning_rate": 1.3706275553248102e-05, "loss": 0.4332, "step": 22470 }, { "epoch": 2.181253638657093, "grad_norm": 2.098766619473584, "learning_rate": 1.3690051268739048e-05, "loss": 0.4822, "step": 22480 }, { "epoch": 2.1822239472152143, "grad_norm": 1.5960382905615234, "learning_rate": 1.3673826984229995e-05, "loss": 0.4394, "step": 22490 }, { "epoch": 2.183194255773336, "grad_norm": 2.5521137154858087, "learning_rate": 1.3657602699720943e-05, "loss": 0.4568, "step": 22500 }, { "epoch": 2.183194255773336, "eval_loss": 0.6301902532577515, "eval_runtime": 3074.8498, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.291, "step": 22500 }, { "epoch": 2.1841645643314576, "grad_norm": 1.8964546731319356, "learning_rate": 1.364137841521189e-05, "loss": 0.4569, "step": 22510 }, { "epoch": 2.185134872889579, "grad_norm": 2.1836501057996918, "learning_rate": 1.3625154130702836e-05, "loss": 0.4497, "step": 22520 }, { "epoch": 2.1861051814477004, "grad_norm": 1.6786624803632986, "learning_rate": 1.3608929846193782e-05, "loss": 0.4229, "step": 22530 }, { "epoch": 2.187075490005822, "grad_norm": 2.1475126662304675, "learning_rate": 1.359270556168473e-05, "loss": 0.4412, "step": 22540 }, { "epoch": 2.188045798563943, "grad_norm": 2.322023881303393, "learning_rate": 1.3576481277175676e-05, "loss": 0.4235, "step": 22550 }, { "epoch": 2.1890161071220646, "grad_norm": 2.208490633488266, "learning_rate": 1.3560256992666623e-05, "loss": 0.4997, "step": 22560 }, { "epoch": 2.1899864156801865, "grad_norm": 1.869675214539207, "learning_rate": 1.354403270815757e-05, "loss": 0.4635, "step": 22570 }, { "epoch": 2.190956724238308, "grad_norm": 1.7398189038479703, "learning_rate": 1.3527808423648516e-05, "loss": 0.4128, "step": 22580 }, { "epoch": 2.1919270327964293, "grad_norm": 2.266171424803214, "learning_rate": 1.3511584139139465e-05, "loss": 0.4651, "step": 22590 }, { "epoch": 2.1928973413545507, "grad_norm": 2.3614429446369063, "learning_rate": 1.3495359854630413e-05, "loss": 0.4123, "step": 22600 }, { "epoch": 2.193867649912672, "grad_norm": 1.724551648002317, "learning_rate": 1.347913557012136e-05, "loss": 0.4481, "step": 22610 }, { "epoch": 2.1948379584707935, "grad_norm": 2.0558570962807945, "learning_rate": 1.3462911285612306e-05, "loss": 0.4431, "step": 22620 }, { "epoch": 2.195808267028915, "grad_norm": 1.943776934136987, "learning_rate": 1.3446687001103253e-05, "loss": 0.5128, "step": 22630 }, { "epoch": 2.196778575587037, "grad_norm": 1.958173689925549, "learning_rate": 1.34304627165942e-05, "loss": 0.4628, "step": 22640 }, { "epoch": 2.1977488841451582, "grad_norm": 1.990366090746634, "learning_rate": 1.3414238432085147e-05, "loss": 0.459, "step": 22650 }, { "epoch": 2.1987191927032796, "grad_norm": 1.8373680505548435, "learning_rate": 1.3398014147576093e-05, "loss": 0.4397, "step": 22660 }, { "epoch": 2.199689501261401, "grad_norm": 1.8078209399386682, "learning_rate": 1.338178986306704e-05, "loss": 0.5215, "step": 22670 }, { "epoch": 2.2006598098195225, "grad_norm": 2.400832940310904, "learning_rate": 1.3365565578557986e-05, "loss": 0.4926, "step": 22680 }, { "epoch": 2.201630118377644, "grad_norm": 2.055045167286068, "learning_rate": 1.3349341294048934e-05, "loss": 0.4461, "step": 22690 }, { "epoch": 2.2026004269357657, "grad_norm": 1.8144420639570567, "learning_rate": 1.333311700953988e-05, "loss": 0.4112, "step": 22700 }, { "epoch": 2.203570735493887, "grad_norm": 2.391519552042341, "learning_rate": 1.3316892725030827e-05, "loss": 0.439, "step": 22710 }, { "epoch": 2.2045410440520086, "grad_norm": 1.563384599944884, "learning_rate": 1.3300668440521773e-05, "loss": 0.4694, "step": 22720 }, { "epoch": 2.20551135261013, "grad_norm": 2.3546727224779715, "learning_rate": 1.3284444156012721e-05, "loss": 0.4355, "step": 22730 }, { "epoch": 2.2064816611682514, "grad_norm": 1.832975462434467, "learning_rate": 1.3268219871503668e-05, "loss": 0.4984, "step": 22740 }, { "epoch": 2.207451969726373, "grad_norm": 1.7975073035604554, "learning_rate": 1.3251995586994614e-05, "loss": 0.4584, "step": 22750 }, { "epoch": 2.2084222782844947, "grad_norm": 2.243306278714619, "learning_rate": 1.323577130248556e-05, "loss": 0.4365, "step": 22760 }, { "epoch": 2.209392586842616, "grad_norm": 2.0297904158946043, "learning_rate": 1.3219547017976509e-05, "loss": 0.5148, "step": 22770 }, { "epoch": 2.2103628954007375, "grad_norm": 1.6322562865594754, "learning_rate": 1.3203322733467455e-05, "loss": 0.4782, "step": 22780 }, { "epoch": 2.211333203958859, "grad_norm": 2.277603717413092, "learning_rate": 1.3187098448958401e-05, "loss": 0.4899, "step": 22790 }, { "epoch": 2.2123035125169803, "grad_norm": 1.2452738117030233, "learning_rate": 1.3170874164449348e-05, "loss": 0.4459, "step": 22800 }, { "epoch": 2.2132738210751017, "grad_norm": 2.3624119652920177, "learning_rate": 1.3154649879940294e-05, "loss": 0.5033, "step": 22810 }, { "epoch": 2.214244129633223, "grad_norm": 1.6861367814447958, "learning_rate": 1.3138425595431242e-05, "loss": 0.4434, "step": 22820 }, { "epoch": 2.215214438191345, "grad_norm": 2.5489904753238166, "learning_rate": 1.3122201310922189e-05, "loss": 0.4465, "step": 22830 }, { "epoch": 2.2161847467494664, "grad_norm": 1.5652883952842678, "learning_rate": 1.3105977026413135e-05, "loss": 0.4648, "step": 22840 }, { "epoch": 2.217155055307588, "grad_norm": 2.106609892171019, "learning_rate": 1.3089752741904081e-05, "loss": 0.4809, "step": 22850 }, { "epoch": 2.2181253638657092, "grad_norm": 2.7417498454164653, "learning_rate": 1.307352845739503e-05, "loss": 0.4652, "step": 22860 }, { "epoch": 2.2190956724238307, "grad_norm": 2.0375355388485645, "learning_rate": 1.3057304172885976e-05, "loss": 0.499, "step": 22870 }, { "epoch": 2.220065980981952, "grad_norm": 1.6341471214683112, "learning_rate": 1.3041079888376922e-05, "loss": 0.4935, "step": 22880 }, { "epoch": 2.221036289540074, "grad_norm": 2.3570884682492346, "learning_rate": 1.3024855603867869e-05, "loss": 0.4884, "step": 22890 }, { "epoch": 2.2220065980981953, "grad_norm": 1.9493074304368556, "learning_rate": 1.3008631319358817e-05, "loss": 0.4838, "step": 22900 }, { "epoch": 2.2229769066563168, "grad_norm": 2.1315651431097224, "learning_rate": 1.2992407034849763e-05, "loss": 0.4978, "step": 22910 }, { "epoch": 2.223947215214438, "grad_norm": 1.8843986392844572, "learning_rate": 1.297618275034071e-05, "loss": 0.4922, "step": 22920 }, { "epoch": 2.2249175237725596, "grad_norm": 2.0055582988037313, "learning_rate": 1.2959958465831656e-05, "loss": 0.4702, "step": 22930 }, { "epoch": 2.225887832330681, "grad_norm": 1.8513569200366902, "learning_rate": 1.2943734181322604e-05, "loss": 0.467, "step": 22940 }, { "epoch": 2.226858140888803, "grad_norm": 2.3018937212184682, "learning_rate": 1.292750989681355e-05, "loss": 0.4632, "step": 22950 }, { "epoch": 2.2278284494469243, "grad_norm": 2.1028486436310905, "learning_rate": 1.2911285612304497e-05, "loss": 0.4329, "step": 22960 }, { "epoch": 2.2287987580050457, "grad_norm": 2.3891744137342994, "learning_rate": 1.2895061327795443e-05, "loss": 0.4549, "step": 22970 }, { "epoch": 2.229769066563167, "grad_norm": 1.8338746725123478, "learning_rate": 1.2878837043286393e-05, "loss": 0.4518, "step": 22980 }, { "epoch": 2.2307393751212885, "grad_norm": 2.5499388804291483, "learning_rate": 1.286261275877734e-05, "loss": 0.4852, "step": 22990 }, { "epoch": 2.23170968367941, "grad_norm": 1.6648341489250056, "learning_rate": 1.2846388474268287e-05, "loss": 0.4266, "step": 23000 }, { "epoch": 2.23170968367941, "eval_loss": 0.6283465623855591, "eval_runtime": 3074.8923, "eval_samples_per_second": 0.583, "eval_steps_per_second": 0.291, "step": 23000 } ], "logging_steps": 10, "max_steps": 30918, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9248237711523840.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }