{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995577178239717, "eval_steps": 142, "global_step": 565, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.705155372619629, "learning_rate": 0.0001, "loss": 3.3182, "step": 1 }, { "epoch": 0.0, "eval_loss": 3.3362529277801514, "eval_runtime": 14.4366, "eval_samples_per_second": 33.041, "eval_steps_per_second": 8.312, "step": 1 }, { "epoch": 0.0, "grad_norm": 4.644563674926758, "learning_rate": 0.0002, "loss": 3.2788, "step": 2 }, { "epoch": 0.01, "grad_norm": 4.3825764656066895, "learning_rate": 0.0003, "loss": 2.9231, "step": 3 }, { "epoch": 0.01, "grad_norm": 2.904296636581421, "learning_rate": 0.0004, "loss": 1.1824, "step": 4 }, { "epoch": 0.01, "grad_norm": 1.6988284587860107, "learning_rate": 0.0005, "loss": 0.3472, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.5742101073265076, "learning_rate": 0.0006, "loss": 0.1478, "step": 6 }, { "epoch": 0.01, "grad_norm": 0.6511944532394409, "learning_rate": 0.0007, "loss": 0.1532, "step": 7 }, { "epoch": 0.01, "grad_norm": 2.305083751678467, "learning_rate": 0.0008, "loss": 0.2397, "step": 8 }, { "epoch": 0.02, "grad_norm": 4.7435078620910645, "learning_rate": 0.0009000000000000001, "loss": 0.434, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.6199779510498047, "learning_rate": 0.001, "loss": 0.1743, "step": 10 }, { "epoch": 0.02, "grad_norm": 0.14406554400920868, "learning_rate": 0.0009999991309598973, "loss": 0.1404, "step": 11 }, { "epoch": 0.02, "grad_norm": 0.1965201050043106, "learning_rate": 0.0009999965238426103, "loss": 0.1418, "step": 12 }, { "epoch": 0.02, "grad_norm": 26.520109176635742, "learning_rate": 0.0009999921786572016, "loss": 0.2689, "step": 13 }, { "epoch": 0.02, "grad_norm": 0.0870603695511818, "learning_rate": 0.0009999860954187755, "loss": 0.1338, "step": 14 }, { "epoch": 0.03, "grad_norm": 0.05200817808508873, "learning_rate": 0.0009999782741484788, "loss": 0.1308, "step": 15 }, { "epoch": 0.03, "grad_norm": 0.2145700752735138, "learning_rate": 0.0009999687148734995, "loss": 0.1375, "step": 16 }, { "epoch": 0.03, "grad_norm": 0.19921083748340607, "learning_rate": 0.0009999574176270667, "loss": 0.1388, "step": 17 }, { "epoch": 0.03, "grad_norm": 1.193419337272644, "learning_rate": 0.0009999443824484518, "loss": 0.1978, "step": 18 }, { "epoch": 0.03, "grad_norm": 0.4399484395980835, "learning_rate": 0.0009999296093829671, "loss": 0.1518, "step": 19 }, { "epoch": 0.04, "grad_norm": 44.88853073120117, "learning_rate": 0.0009999130984819661, "loss": 0.9033, "step": 20 }, { "epoch": 0.04, "grad_norm": 0.3220385015010834, "learning_rate": 0.0009998948498028434, "loss": 0.1234, "step": 21 }, { "epoch": 0.04, "grad_norm": 0.5420748591423035, "learning_rate": 0.0009998748634090344, "loss": 0.1602, "step": 22 }, { "epoch": 0.04, "grad_norm": 0.5249865651130676, "learning_rate": 0.0009998531393700149, "loss": 0.1538, "step": 23 }, { "epoch": 0.04, "grad_norm": 0.056158341467380524, "learning_rate": 0.000999829677761301, "loss": 0.1374, "step": 24 }, { "epoch": 0.04, "grad_norm": 0.19818872213363647, "learning_rate": 0.0009998044786644492, "loss": 0.1413, "step": 25 }, { "epoch": 0.05, "grad_norm": 0.27901849150657654, "learning_rate": 0.0009997775421670557, "loss": 0.1395, "step": 26 }, { "epoch": 0.05, "grad_norm": 0.22768354415893555, "learning_rate": 0.0009997488683627558, "loss": 0.1241, "step": 27 }, { "epoch": 0.05, "grad_norm": 0.14878959953784943, "learning_rate": 0.0009997184573512245, "loss": 0.1243, "step": 28 }, { "epoch": 0.05, "grad_norm": 1.0589066743850708, "learning_rate": 0.000999686309238175, "loss": 0.2499, "step": 29 }, { "epoch": 0.05, "grad_norm": 0.11455405503511429, "learning_rate": 0.00099965242413536, "loss": 0.1254, "step": 30 }, { "epoch": 0.05, "grad_norm": 0.16566088795661926, "learning_rate": 0.000999616802160569, "loss": 0.1416, "step": 31 }, { "epoch": 0.06, "grad_norm": 1.3691716194152832, "learning_rate": 0.0009995794434376297, "loss": 0.1465, "step": 32 }, { "epoch": 0.06, "grad_norm": 0.09674070030450821, "learning_rate": 0.000999540348096407, "loss": 0.1373, "step": 33 }, { "epoch": 0.06, "grad_norm": 0.5034632086753845, "learning_rate": 0.000999499516272803, "loss": 0.1471, "step": 34 }, { "epoch": 0.06, "grad_norm": 0.26572930812835693, "learning_rate": 0.0009994569481087553, "loss": 0.1424, "step": 35 }, { "epoch": 0.06, "grad_norm": 0.20631802082061768, "learning_rate": 0.0009994126437522376, "loss": 0.1449, "step": 36 }, { "epoch": 0.07, "grad_norm": 0.11268749833106995, "learning_rate": 0.0009993666033572591, "loss": 0.1403, "step": 37 }, { "epoch": 0.07, "grad_norm": 0.6610996723175049, "learning_rate": 0.0009993188270838635, "loss": 0.1424, "step": 38 }, { "epoch": 0.07, "grad_norm": 98.93838500976562, "learning_rate": 0.0009992693150981291, "loss": 2.775, "step": 39 }, { "epoch": 0.07, "grad_norm": 32.53168869018555, "learning_rate": 0.0009992180675721671, "loss": 0.6932, "step": 40 }, { "epoch": 0.07, "grad_norm": 54.8778076171875, "learning_rate": 0.0009991650846841226, "loss": 5.7008, "step": 41 }, { "epoch": 0.07, "grad_norm": 2.0524775981903076, "learning_rate": 0.000999110366618172, "loss": 0.1623, "step": 42 }, { "epoch": 0.08, "grad_norm": 0.404278427362442, "learning_rate": 0.0009990539135645246, "loss": 0.1427, "step": 43 }, { "epoch": 0.08, "grad_norm": 1.7963409423828125, "learning_rate": 0.0009989957257194198, "loss": 0.174, "step": 44 }, { "epoch": 0.08, "grad_norm": 0.11620022356510162, "learning_rate": 0.0009989358032851284, "loss": 0.1339, "step": 45 }, { "epoch": 0.08, "grad_norm": 0.5025681853294373, "learning_rate": 0.00099887414646995, "loss": 0.1558, "step": 46 }, { "epoch": 0.08, "grad_norm": 78.1165771484375, "learning_rate": 0.0009988107554882138, "loss": 2.2938, "step": 47 }, { "epoch": 0.08, "grad_norm": 0.08389786630868912, "learning_rate": 0.0009987456305602768, "loss": 0.1409, "step": 48 }, { "epoch": 0.09, "grad_norm": 7.123101711273193, "learning_rate": 0.0009986787719125242, "loss": 0.1524, "step": 49 }, { "epoch": 0.09, "grad_norm": 0.5341290235519409, "learning_rate": 0.0009986101797773666, "loss": 0.1598, "step": 50 }, { "epoch": 0.09, "grad_norm": 0.05239284038543701, "learning_rate": 0.000998539854393242, "loss": 0.1386, "step": 51 }, { "epoch": 0.09, "grad_norm": 0.0722254291176796, "learning_rate": 0.0009984677960046123, "loss": 0.1385, "step": 52 }, { "epoch": 0.09, "grad_norm": 0.11535236239433289, "learning_rate": 0.000998394004861964, "loss": 0.1369, "step": 53 }, { "epoch": 0.1, "grad_norm": 0.7584894299507141, "learning_rate": 0.0009983184812218072, "loss": 0.108, "step": 54 }, { "epoch": 0.1, "grad_norm": 0.8361538052558899, "learning_rate": 0.000998241225346674, "loss": 0.1703, "step": 55 }, { "epoch": 0.1, "grad_norm": 0.37683162093162537, "learning_rate": 0.0009981622375051184, "loss": 0.1368, "step": 56 }, { "epoch": 0.1, "grad_norm": 0.5335961580276489, "learning_rate": 0.0009980815179717144, "loss": 0.1559, "step": 57 }, { "epoch": 0.1, "grad_norm": 0.2806299328804016, "learning_rate": 0.0009979990670270565, "loss": 0.1397, "step": 58 }, { "epoch": 0.1, "grad_norm": 0.4967437982559204, "learning_rate": 0.0009979148849577574, "loss": 0.1543, "step": 59 }, { "epoch": 0.11, "grad_norm": 0.10350017994642258, "learning_rate": 0.0009978289720564471, "loss": 0.1367, "step": 60 }, { "epoch": 0.11, "grad_norm": 78.35698699951172, "learning_rate": 0.0009977413286217727, "loss": 2.2474, "step": 61 }, { "epoch": 0.11, "grad_norm": 0.15264186263084412, "learning_rate": 0.0009976519549583973, "loss": 0.1311, "step": 62 }, { "epoch": 0.11, "grad_norm": 0.31865784525871277, "learning_rate": 0.0009975608513769975, "loss": 0.1407, "step": 63 }, { "epoch": 0.11, "grad_norm": 0.32891547679901123, "learning_rate": 0.0009974680181942645, "loss": 0.1423, "step": 64 }, { "epoch": 0.11, "grad_norm": 0.15653717517852783, "learning_rate": 0.0009973734557329008, "loss": 0.1365, "step": 65 }, { "epoch": 0.12, "grad_norm": 0.3237778842449188, "learning_rate": 0.0009972771643216212, "loss": 0.1407, "step": 66 }, { "epoch": 0.12, "grad_norm": 0.13634416460990906, "learning_rate": 0.0009971791442951496, "loss": 0.1378, "step": 67 }, { "epoch": 0.12, "grad_norm": 0.3488883376121521, "learning_rate": 0.0009970793959942197, "loss": 0.1429, "step": 68 }, { "epoch": 0.12, "grad_norm": 0.5150622129440308, "learning_rate": 0.0009969779197655725, "loss": 0.1492, "step": 69 }, { "epoch": 0.12, "grad_norm": 0.3482552468776703, "learning_rate": 0.0009968747159619555, "loss": 0.1415, "step": 70 }, { "epoch": 0.13, "grad_norm": 0.22551549971103668, "learning_rate": 0.000996769784942122, "loss": 0.1418, "step": 71 }, { "epoch": 0.13, "grad_norm": 0.20759086310863495, "learning_rate": 0.0009966631270708287, "loss": 0.1366, "step": 72 }, { "epoch": 0.13, "grad_norm": 13.050313949584961, "learning_rate": 0.0009965547427188356, "loss": 0.1375, "step": 73 }, { "epoch": 0.13, "grad_norm": 0.18372055888175964, "learning_rate": 0.0009964446322629043, "loss": 0.1285, "step": 74 }, { "epoch": 0.13, "grad_norm": 0.4404817819595337, "learning_rate": 0.000996332796085796, "loss": 0.1501, "step": 75 }, { "epoch": 0.13, "grad_norm": 1.269240379333496, "learning_rate": 0.0009962192345762716, "loss": 0.1346, "step": 76 }, { "epoch": 0.14, "grad_norm": 32.20164108276367, "learning_rate": 0.0009961039481290888, "loss": 0.3348, "step": 77 }, { "epoch": 0.14, "grad_norm": 82.92976379394531, "learning_rate": 0.0009959869371450021, "loss": 5.8309, "step": 78 }, { "epoch": 0.14, "grad_norm": 0.3416314721107483, "learning_rate": 0.0009958682020307602, "loss": 0.1418, "step": 79 }, { "epoch": 0.14, "grad_norm": 31.961870193481445, "learning_rate": 0.0009957477431991053, "loss": 0.1899, "step": 80 }, { "epoch": 0.14, "grad_norm": 38.58375930786133, "learning_rate": 0.000995625561068772, "loss": 0.5641, "step": 81 }, { "epoch": 0.15, "grad_norm": 0.32622194290161133, "learning_rate": 0.0009955016560644846, "loss": 0.1144, "step": 82 }, { "epoch": 0.15, "grad_norm": 6.264970779418945, "learning_rate": 0.0009953760286169572, "loss": 0.4788, "step": 83 }, { "epoch": 0.15, "grad_norm": 0.07168668508529663, "learning_rate": 0.0009952486791628904, "loss": 0.1326, "step": 84 }, { "epoch": 0.15, "grad_norm": 35.18340301513672, "learning_rate": 0.000995119608144972, "loss": 0.3884, "step": 85 }, { "epoch": 0.15, "grad_norm": 0.03896519914269447, "learning_rate": 0.000994988816011873, "loss": 0.1249, "step": 86 }, { "epoch": 0.15, "grad_norm": 14.499520301818848, "learning_rate": 0.000994856303218248, "loss": 0.3756, "step": 87 }, { "epoch": 0.16, "grad_norm": 0.3134947419166565, "learning_rate": 0.000994722070224733, "loss": 0.1539, "step": 88 }, { "epoch": 0.16, "grad_norm": 117.39696502685547, "learning_rate": 0.000994586117497943, "loss": 0.5885, "step": 89 }, { "epoch": 0.16, "grad_norm": 37.93465805053711, "learning_rate": 0.0009944484455104716, "loss": 0.7709, "step": 90 }, { "epoch": 0.16, "grad_norm": 236.63330078125, "learning_rate": 0.0009943090547408888, "loss": 6.0182, "step": 91 }, { "epoch": 0.16, "grad_norm": 1.1088515520095825, "learning_rate": 0.0009941679456737394, "loss": 0.1931, "step": 92 }, { "epoch": 0.16, "grad_norm": 0.11310256272554398, "learning_rate": 0.0009940251187995411, "loss": 0.1293, "step": 93 }, { "epoch": 0.17, "grad_norm": 0.6143047213554382, "learning_rate": 0.0009938805746147828, "loss": 0.2364, "step": 94 }, { "epoch": 0.17, "grad_norm": 0.2461577206850052, "learning_rate": 0.0009937343136219232, "loss": 0.1504, "step": 95 }, { "epoch": 0.17, "grad_norm": 97.17162322998047, "learning_rate": 0.0009935863363293895, "loss": 5.764, "step": 96 }, { "epoch": 0.17, "grad_norm": 0.5417380928993225, "learning_rate": 0.000993436643251574, "loss": 0.1576, "step": 97 }, { "epoch": 0.17, "grad_norm": 0.2737255096435547, "learning_rate": 0.0009932852349088341, "loss": 0.1437, "step": 98 }, { "epoch": 0.18, "grad_norm": 138.00778198242188, "learning_rate": 0.0009931321118274896, "loss": 4.1331, "step": 99 }, { "epoch": 0.18, "grad_norm": 46.4688606262207, "learning_rate": 0.0009929772745398205, "loss": 0.6178, "step": 100 }, { "epoch": 0.18, "grad_norm": 0.49907386302948, "learning_rate": 0.0009928207235840663, "loss": 0.1445, "step": 101 }, { "epoch": 0.18, "grad_norm": 0.33814460039138794, "learning_rate": 0.0009926624595044233, "loss": 0.139, "step": 102 }, { "epoch": 0.18, "grad_norm": 0.3241071403026581, "learning_rate": 0.0009925024828510427, "loss": 0.1404, "step": 103 }, { "epoch": 0.18, "grad_norm": 78.4036865234375, "learning_rate": 0.000992340794180029, "loss": 1.2663, "step": 104 }, { "epoch": 0.19, "grad_norm": 1.113776445388794, "learning_rate": 0.000992177394053438, "loss": 0.162, "step": 105 }, { "epoch": 0.19, "grad_norm": 16.512048721313477, "learning_rate": 0.0009920122830392748, "loss": 3.3373, "step": 106 }, { "epoch": 0.19, "grad_norm": 111.53176879882812, "learning_rate": 0.0009918454617114918, "loss": 2.3969, "step": 107 }, { "epoch": 0.19, "grad_norm": 14.91741943359375, "learning_rate": 0.0009916769306499865, "loss": 1.8837, "step": 108 }, { "epoch": 0.19, "grad_norm": 61.30055618286133, "learning_rate": 0.0009915066904406, "loss": 10.4922, "step": 109 }, { "epoch": 0.19, "grad_norm": 0.6948704123497009, "learning_rate": 0.0009913347416751147, "loss": 0.1536, "step": 110 }, { "epoch": 0.2, "grad_norm": 0.7721084356307983, "learning_rate": 0.000991161084951252, "loss": 0.1356, "step": 111 }, { "epoch": 0.2, "grad_norm": 0.24614596366882324, "learning_rate": 0.0009909857208726704, "loss": 0.1339, "step": 112 }, { "epoch": 0.2, "grad_norm": 7.189969062805176, "learning_rate": 0.0009908086500489638, "loss": 0.2551, "step": 113 }, { "epoch": 0.2, "grad_norm": 0.8675662279129028, "learning_rate": 0.0009906298730956585, "loss": 0.1668, "step": 114 }, { "epoch": 0.2, "grad_norm": 0.605249285697937, "learning_rate": 0.0009904493906342123, "loss": 0.1478, "step": 115 }, { "epoch": 0.21, "grad_norm": 0.8765722513198853, "learning_rate": 0.0009902672032920106, "loss": 0.1598, "step": 116 }, { "epoch": 0.21, "grad_norm": 0.6021157503128052, "learning_rate": 0.0009900833117023665, "loss": 0.1506, "step": 117 }, { "epoch": 0.21, "grad_norm": 0.28180792927742004, "learning_rate": 0.000989897716504516, "loss": 0.1389, "step": 118 }, { "epoch": 0.21, "grad_norm": 0.21730898320674896, "learning_rate": 0.0009897104183436184, "loss": 0.1377, "step": 119 }, { "epoch": 0.21, "grad_norm": 0.977118730545044, "learning_rate": 0.0009895214178707516, "loss": 0.1698, "step": 120 }, { "epoch": 0.21, "grad_norm": 2.674729585647583, "learning_rate": 0.0009893307157429118, "loss": 0.1559, "step": 121 }, { "epoch": 0.22, "grad_norm": 0.9852035045623779, "learning_rate": 0.0009891383126230102, "loss": 0.2027, "step": 122 }, { "epoch": 0.22, "grad_norm": 0.36689773201942444, "learning_rate": 0.0009889442091798712, "loss": 0.1498, "step": 123 }, { "epoch": 0.22, "grad_norm": 0.104621522128582, "learning_rate": 0.000988748406088229, "loss": 0.1379, "step": 124 }, { "epoch": 0.22, "grad_norm": 74.17496490478516, "learning_rate": 0.0009885509040287268, "loss": 0.7724, "step": 125 }, { "epoch": 0.22, "grad_norm": 1.2943025827407837, "learning_rate": 0.0009883517036879132, "loss": 0.2643, "step": 126 }, { "epoch": 0.22, "grad_norm": 0.828774094581604, "learning_rate": 0.000988150805758241, "loss": 0.1852, "step": 127 }, { "epoch": 0.23, "grad_norm": 0.13165877759456635, "learning_rate": 0.0009879482109380632, "loss": 0.1429, "step": 128 }, { "epoch": 0.23, "grad_norm": 0.662426769733429, "learning_rate": 0.0009877439199316323, "loss": 0.1643, "step": 129 }, { "epoch": 0.23, "grad_norm": 0.6256189942359924, "learning_rate": 0.0009875379334490962, "loss": 0.157, "step": 130 }, { "epoch": 0.23, "grad_norm": 0.5049256086349487, "learning_rate": 0.0009873302522064972, "loss": 0.1484, "step": 131 }, { "epoch": 0.23, "grad_norm": 1.4133671522140503, "learning_rate": 0.0009871208769257685, "loss": 0.1736, "step": 132 }, { "epoch": 0.24, "grad_norm": 0.7930824756622314, "learning_rate": 0.0009869098083347323, "loss": 0.1543, "step": 133 }, { "epoch": 0.24, "grad_norm": 0.5717449188232422, "learning_rate": 0.0009866970471670965, "loss": 0.1338, "step": 134 }, { "epoch": 0.24, "grad_norm": 0.582081139087677, "learning_rate": 0.0009864825941624537, "loss": 0.1692, "step": 135 }, { "epoch": 0.24, "grad_norm": 10.226588249206543, "learning_rate": 0.0009862664500662763, "loss": 0.2425, "step": 136 }, { "epoch": 0.24, "grad_norm": 1.1186953783035278, "learning_rate": 0.0009860486156299164, "loss": 0.2052, "step": 137 }, { "epoch": 0.24, "grad_norm": 0.2953661382198334, "learning_rate": 0.000985829091610601, "loss": 0.1408, "step": 138 }, { "epoch": 0.25, "grad_norm": 0.8647088408470154, "learning_rate": 0.000985607878771431, "loss": 0.1571, "step": 139 }, { "epoch": 0.25, "grad_norm": 0.41964420676231384, "learning_rate": 0.0009853849778813776, "loss": 0.1477, "step": 140 }, { "epoch": 0.25, "grad_norm": 0.25675931572914124, "learning_rate": 0.0009851603897152803, "loss": 0.1398, "step": 141 }, { "epoch": 0.25, "grad_norm": 0.2311631143093109, "learning_rate": 0.0009849341150538434, "loss": 0.1432, "step": 142 }, { "epoch": 0.25, "eval_loss": 1.5366541147232056, "eval_runtime": 14.6962, "eval_samples_per_second": 32.457, "eval_steps_per_second": 8.165, "step": 142 }, { "epoch": 0.25, "grad_norm": 41.83562469482422, "learning_rate": 0.0009847061546836339, "loss": 1.1525, "step": 143 }, { "epoch": 0.25, "grad_norm": 0.27440375089645386, "learning_rate": 0.0009844765093970787, "loss": 0.1452, "step": 144 }, { "epoch": 0.26, "grad_norm": 0.27643319964408875, "learning_rate": 0.0009842451799924616, "loss": 0.1069, "step": 145 }, { "epoch": 0.26, "grad_norm": 0.21519601345062256, "learning_rate": 0.0009840121672739207, "loss": 0.1358, "step": 146 }, { "epoch": 0.26, "grad_norm": 0.4073689877986908, "learning_rate": 0.0009837774720514456, "loss": 0.1545, "step": 147 }, { "epoch": 0.26, "grad_norm": 0.13685636222362518, "learning_rate": 0.0009835410951408747, "loss": 0.1259, "step": 148 }, { "epoch": 0.26, "grad_norm": 0.07474564015865326, "learning_rate": 0.000983303037363892, "loss": 0.1356, "step": 149 }, { "epoch": 0.27, "grad_norm": 0.45116662979125977, "learning_rate": 0.0009830632995480241, "loss": 0.1379, "step": 150 }, { "epoch": 0.27, "grad_norm": 0.1297813504934311, "learning_rate": 0.0009828218825266388, "loss": 0.1343, "step": 151 }, { "epoch": 0.27, "grad_norm": 0.5846492052078247, "learning_rate": 0.00098257878713894, "loss": 0.1563, "step": 152 }, { "epoch": 0.27, "grad_norm": 0.38457778096199036, "learning_rate": 0.0009823340142299662, "loss": 0.1477, "step": 153 }, { "epoch": 0.27, "grad_norm": 0.09184035658836365, "learning_rate": 0.0009820875646505873, "loss": 0.1376, "step": 154 }, { "epoch": 0.27, "grad_norm": 0.5166211128234863, "learning_rate": 0.0009818394392575019, "loss": 0.1498, "step": 155 }, { "epoch": 0.28, "grad_norm": 0.2788640260696411, "learning_rate": 0.0009815896389132332, "loss": 0.1434, "step": 156 }, { "epoch": 0.28, "grad_norm": 0.3762676417827606, "learning_rate": 0.0009813381644861276, "loss": 0.1482, "step": 157 }, { "epoch": 0.28, "grad_norm": 0.3615610897541046, "learning_rate": 0.0009810850168503506, "loss": 0.1312, "step": 158 }, { "epoch": 0.28, "grad_norm": 0.03483320027589798, "learning_rate": 0.0009808301968858837, "loss": 0.1239, "step": 159 }, { "epoch": 0.28, "grad_norm": 0.5616227984428406, "learning_rate": 0.0009805737054785222, "loss": 0.1881, "step": 160 }, { "epoch": 0.28, "grad_norm": 0.029542161151766777, "learning_rate": 0.000980315543519871, "loss": 0.1254, "step": 161 }, { "epoch": 0.29, "grad_norm": 0.142581045627594, "learning_rate": 0.0009800557119073433, "loss": 0.1258, "step": 162 }, { "epoch": 0.29, "grad_norm": 0.7289375066757202, "learning_rate": 0.0009797942115441546, "loss": 0.1526, "step": 163 }, { "epoch": 0.29, "grad_norm": 0.6975064873695374, "learning_rate": 0.0009795310433393224, "loss": 0.1487, "step": 164 }, { "epoch": 0.29, "grad_norm": 1.3072260618209839, "learning_rate": 0.0009792662082076617, "loss": 0.1712, "step": 165 }, { "epoch": 0.29, "grad_norm": 0.2993917465209961, "learning_rate": 0.000978999707069782, "loss": 0.1424, "step": 166 }, { "epoch": 0.3, "grad_norm": 0.3258236050605774, "learning_rate": 0.0009787315408520839, "loss": 0.135, "step": 167 }, { "epoch": 0.3, "grad_norm": 0.26566603779792786, "learning_rate": 0.000978461710486756, "loss": 0.1441, "step": 168 }, { "epoch": 0.3, "grad_norm": 1.1709599494934082, "learning_rate": 0.0009781902169117718, "loss": 0.2084, "step": 169 }, { "epoch": 0.3, "grad_norm": 0.6554279923439026, "learning_rate": 0.000977917061070887, "loss": 0.1634, "step": 170 }, { "epoch": 0.3, "grad_norm": 0.1635073721408844, "learning_rate": 0.000977642243913635, "loss": 0.1371, "step": 171 }, { "epoch": 0.3, "grad_norm": 0.4419834017753601, "learning_rate": 0.0009773657663953242, "loss": 0.1523, "step": 172 }, { "epoch": 0.31, "grad_norm": 0.839259147644043, "learning_rate": 0.000977087629477035, "loss": 0.1628, "step": 173 }, { "epoch": 0.31, "grad_norm": 0.1979222148656845, "learning_rate": 0.0009768078341256155, "loss": 0.1367, "step": 174 }, { "epoch": 0.31, "grad_norm": 0.2939910888671875, "learning_rate": 0.0009765263813136795, "loss": 0.1349, "step": 175 }, { "epoch": 0.31, "grad_norm": 0.19882674515247345, "learning_rate": 0.0009762432720196024, "loss": 0.1424, "step": 176 }, { "epoch": 0.31, "grad_norm": 0.07146954536437988, "learning_rate": 0.000975958507227517, "loss": 0.1237, "step": 177 }, { "epoch": 0.31, "grad_norm": 0.5031868815422058, "learning_rate": 0.0009756720879273117, "loss": 0.1592, "step": 178 }, { "epoch": 0.32, "grad_norm": 0.14860151708126068, "learning_rate": 0.0009753840151146258, "loss": 0.1396, "step": 179 }, { "epoch": 0.32, "grad_norm": 0.10280521959066391, "learning_rate": 0.0009750942897908468, "loss": 0.1333, "step": 180 }, { "epoch": 0.32, "grad_norm": 0.4652903974056244, "learning_rate": 0.0009748029129631061, "loss": 0.1421, "step": 181 }, { "epoch": 0.32, "grad_norm": 0.3985591530799866, "learning_rate": 0.0009745098856442768, "loss": 0.1459, "step": 182 }, { "epoch": 0.32, "grad_norm": 0.20321591198444366, "learning_rate": 0.0009742152088529683, "loss": 0.1381, "step": 183 }, { "epoch": 0.33, "grad_norm": 0.7694361805915833, "learning_rate": 0.0009739188836135246, "loss": 0.1676, "step": 184 }, { "epoch": 0.33, "grad_norm": 0.04469340294599533, "learning_rate": 0.0009736209109560201, "loss": 0.136, "step": 185 }, { "epoch": 0.33, "grad_norm": 0.08576061576604843, "learning_rate": 0.0009733212919162549, "loss": 0.1408, "step": 186 }, { "epoch": 0.33, "grad_norm": 0.042906519025564194, "learning_rate": 0.0009730200275357535, "loss": 0.1364, "step": 187 }, { "epoch": 0.33, "grad_norm": 0.30054494738578796, "learning_rate": 0.0009727171188617588, "loss": 0.1539, "step": 188 }, { "epoch": 0.33, "grad_norm": 0.05149005725979805, "learning_rate": 0.0009724125669472299, "loss": 0.1352, "step": 189 }, { "epoch": 0.34, "grad_norm": 0.1381620466709137, "learning_rate": 0.0009721063728508383, "loss": 0.1409, "step": 190 }, { "epoch": 0.34, "grad_norm": 0.37344205379486084, "learning_rate": 0.0009717985376369639, "loss": 0.1299, "step": 191 }, { "epoch": 0.34, "grad_norm": 0.1037706583738327, "learning_rate": 0.0009714890623756912, "loss": 0.1341, "step": 192 }, { "epoch": 0.34, "grad_norm": 0.14189712703227997, "learning_rate": 0.0009711779481428056, "loss": 0.1418, "step": 193 }, { "epoch": 0.34, "grad_norm": 0.15108801424503326, "learning_rate": 0.0009708651960197903, "loss": 0.142, "step": 194 }, { "epoch": 0.34, "grad_norm": 0.037045519798994064, "learning_rate": 0.0009705508070938218, "loss": 0.1315, "step": 195 }, { "epoch": 0.35, "grad_norm": 0.23301652073860168, "learning_rate": 0.0009702347824577666, "loss": 0.1396, "step": 196 }, { "epoch": 0.35, "grad_norm": 0.08476269990205765, "learning_rate": 0.0009699171232101768, "loss": 0.1392, "step": 197 }, { "epoch": 0.35, "grad_norm": 0.4222690463066101, "learning_rate": 0.000969597830455287, "loss": 0.1463, "step": 198 }, { "epoch": 0.35, "grad_norm": 0.3234136402606964, "learning_rate": 0.0009692769053030099, "loss": 0.1257, "step": 199 }, { "epoch": 0.35, "grad_norm": 0.04025443643331528, "learning_rate": 0.0009689543488689332, "loss": 0.1303, "step": 200 }, { "epoch": 0.36, "grad_norm": 0.07074520736932755, "learning_rate": 0.0009686301622743144, "loss": 0.1289, "step": 201 }, { "epoch": 0.36, "grad_norm": 0.0788850486278534, "learning_rate": 0.0009683043466460782, "loss": 0.1236, "step": 202 }, { "epoch": 0.36, "grad_norm": 0.525541365146637, "learning_rate": 0.000967976903116812, "loss": 0.1564, "step": 203 }, { "epoch": 0.36, "grad_norm": 0.6145509481430054, "learning_rate": 0.0009676478328247623, "loss": 0.156, "step": 204 }, { "epoch": 0.36, "grad_norm": 0.230132058262825, "learning_rate": 0.0009673171369138296, "loss": 0.1425, "step": 205 }, { "epoch": 0.36, "grad_norm": 0.03262978792190552, "learning_rate": 0.0009669848165335666, "loss": 0.1297, "step": 206 }, { "epoch": 0.37, "grad_norm": 0.0462469644844532, "learning_rate": 0.0009666508728391718, "loss": 0.1177, "step": 207 }, { "epoch": 0.37, "grad_norm": 0.06880385428667068, "learning_rate": 0.0009663153069914874, "loss": 0.1207, "step": 208 }, { "epoch": 0.37, "grad_norm": 0.4248260259628296, "learning_rate": 0.000965978120156994, "loss": 0.1571, "step": 209 }, { "epoch": 0.37, "grad_norm": 0.060492075979709625, "learning_rate": 0.0009656393135078068, "loss": 0.1219, "step": 210 }, { "epoch": 0.37, "grad_norm": 0.12135621905326843, "learning_rate": 0.0009652988882216725, "loss": 0.1323, "step": 211 }, { "epoch": 0.38, "grad_norm": 0.252119243144989, "learning_rate": 0.0009649568454819637, "loss": 0.1366, "step": 212 }, { "epoch": 0.38, "grad_norm": 0.5283567905426025, "learning_rate": 0.0009646131864776761, "loss": 0.1246, "step": 213 }, { "epoch": 0.38, "grad_norm": 2.224665880203247, "learning_rate": 0.0009642679124034233, "loss": 0.2582, "step": 214 }, { "epoch": 0.38, "grad_norm": 1.9277523756027222, "learning_rate": 0.0009639210244594335, "loss": 0.2131, "step": 215 }, { "epoch": 0.38, "grad_norm": 0.5668452978134155, "learning_rate": 0.0009635725238515446, "loss": 0.141, "step": 216 }, { "epoch": 0.38, "grad_norm": 0.13912492990493774, "learning_rate": 0.000963222411791201, "loss": 0.1418, "step": 217 }, { "epoch": 0.39, "grad_norm": 0.39307814836502075, "learning_rate": 0.0009628706894954479, "loss": 0.1477, "step": 218 }, { "epoch": 0.39, "grad_norm": 0.26248928904533386, "learning_rate": 0.000962517358186929, "loss": 0.1315, "step": 219 }, { "epoch": 0.39, "grad_norm": 0.2875257730484009, "learning_rate": 0.0009621624190938803, "loss": 0.1321, "step": 220 }, { "epoch": 0.39, "grad_norm": 0.6386964917182922, "learning_rate": 0.0009618058734501269, "loss": 0.1668, "step": 221 }, { "epoch": 0.39, "grad_norm": 0.16165001690387726, "learning_rate": 0.0009614477224950789, "loss": 0.1272, "step": 222 }, { "epoch": 0.39, "grad_norm": 0.6959558129310608, "learning_rate": 0.0009610879674737262, "loss": 0.1381, "step": 223 }, { "epoch": 0.4, "grad_norm": 0.1701437532901764, "learning_rate": 0.0009607266096366352, "loss": 0.1366, "step": 224 }, { "epoch": 0.4, "grad_norm": 0.2511409819126129, "learning_rate": 0.0009603636502399437, "loss": 0.126, "step": 225 }, { "epoch": 0.4, "grad_norm": 0.04554220288991928, "learning_rate": 0.0009599990905453566, "loss": 0.1321, "step": 226 }, { "epoch": 0.4, "grad_norm": 0.3964705765247345, "learning_rate": 0.000959632931820142, "loss": 0.1383, "step": 227 }, { "epoch": 0.4, "grad_norm": 0.10925984382629395, "learning_rate": 0.0009592651753371264, "loss": 0.1226, "step": 228 }, { "epoch": 0.41, "grad_norm": 0.19012318551540375, "learning_rate": 0.0009588958223746903, "loss": 0.1255, "step": 229 }, { "epoch": 0.41, "grad_norm": 0.23432157933712006, "learning_rate": 0.0009585248742167639, "loss": 0.1152, "step": 230 }, { "epoch": 0.41, "grad_norm": 0.1737753301858902, "learning_rate": 0.0009581523321528223, "loss": 0.1468, "step": 231 }, { "epoch": 0.41, "grad_norm": 0.2625434100627899, "learning_rate": 0.0009577781974778817, "loss": 0.1296, "step": 232 }, { "epoch": 0.41, "grad_norm": 0.3056884706020355, "learning_rate": 0.000957402471492494, "loss": 0.1574, "step": 233 }, { "epoch": 0.41, "grad_norm": 0.4111999273300171, "learning_rate": 0.0009570251555027432, "loss": 0.1434, "step": 234 }, { "epoch": 0.42, "grad_norm": 0.056673482060432434, "learning_rate": 0.0009566462508202401, "loss": 0.1337, "step": 235 }, { "epoch": 0.42, "grad_norm": 0.3861597180366516, "learning_rate": 0.0009562657587621184, "loss": 0.1609, "step": 236 }, { "epoch": 0.42, "grad_norm": 0.35893362760543823, "learning_rate": 0.0009558836806510292, "loss": 0.1189, "step": 237 }, { "epoch": 0.42, "grad_norm": 0.40538331866264343, "learning_rate": 0.0009555000178151374, "loss": 0.1504, "step": 238 }, { "epoch": 0.42, "grad_norm": 81.36141967773438, "learning_rate": 0.0009551147715881167, "loss": 4.7235, "step": 239 }, { "epoch": 0.42, "grad_norm": 0.21178042888641357, "learning_rate": 0.0009547279433091446, "loss": 0.1139, "step": 240 }, { "epoch": 0.43, "grad_norm": 0.27380529046058655, "learning_rate": 0.0009543395343228983, "loss": 0.1504, "step": 241 }, { "epoch": 0.43, "grad_norm": 41.42683410644531, "learning_rate": 0.0009539495459795498, "loss": 1.2477, "step": 242 }, { "epoch": 0.43, "grad_norm": 0.14853385090827942, "learning_rate": 0.0009535579796347612, "loss": 0.1343, "step": 243 }, { "epoch": 0.43, "grad_norm": 0.3484509289264679, "learning_rate": 0.0009531648366496798, "loss": 0.15, "step": 244 }, { "epoch": 0.43, "grad_norm": 0.20152732729911804, "learning_rate": 0.0009527701183909336, "loss": 0.1399, "step": 245 }, { "epoch": 0.44, "grad_norm": 80.84031677246094, "learning_rate": 0.000952373826230627, "loss": 3.1939, "step": 246 }, { "epoch": 0.44, "grad_norm": 15.475607872009277, "learning_rate": 0.0009519759615463346, "loss": 3.3935, "step": 247 }, { "epoch": 0.44, "grad_norm": 77.19477081298828, "learning_rate": 0.0009515765257210979, "loss": 6.5034, "step": 248 }, { "epoch": 0.44, "grad_norm": 0.1174071803689003, "learning_rate": 0.0009511755201434205, "loss": 0.1212, "step": 249 }, { "epoch": 0.44, "grad_norm": 16.503982543945312, "learning_rate": 0.0009507729462072614, "loss": 0.3753, "step": 250 }, { "epoch": 0.44, "grad_norm": 76.65412902832031, "learning_rate": 0.0009503688053120326, "loss": 0.9386, "step": 251 }, { "epoch": 0.45, "grad_norm": 94.82160186767578, "learning_rate": 0.0009499630988625925, "loss": 4.7449, "step": 252 }, { "epoch": 0.45, "grad_norm": 0.2721010148525238, "learning_rate": 0.0009495558282692421, "loss": 0.1358, "step": 253 }, { "epoch": 0.45, "grad_norm": 0.5150814056396484, "learning_rate": 0.0009491469949477187, "loss": 0.1622, "step": 254 }, { "epoch": 0.45, "grad_norm": 51.050167083740234, "learning_rate": 0.0009487366003191931, "loss": 0.7818, "step": 255 }, { "epoch": 0.45, "grad_norm": 11.698090553283691, "learning_rate": 0.0009483246458102625, "loss": 0.3862, "step": 256 }, { "epoch": 0.45, "grad_norm": 0.648543655872345, "learning_rate": 0.0009479111328529472, "loss": 0.1884, "step": 257 }, { "epoch": 0.46, "grad_norm": 0.745293140411377, "learning_rate": 0.0009474960628846843, "loss": 0.1562, "step": 258 }, { "epoch": 0.46, "grad_norm": 0.17890043556690216, "learning_rate": 0.0009470794373483235, "loss": 0.1425, "step": 259 }, { "epoch": 0.46, "grad_norm": 0.5058090686798096, "learning_rate": 0.0009466612576921223, "loss": 0.17, "step": 260 }, { "epoch": 0.46, "grad_norm": 1.3177820444107056, "learning_rate": 0.00094624152536974, "loss": 0.15, "step": 261 }, { "epoch": 0.46, "grad_norm": 0.49652573466300964, "learning_rate": 0.0009458202418402337, "loss": 0.145, "step": 262 }, { "epoch": 0.47, "grad_norm": 11.423394203186035, "learning_rate": 0.0009453974085680526, "loss": 0.349, "step": 263 }, { "epoch": 0.47, "grad_norm": 1.5422337055206299, "learning_rate": 0.0009449730270230326, "loss": 0.211, "step": 264 }, { "epoch": 0.47, "grad_norm": 103.68435668945312, "learning_rate": 0.0009445470986803921, "loss": 17.4069, "step": 265 }, { "epoch": 0.47, "grad_norm": 54.51758575439453, "learning_rate": 0.0009441196250207267, "loss": 15.685, "step": 266 }, { "epoch": 0.47, "grad_norm": 14.596623420715332, "learning_rate": 0.0009436906075300032, "loss": 0.791, "step": 267 }, { "epoch": 0.47, "grad_norm": 3.3164780139923096, "learning_rate": 0.000943260047699555, "loss": 0.3611, "step": 268 }, { "epoch": 0.48, "grad_norm": 0.3087855577468872, "learning_rate": 0.0009428279470260776, "loss": 0.1332, "step": 269 }, { "epoch": 0.48, "grad_norm": 1.1544523239135742, "learning_rate": 0.0009423943070116219, "loss": 0.2405, "step": 270 }, { "epoch": 0.48, "grad_norm": 0.27010253071784973, "learning_rate": 0.00094195912916359, "loss": 0.1241, "step": 271 }, { "epoch": 0.48, "grad_norm": 0.2287709265947342, "learning_rate": 0.0009415224149947306, "loss": 0.1366, "step": 272 }, { "epoch": 0.48, "grad_norm": 0.5216432809829712, "learning_rate": 0.0009410841660231316, "loss": 0.1641, "step": 273 }, { "epoch": 0.48, "grad_norm": 1.3091949224472046, "learning_rate": 0.0009406443837722167, "loss": 0.2524, "step": 274 }, { "epoch": 0.49, "grad_norm": 0.11813609302043915, "learning_rate": 0.0009402030697707398, "loss": 0.1353, "step": 275 }, { "epoch": 0.49, "grad_norm": 1.3709551095962524, "learning_rate": 0.000939760225552779, "loss": 0.2714, "step": 276 }, { "epoch": 0.49, "grad_norm": 8.527563095092773, "learning_rate": 0.0009393158526577322, "loss": 0.1955, "step": 277 }, { "epoch": 0.49, "grad_norm": 21.874027252197266, "learning_rate": 0.0009388699526303105, "loss": 0.2398, "step": 278 }, { "epoch": 0.49, "grad_norm": 51.793731689453125, "learning_rate": 0.0009384225270205339, "loss": 1.3069, "step": 279 }, { "epoch": 0.5, "grad_norm": 0.6711062788963318, "learning_rate": 0.0009379735773837259, "loss": 0.1664, "step": 280 }, { "epoch": 0.5, "grad_norm": 5.93789005279541, "learning_rate": 0.0009375231052805072, "loss": 0.2455, "step": 281 }, { "epoch": 0.5, "grad_norm": 62.527198791503906, "learning_rate": 0.0009370711122767912, "loss": 6.6447, "step": 282 }, { "epoch": 0.5, "grad_norm": 22.35348129272461, "learning_rate": 0.000936617599943778, "loss": 2.5015, "step": 283 }, { "epoch": 0.5, "grad_norm": 0.7277780175209045, "learning_rate": 0.0009361625698579493, "loss": 0.1667, "step": 284 }, { "epoch": 0.5, "eval_loss": 0.14179374277591705, "eval_runtime": 14.7139, "eval_samples_per_second": 32.418, "eval_steps_per_second": 8.156, "step": 284 }, { "epoch": 0.5, "grad_norm": 0.26271358132362366, "learning_rate": 0.0009357060236010625, "loss": 0.1429, "step": 285 }, { "epoch": 0.51, "grad_norm": 21.24464988708496, "learning_rate": 0.0009352479627601457, "loss": 2.0706, "step": 286 }, { "epoch": 0.51, "grad_norm": 6.5764265060424805, "learning_rate": 0.0009347883889274922, "loss": 0.3337, "step": 287 }, { "epoch": 0.51, "grad_norm": 0.6868380904197693, "learning_rate": 0.0009343273037006539, "loss": 0.1994, "step": 288 }, { "epoch": 0.51, "grad_norm": 0.9018234610557556, "learning_rate": 0.0009338647086824372, "loss": 0.1908, "step": 289 }, { "epoch": 0.51, "grad_norm": 1.7751502990722656, "learning_rate": 0.0009334006054808966, "loss": 0.2028, "step": 290 }, { "epoch": 0.51, "grad_norm": 0.5386408567428589, "learning_rate": 0.0009329349957093293, "loss": 0.1853, "step": 291 }, { "epoch": 0.52, "grad_norm": 1.4171103239059448, "learning_rate": 0.0009324678809862695, "loss": 0.3597, "step": 292 }, { "epoch": 0.52, "grad_norm": 0.4105970561504364, "learning_rate": 0.0009319992629354827, "loss": 0.1344, "step": 293 }, { "epoch": 0.52, "grad_norm": 0.26628127694129944, "learning_rate": 0.000931529143185961, "loss": 0.1453, "step": 294 }, { "epoch": 0.52, "grad_norm": 14.981964111328125, "learning_rate": 0.0009310575233719154, "loss": 0.2563, "step": 295 }, { "epoch": 0.52, "grad_norm": 0.6945788264274597, "learning_rate": 0.0009305844051327725, "loss": 0.1229, "step": 296 }, { "epoch": 0.53, "grad_norm": 31.034496307373047, "learning_rate": 0.000930109790113167, "loss": 1.2974, "step": 297 }, { "epoch": 0.53, "grad_norm": 1.5794603824615479, "learning_rate": 0.0009296336799629368, "loss": 0.22, "step": 298 }, { "epoch": 0.53, "grad_norm": 0.33219394087791443, "learning_rate": 0.0009291560763371172, "loss": 0.1262, "step": 299 }, { "epoch": 0.53, "grad_norm": 2.597118377685547, "learning_rate": 0.000928676980895935, "loss": 0.4026, "step": 300 }, { "epoch": 0.53, "grad_norm": 13.547090530395508, "learning_rate": 0.0009281963953048029, "loss": 1.3086, "step": 301 }, { "epoch": 0.53, "grad_norm": 1.289302945137024, "learning_rate": 0.0009277143212343134, "loss": 0.2215, "step": 302 }, { "epoch": 0.54, "grad_norm": 1.2176313400268555, "learning_rate": 0.0009272307603602334, "loss": 0.15, "step": 303 }, { "epoch": 0.54, "grad_norm": 4.436944007873535, "learning_rate": 0.0009267457143634979, "loss": 0.514, "step": 304 }, { "epoch": 0.54, "grad_norm": 29.960241317749023, "learning_rate": 0.0009262591849302047, "loss": 3.5389, "step": 305 }, { "epoch": 0.54, "grad_norm": 5.514049530029297, "learning_rate": 0.0009257711737516082, "loss": 0.2902, "step": 306 }, { "epoch": 0.54, "grad_norm": 2.331019401550293, "learning_rate": 0.0009252816825241135, "loss": 0.2775, "step": 307 }, { "epoch": 0.54, "grad_norm": 0.5708584189414978, "learning_rate": 0.0009247907129492707, "loss": 0.1438, "step": 308 }, { "epoch": 0.55, "grad_norm": 2.16607928276062, "learning_rate": 0.0009242982667337685, "loss": 0.2383, "step": 309 }, { "epoch": 0.55, "grad_norm": 1.5346423387527466, "learning_rate": 0.0009238043455894293, "loss": 0.1793, "step": 310 }, { "epoch": 0.55, "grad_norm": 0.6200052499771118, "learning_rate": 0.000923308951233202, "loss": 0.1473, "step": 311 }, { "epoch": 0.55, "grad_norm": 64.87612915039062, "learning_rate": 0.0009228120853871572, "loss": 0.8875, "step": 312 }, { "epoch": 0.55, "grad_norm": 1.077471137046814, "learning_rate": 0.0009223137497784797, "loss": 0.2114, "step": 313 }, { "epoch": 0.56, "grad_norm": 3.5934722423553467, "learning_rate": 0.0009218139461394644, "loss": 0.2852, "step": 314 }, { "epoch": 0.56, "grad_norm": 0.10276800394058228, "learning_rate": 0.0009213126762075088, "loss": 0.1365, "step": 315 }, { "epoch": 0.56, "grad_norm": 3.9422831535339355, "learning_rate": 0.0009208099417251077, "loss": 0.2949, "step": 316 }, { "epoch": 0.56, "grad_norm": 1.7574914693832397, "learning_rate": 0.0009203057444398468, "loss": 0.2621, "step": 317 }, { "epoch": 0.56, "grad_norm": 0.29479530453681946, "learning_rate": 0.0009198000861043967, "loss": 0.1341, "step": 318 }, { "epoch": 0.56, "grad_norm": 0.5362827181816101, "learning_rate": 0.0009192929684765068, "loss": 0.1398, "step": 319 }, { "epoch": 0.57, "grad_norm": 0.8159481287002563, "learning_rate": 0.0009187843933189994, "loss": 0.1863, "step": 320 }, { "epoch": 0.57, "grad_norm": 0.9413295388221741, "learning_rate": 0.0009182743623997634, "loss": 0.2104, "step": 321 }, { "epoch": 0.57, "grad_norm": 0.5306220650672913, "learning_rate": 0.0009177628774917479, "loss": 0.1537, "step": 322 }, { "epoch": 0.57, "grad_norm": 0.8887706398963928, "learning_rate": 0.0009172499403729567, "loss": 0.1963, "step": 323 }, { "epoch": 0.57, "grad_norm": 0.8467744588851929, "learning_rate": 0.0009167355528264414, "loss": 0.204, "step": 324 }, { "epoch": 0.57, "grad_norm": 0.19867151975631714, "learning_rate": 0.0009162197166402956, "loss": 0.1407, "step": 325 }, { "epoch": 0.58, "grad_norm": 0.13638383150100708, "learning_rate": 0.0009157024336076487, "loss": 0.1408, "step": 326 }, { "epoch": 0.58, "grad_norm": 0.2027496099472046, "learning_rate": 0.0009151837055266594, "loss": 0.1444, "step": 327 }, { "epoch": 0.58, "grad_norm": 0.370151549577713, "learning_rate": 0.0009146635342005098, "loss": 0.158, "step": 328 }, { "epoch": 0.58, "grad_norm": 0.3114052414894104, "learning_rate": 0.000914141921437399, "loss": 0.1464, "step": 329 }, { "epoch": 0.58, "grad_norm": 0.15394961833953857, "learning_rate": 0.0009136188690505362, "loss": 0.1341, "step": 330 }, { "epoch": 0.59, "grad_norm": 0.46498528122901917, "learning_rate": 0.0009130943788581359, "loss": 0.1426, "step": 331 }, { "epoch": 0.59, "grad_norm": 0.28067877888679504, "learning_rate": 0.00091256845268341, "loss": 0.1409, "step": 332 }, { "epoch": 0.59, "grad_norm": 0.061186857521533966, "learning_rate": 0.0009120410923545619, "loss": 0.1401, "step": 333 }, { "epoch": 0.59, "grad_norm": 0.26736098527908325, "learning_rate": 0.0009115122997047811, "loss": 0.1467, "step": 334 }, { "epoch": 0.59, "grad_norm": 0.5139696598052979, "learning_rate": 0.0009109820765722356, "loss": 0.1585, "step": 335 }, { "epoch": 0.59, "grad_norm": 0.40007275342941284, "learning_rate": 0.000910450424800066, "loss": 0.1473, "step": 336 }, { "epoch": 0.6, "grad_norm": 0.66825270652771, "learning_rate": 0.0009099173462363792, "loss": 0.1572, "step": 337 }, { "epoch": 0.6, "grad_norm": 0.5313024520874023, "learning_rate": 0.0009093828427342418, "loss": 0.1555, "step": 338 }, { "epoch": 0.6, "grad_norm": 0.4224655330181122, "learning_rate": 0.0009088469161516735, "loss": 0.1429, "step": 339 }, { "epoch": 0.6, "grad_norm": 0.03462248668074608, "learning_rate": 0.0009083095683516414, "loss": 0.1325, "step": 340 }, { "epoch": 0.6, "grad_norm": 0.542322039604187, "learning_rate": 0.0009077708012020524, "loss": 0.1755, "step": 341 }, { "epoch": 0.61, "grad_norm": 0.2164747267961502, "learning_rate": 0.0009072306165757476, "loss": 0.1458, "step": 342 }, { "epoch": 0.61, "grad_norm": 0.27414461970329285, "learning_rate": 0.0009066890163504955, "loss": 0.1512, "step": 343 }, { "epoch": 0.61, "grad_norm": 0.1911482959985733, "learning_rate": 0.0009061460024089853, "loss": 0.1185, "step": 344 }, { "epoch": 0.61, "grad_norm": 0.1287711262702942, "learning_rate": 0.0009056015766388205, "loss": 0.1372, "step": 345 }, { "epoch": 0.61, "grad_norm": 0.18598809838294983, "learning_rate": 0.0009050557409325125, "loss": 0.1341, "step": 346 }, { "epoch": 0.61, "grad_norm": 0.18694853782653809, "learning_rate": 0.0009045084971874737, "loss": 0.141, "step": 347 }, { "epoch": 0.62, "grad_norm": 0.06479912996292114, "learning_rate": 0.0009039598473060113, "loss": 0.1368, "step": 348 }, { "epoch": 0.62, "grad_norm": 0.17768733203411102, "learning_rate": 0.0009034097931953201, "loss": 0.1381, "step": 349 }, { "epoch": 0.62, "grad_norm": 0.28938984870910645, "learning_rate": 0.0009028583367674765, "loss": 0.1365, "step": 350 }, { "epoch": 0.62, "grad_norm": 0.2924034893512726, "learning_rate": 0.0009023054799394316, "loss": 0.1282, "step": 351 }, { "epoch": 0.62, "grad_norm": 0.28439652919769287, "learning_rate": 0.0009017512246330042, "loss": 0.151, "step": 352 }, { "epoch": 0.62, "grad_norm": 0.14329224824905396, "learning_rate": 0.0009011955727748749, "loss": 0.1419, "step": 353 }, { "epoch": 0.63, "grad_norm": 0.15245947241783142, "learning_rate": 0.0009006385262965785, "loss": 0.1163, "step": 354 }, { "epoch": 0.63, "grad_norm": 0.052399642765522, "learning_rate": 0.000900080087134498, "loss": 0.1241, "step": 355 }, { "epoch": 0.63, "grad_norm": 0.030301153659820557, "learning_rate": 0.0008995202572298575, "loss": 0.1232, "step": 356 }, { "epoch": 0.63, "grad_norm": 0.41738417744636536, "learning_rate": 0.0008989590385287155, "loss": 0.1675, "step": 357 }, { "epoch": 0.63, "grad_norm": 0.19307875633239746, "learning_rate": 0.0008983964329819583, "loss": 0.1328, "step": 358 }, { "epoch": 0.64, "grad_norm": 0.05682377517223358, "learning_rate": 0.000897832442545293, "loss": 0.1322, "step": 359 }, { "epoch": 0.64, "grad_norm": 0.15418089926242828, "learning_rate": 0.0008972670691792409, "loss": 0.1414, "step": 360 }, { "epoch": 0.64, "grad_norm": 0.07167459279298782, "learning_rate": 0.0008967003148491304, "loss": 0.1414, "step": 361 }, { "epoch": 0.64, "grad_norm": 0.2866109609603882, "learning_rate": 0.0008961321815250904, "loss": 0.1381, "step": 362 }, { "epoch": 0.64, "grad_norm": 0.281264990568161, "learning_rate": 0.0008955626711820438, "loss": 0.1365, "step": 363 }, { "epoch": 0.64, "grad_norm": 0.19263768196105957, "learning_rate": 0.0008949917857996997, "loss": 0.1394, "step": 364 }, { "epoch": 0.65, "grad_norm": 0.30531641840934753, "learning_rate": 0.0008944195273625471, "loss": 0.1478, "step": 365 }, { "epoch": 0.65, "grad_norm": 0.16229306161403656, "learning_rate": 0.0008938458978598483, "loss": 0.1412, "step": 366 }, { "epoch": 0.65, "grad_norm": 0.09315463900566101, "learning_rate": 0.0008932708992856315, "loss": 0.1397, "step": 367 }, { "epoch": 0.65, "grad_norm": 0.04228806868195534, "learning_rate": 0.0008926945336386838, "loss": 0.1383, "step": 368 }, { "epoch": 0.65, "grad_norm": 0.2209407389163971, "learning_rate": 0.0008921168029225448, "loss": 0.1434, "step": 369 }, { "epoch": 0.65, "grad_norm": 0.04254443198442459, "learning_rate": 0.0008915377091454992, "loss": 0.1326, "step": 370 }, { "epoch": 0.66, "grad_norm": 0.09651175886392593, "learning_rate": 0.0008909572543205698, "loss": 0.134, "step": 371 }, { "epoch": 0.66, "grad_norm": 0.2821654975414276, "learning_rate": 0.0008903754404655105, "loss": 0.1498, "step": 372 }, { "epoch": 0.66, "grad_norm": 0.43042680621147156, "learning_rate": 0.0008897922696027998, "loss": 0.1571, "step": 373 }, { "epoch": 0.66, "grad_norm": 0.06591568142175674, "learning_rate": 0.0008892077437596332, "loss": 0.1391, "step": 374 }, { "epoch": 0.66, "grad_norm": 0.08771979063749313, "learning_rate": 0.0008886218649679161, "loss": 0.1375, "step": 375 }, { "epoch": 0.67, "grad_norm": 0.03339942544698715, "learning_rate": 0.0008880346352642574, "loss": 0.1368, "step": 376 }, { "epoch": 0.67, "grad_norm": 0.15352453291416168, "learning_rate": 0.0008874460566899616, "loss": 0.1447, "step": 377 }, { "epoch": 0.67, "grad_norm": 0.1778584122657776, "learning_rate": 0.0008868561312910222, "loss": 0.1189, "step": 378 }, { "epoch": 0.67, "grad_norm": 0.11893154680728912, "learning_rate": 0.0008862648611181144, "loss": 0.1167, "step": 379 }, { "epoch": 0.67, "grad_norm": 0.4323861598968506, "learning_rate": 0.0008856722482265886, "loss": 0.1691, "step": 380 }, { "epoch": 0.67, "grad_norm": 0.28813356161117554, "learning_rate": 0.0008850782946764618, "loss": 0.1505, "step": 381 }, { "epoch": 0.68, "grad_norm": 0.5008757710456848, "learning_rate": 0.0008844830025324122, "loss": 0.1671, "step": 382 }, { "epoch": 0.68, "grad_norm": 0.12061876803636551, "learning_rate": 0.0008838863738637705, "loss": 0.1375, "step": 383 }, { "epoch": 0.68, "grad_norm": 0.6747052073478699, "learning_rate": 0.0008832884107445138, "loss": 0.1663, "step": 384 }, { "epoch": 0.68, "grad_norm": 0.18846777081489563, "learning_rate": 0.0008826891152532579, "loss": 0.1148, "step": 385 }, { "epoch": 0.68, "grad_norm": 0.0950111448764801, "learning_rate": 0.0008820884894732497, "loss": 0.1138, "step": 386 }, { "epoch": 0.68, "grad_norm": 0.42371127009391785, "learning_rate": 0.0008814865354923613, "loss": 0.142, "step": 387 }, { "epoch": 0.69, "grad_norm": 0.17662374675273895, "learning_rate": 0.0008808832554030808, "loss": 0.1255, "step": 388 }, { "epoch": 0.69, "grad_norm": 0.7766286134719849, "learning_rate": 0.0008802786513025068, "loss": 0.1613, "step": 389 }, { "epoch": 0.69, "grad_norm": 0.49581214785575867, "learning_rate": 0.0008796727252923403, "loss": 0.1346, "step": 390 }, { "epoch": 0.69, "grad_norm": 0.6148929595947266, "learning_rate": 0.0008790654794788768, "loss": 0.1426, "step": 391 }, { "epoch": 0.69, "grad_norm": 0.15860037505626678, "learning_rate": 0.0008784569159730007, "loss": 0.1382, "step": 392 }, { "epoch": 0.7, "grad_norm": 0.6793199777603149, "learning_rate": 0.0008778470368901761, "loss": 0.1398, "step": 393 }, { "epoch": 0.7, "grad_norm": 0.40314817428588867, "learning_rate": 0.0008772358443504404, "loss": 0.1428, "step": 394 }, { "epoch": 0.7, "grad_norm": 0.6403933167457581, "learning_rate": 0.0008766233404783974, "loss": 0.1556, "step": 395 }, { "epoch": 0.7, "grad_norm": 0.33554157614707947, "learning_rate": 0.0008760095274032083, "loss": 0.1439, "step": 396 }, { "epoch": 0.7, "grad_norm": 0.45690324902534485, "learning_rate": 0.000875394407258586, "loss": 0.1374, "step": 397 }, { "epoch": 0.7, "grad_norm": 0.0541120283305645, "learning_rate": 0.0008747779821827868, "loss": 0.1314, "step": 398 }, { "epoch": 0.71, "grad_norm": 0.6533159613609314, "learning_rate": 0.0008741602543186031, "loss": 0.169, "step": 399 }, { "epoch": 0.71, "grad_norm": 0.4919282793998718, "learning_rate": 0.0008735412258133561, "loss": 0.1569, "step": 400 }, { "epoch": 0.71, "grad_norm": 0.30325594544410706, "learning_rate": 0.0008729208988188881, "loss": 0.1471, "step": 401 }, { "epoch": 0.71, "grad_norm": 0.3497300148010254, "learning_rate": 0.0008722992754915554, "loss": 0.1457, "step": 402 }, { "epoch": 0.71, "grad_norm": 0.22892774641513824, "learning_rate": 0.0008716763579922203, "loss": 0.1334, "step": 403 }, { "epoch": 0.71, "grad_norm": 0.20050272345542908, "learning_rate": 0.0008710521484862439, "loss": 0.1446, "step": 404 }, { "epoch": 0.72, "grad_norm": 0.5029633641242981, "learning_rate": 0.0008704266491434787, "loss": 0.171, "step": 405 }, { "epoch": 0.72, "grad_norm": 0.2720576226711273, "learning_rate": 0.0008697998621382607, "loss": 0.144, "step": 406 }, { "epoch": 0.72, "grad_norm": 0.10961242765188217, "learning_rate": 0.000869171789649402, "loss": 0.1349, "step": 407 }, { "epoch": 0.72, "grad_norm": 0.13584192097187042, "learning_rate": 0.0008685424338601833, "loss": 0.1385, "step": 408 }, { "epoch": 0.72, "grad_norm": 0.6586437821388245, "learning_rate": 0.0008679117969583464, "loss": 0.1459, "step": 409 }, { "epoch": 0.73, "grad_norm": 0.24006032943725586, "learning_rate": 0.0008672798811360864, "loss": 0.1344, "step": 410 }, { "epoch": 0.73, "grad_norm": 0.1859387755393982, "learning_rate": 0.0008666466885900438, "loss": 0.1358, "step": 411 }, { "epoch": 0.73, "grad_norm": 0.5095134973526001, "learning_rate": 0.0008660122215212977, "loss": 0.1387, "step": 412 }, { "epoch": 0.73, "grad_norm": 0.1827729493379593, "learning_rate": 0.0008653764821353573, "loss": 0.1377, "step": 413 }, { "epoch": 0.73, "grad_norm": 0.14332665503025055, "learning_rate": 0.0008647394726421547, "loss": 0.131, "step": 414 }, { "epoch": 0.73, "grad_norm": 0.383101224899292, "learning_rate": 0.0008641011952560371, "loss": 0.146, "step": 415 }, { "epoch": 0.74, "grad_norm": 0.19079791009426117, "learning_rate": 0.000863461652195759, "loss": 0.1255, "step": 416 }, { "epoch": 0.74, "grad_norm": 0.49537310004234314, "learning_rate": 0.0008628208456844747, "loss": 0.1602, "step": 417 }, { "epoch": 0.74, "grad_norm": 0.5658069849014282, "learning_rate": 0.0008621787779497306, "loss": 0.1518, "step": 418 }, { "epoch": 0.74, "grad_norm": 0.2572256326675415, "learning_rate": 0.0008615354512234569, "loss": 0.1369, "step": 419 }, { "epoch": 0.74, "grad_norm": 1.1088945865631104, "learning_rate": 0.0008608908677419605, "loss": 0.1773, "step": 420 }, { "epoch": 0.74, "grad_norm": 0.35405099391937256, "learning_rate": 0.0008602450297459173, "loss": 0.1441, "step": 421 }, { "epoch": 0.75, "grad_norm": 0.39150556921958923, "learning_rate": 0.0008595979394803633, "loss": 0.147, "step": 422 }, { "epoch": 0.75, "grad_norm": 0.07459918409585953, "learning_rate": 0.0008589495991946885, "loss": 0.1338, "step": 423 }, { "epoch": 0.75, "grad_norm": 0.2999761402606964, "learning_rate": 0.0008583000111426276, "loss": 0.1357, "step": 424 }, { "epoch": 0.75, "grad_norm": 0.28417065739631653, "learning_rate": 0.0008576491775822525, "loss": 0.1411, "step": 425 }, { "epoch": 0.75, "grad_norm": 0.32605019211769104, "learning_rate": 0.0008569971007759657, "loss": 0.1329, "step": 426 }, { "epoch": 0.75, "eval_loss": 0.13750587403774261, "eval_runtime": 15.1749, "eval_samples_per_second": 31.433, "eval_steps_per_second": 7.908, "step": 426 }, { "epoch": 0.76, "grad_norm": 0.047430120408535004, "learning_rate": 0.0008563437829904903, "loss": 0.1373, "step": 427 }, { "epoch": 0.76, "grad_norm": 0.4616542160511017, "learning_rate": 0.0008556892264968639, "loss": 0.1534, "step": 428 }, { "epoch": 0.76, "grad_norm": 0.12317585945129395, "learning_rate": 0.0008550334335704297, "loss": 0.1338, "step": 429 }, { "epoch": 0.76, "grad_norm": 0.39604276418685913, "learning_rate": 0.0008543764064908295, "loss": 0.1434, "step": 430 }, { "epoch": 0.76, "grad_norm": 0.3490678369998932, "learning_rate": 0.0008537181475419944, "loss": 0.1365, "step": 431 }, { "epoch": 0.76, "grad_norm": 0.15001270174980164, "learning_rate": 0.0008530586590121383, "loss": 0.1358, "step": 432 }, { "epoch": 0.77, "grad_norm": 0.33340635895729065, "learning_rate": 0.0008523979431937492, "loss": 0.1367, "step": 433 }, { "epoch": 0.77, "grad_norm": 0.06029750779271126, "learning_rate": 0.0008517360023835809, "loss": 0.1366, "step": 434 }, { "epoch": 0.77, "grad_norm": 0.07978738099336624, "learning_rate": 0.0008510728388826463, "loss": 0.1345, "step": 435 }, { "epoch": 0.77, "grad_norm": 0.27599036693573, "learning_rate": 0.0008504084549962079, "loss": 0.1447, "step": 436 }, { "epoch": 0.77, "grad_norm": 0.13302059471607208, "learning_rate": 0.0008497428530337706, "loss": 0.1407, "step": 437 }, { "epoch": 0.77, "grad_norm": 0.20869582891464233, "learning_rate": 0.0008490760353090737, "loss": 0.1374, "step": 438 }, { "epoch": 0.78, "grad_norm": 0.10881117731332779, "learning_rate": 0.0008484080041400825, "loss": 0.1429, "step": 439 }, { "epoch": 0.78, "grad_norm": 0.20344361662864685, "learning_rate": 0.0008477387618489807, "loss": 0.139, "step": 440 }, { "epoch": 0.78, "grad_norm": 0.07153432071208954, "learning_rate": 0.0008470683107621615, "loss": 0.1315, "step": 441 }, { "epoch": 0.78, "grad_norm": 0.08688751608133316, "learning_rate": 0.0008463966532102207, "loss": 0.1346, "step": 442 }, { "epoch": 0.78, "grad_norm": 0.06495650112628937, "learning_rate": 0.0008457237915279476, "loss": 0.1307, "step": 443 }, { "epoch": 0.79, "grad_norm": 0.1892390102148056, "learning_rate": 0.0008450497280543173, "loss": 0.12, "step": 444 }, { "epoch": 0.79, "grad_norm": 0.2579623758792877, "learning_rate": 0.0008443744651324827, "loss": 0.1531, "step": 445 }, { "epoch": 0.79, "grad_norm": 0.149379700422287, "learning_rate": 0.000843698005109766, "loss": 0.1385, "step": 446 }, { "epoch": 0.79, "grad_norm": 0.19281132519245148, "learning_rate": 0.0008430203503376506, "loss": 0.1033, "step": 447 }, { "epoch": 0.79, "grad_norm": 0.33208444714546204, "learning_rate": 0.0008423415031717733, "loss": 0.1525, "step": 448 }, { "epoch": 0.79, "grad_norm": 0.15149784088134766, "learning_rate": 0.0008416614659719157, "loss": 0.1282, "step": 449 }, { "epoch": 0.8, "grad_norm": 0.24646438658237457, "learning_rate": 0.0008409802411019962, "loss": 0.1393, "step": 450 }, { "epoch": 0.8, "grad_norm": 0.2505553662776947, "learning_rate": 0.000840297830930062, "loss": 0.1453, "step": 451 }, { "epoch": 0.8, "grad_norm": 0.1632508784532547, "learning_rate": 0.0008396142378282799, "loss": 0.1274, "step": 452 }, { "epoch": 0.8, "grad_norm": 0.12370573729276657, "learning_rate": 0.0008389294641729292, "loss": 0.1201, "step": 453 }, { "epoch": 0.8, "grad_norm": 0.08046772330999374, "learning_rate": 0.0008382435123443934, "loss": 0.1263, "step": 454 }, { "epoch": 0.8, "grad_norm": 0.19015488028526306, "learning_rate": 0.0008375563847271506, "loss": 0.1318, "step": 455 }, { "epoch": 0.81, "grad_norm": 0.3562954366207123, "learning_rate": 0.0008368680837097669, "loss": 0.132, "step": 456 }, { "epoch": 0.81, "grad_norm": 0.06315189599990845, "learning_rate": 0.000836178611684887, "loss": 0.1113, "step": 457 }, { "epoch": 0.81, "grad_norm": 0.43667125701904297, "learning_rate": 0.0008354879710492264, "loss": 0.1908, "step": 458 }, { "epoch": 0.81, "grad_norm": 0.0708879753947258, "learning_rate": 0.0008347961642035624, "loss": 0.1399, "step": 459 }, { "epoch": 0.81, "grad_norm": 0.04855835437774658, "learning_rate": 0.0008341031935527267, "loss": 0.1258, "step": 460 }, { "epoch": 0.82, "grad_norm": 0.1364990919828415, "learning_rate": 0.0008334090615055965, "loss": 0.1344, "step": 461 }, { "epoch": 0.82, "grad_norm": 0.08166524022817612, "learning_rate": 0.0008327137704750862, "loss": 0.134, "step": 462 }, { "epoch": 0.82, "grad_norm": 0.09308458864688873, "learning_rate": 0.0008320173228781389, "loss": 0.1507, "step": 463 }, { "epoch": 0.82, "grad_norm": 0.07796576619148254, "learning_rate": 0.000831319721135718, "loss": 0.1284, "step": 464 }, { "epoch": 0.82, "grad_norm": 0.12168626487255096, "learning_rate": 0.0008306209676727993, "loss": 0.148, "step": 465 }, { "epoch": 0.82, "grad_norm": 0.18862847983837128, "learning_rate": 0.000829921064918362, "loss": 0.1229, "step": 466 }, { "epoch": 0.83, "grad_norm": 0.23615515232086182, "learning_rate": 0.00082922001530538, "loss": 0.1322, "step": 467 }, { "epoch": 0.83, "grad_norm": 0.34108766913414, "learning_rate": 0.0008285178212708142, "loss": 0.1338, "step": 468 }, { "epoch": 0.83, "grad_norm": 0.39579400420188904, "learning_rate": 0.0008278144852556042, "loss": 0.1341, "step": 469 }, { "epoch": 0.83, "grad_norm": 0.2620592713356018, "learning_rate": 0.0008271100097046585, "loss": 0.1395, "step": 470 }, { "epoch": 0.83, "grad_norm": 0.08778171986341476, "learning_rate": 0.0008264043970668469, "loss": 0.1328, "step": 471 }, { "epoch": 0.84, "grad_norm": 0.6086364388465881, "learning_rate": 0.0008256976497949924, "loss": 0.1271, "step": 472 }, { "epoch": 0.84, "grad_norm": 0.08982394635677338, "learning_rate": 0.0008249897703458619, "loss": 0.1346, "step": 473 }, { "epoch": 0.84, "grad_norm": 0.054080091416835785, "learning_rate": 0.0008242807611801578, "loss": 0.1218, "step": 474 }, { "epoch": 0.84, "grad_norm": 0.5981457829475403, "learning_rate": 0.0008235706247625098, "loss": 0.1715, "step": 475 }, { "epoch": 0.84, "grad_norm": 0.9139420986175537, "learning_rate": 0.0008228593635614659, "loss": 0.1983, "step": 476 }, { "epoch": 0.84, "grad_norm": 0.05938498303294182, "learning_rate": 0.0008221469800494841, "loss": 0.1308, "step": 477 }, { "epoch": 0.85, "grad_norm": 0.11526026576757431, "learning_rate": 0.0008214334767029239, "loss": 0.1422, "step": 478 }, { "epoch": 0.85, "grad_norm": 0.3049907386302948, "learning_rate": 0.0008207188560020373, "loss": 0.1419, "step": 479 }, { "epoch": 0.85, "grad_norm": 0.04782035946846008, "learning_rate": 0.0008200031204309604, "loss": 0.138, "step": 480 }, { "epoch": 0.85, "grad_norm": 0.12950918078422546, "learning_rate": 0.000819286272477705, "loss": 0.1315, "step": 481 }, { "epoch": 0.85, "grad_norm": 0.0429329015314579, "learning_rate": 0.0008185683146341496, "loss": 0.1354, "step": 482 }, { "epoch": 0.85, "grad_norm": 0.4792588949203491, "learning_rate": 0.0008178492493960308, "loss": 0.1476, "step": 483 }, { "epoch": 0.86, "grad_norm": 0.19784927368164062, "learning_rate": 0.0008171290792629346, "loss": 0.1394, "step": 484 }, { "epoch": 0.86, "grad_norm": 0.1172945499420166, "learning_rate": 0.000816407806738288, "loss": 0.1302, "step": 485 }, { "epoch": 0.86, "grad_norm": 0.3732689917087555, "learning_rate": 0.0008156854343293501, "loss": 0.1416, "step": 486 }, { "epoch": 0.86, "grad_norm": 0.5152392983436584, "learning_rate": 0.0008149619645472031, "loss": 0.1403, "step": 487 }, { "epoch": 0.86, "grad_norm": 0.15429601073265076, "learning_rate": 0.000814237399906744, "loss": 0.1322, "step": 488 }, { "epoch": 0.87, "grad_norm": 1.0002127885818481, "learning_rate": 0.0008135117429266756, "loss": 0.1303, "step": 489 }, { "epoch": 0.87, "grad_norm": 0.7232715487480164, "learning_rate": 0.0008127849961294984, "loss": 0.143, "step": 490 }, { "epoch": 0.87, "grad_norm": 0.13510456681251526, "learning_rate": 0.0008120571620415006, "loss": 0.1536, "step": 491 }, { "epoch": 0.87, "grad_norm": 0.5168789625167847, "learning_rate": 0.0008113282431927503, "loss": 0.1312, "step": 492 }, { "epoch": 0.87, "grad_norm": 0.7039850950241089, "learning_rate": 0.000810598242117086, "loss": 0.118, "step": 493 }, { "epoch": 0.87, "grad_norm": 1.5126641988754272, "learning_rate": 0.0008098671613521089, "loss": 0.2343, "step": 494 }, { "epoch": 0.88, "grad_norm": 0.6958308815956116, "learning_rate": 0.0008091350034391731, "loss": 0.1648, "step": 495 }, { "epoch": 0.88, "grad_norm": 6.979303359985352, "learning_rate": 0.0008084017709233766, "loss": 0.2261, "step": 496 }, { "epoch": 0.88, "grad_norm": 0.3389752507209778, "learning_rate": 0.0008076674663535537, "loss": 0.146, "step": 497 }, { "epoch": 0.88, "grad_norm": 0.19990071654319763, "learning_rate": 0.0008069320922822643, "loss": 0.1429, "step": 498 }, { "epoch": 0.88, "grad_norm": 0.33689868450164795, "learning_rate": 0.0008061956512657871, "loss": 0.147, "step": 499 }, { "epoch": 0.88, "grad_norm": 0.09925112873315811, "learning_rate": 0.000805458145864109, "loss": 0.1342, "step": 500 }, { "epoch": 0.89, "grad_norm": 1.961702585220337, "learning_rate": 0.0008047195786409172, "loss": 0.1361, "step": 501 }, { "epoch": 0.89, "grad_norm": 0.4342229962348938, "learning_rate": 0.0008039799521635895, "loss": 0.1485, "step": 502 }, { "epoch": 0.89, "grad_norm": 0.1798858642578125, "learning_rate": 0.0008032392690031867, "loss": 0.1314, "step": 503 }, { "epoch": 0.89, "grad_norm": 1.3653756380081177, "learning_rate": 0.0008024975317344421, "loss": 0.1388, "step": 504 }, { "epoch": 0.89, "grad_norm": 9.677605628967285, "learning_rate": 0.0008017547429357531, "loss": 0.4186, "step": 505 }, { "epoch": 0.9, "grad_norm": 8.348475456237793, "learning_rate": 0.0008010109051891731, "loss": 0.3806, "step": 506 }, { "epoch": 0.9, "grad_norm": 35.19770050048828, "learning_rate": 0.0008002660210804011, "loss": 3.6145, "step": 507 }, { "epoch": 0.9, "grad_norm": 9.18663501739502, "learning_rate": 0.0007995200931987743, "loss": 0.6162, "step": 508 }, { "epoch": 0.9, "grad_norm": 0.05997322499752045, "learning_rate": 0.0007987731241372571, "loss": 0.1129, "step": 509 }, { "epoch": 0.9, "grad_norm": 0.41408172249794006, "learning_rate": 0.000798025116492434, "loss": 0.1512, "step": 510 }, { "epoch": 0.9, "grad_norm": 0.4445393979549408, "learning_rate": 0.0007972760728644996, "loss": 0.1463, "step": 511 }, { "epoch": 0.91, "grad_norm": 0.19678063690662384, "learning_rate": 0.0007965259958572495, "loss": 0.1386, "step": 512 }, { "epoch": 0.91, "grad_norm": 0.45497119426727295, "learning_rate": 0.0007957748880780721, "loss": 0.1373, "step": 513 }, { "epoch": 0.91, "grad_norm": 0.6455509066581726, "learning_rate": 0.0007950227521379381, "loss": 0.1584, "step": 514 }, { "epoch": 0.91, "grad_norm": 0.3793765604496002, "learning_rate": 0.0007942695906513929, "loss": 0.1236, "step": 515 }, { "epoch": 0.91, "grad_norm": 0.20562775433063507, "learning_rate": 0.0007935154062365467, "loss": 0.1364, "step": 516 }, { "epoch": 0.91, "grad_norm": 1.3131325244903564, "learning_rate": 0.0007927602015150655, "loss": 0.1556, "step": 517 }, { "epoch": 0.92, "grad_norm": 0.1705670803785324, "learning_rate": 0.0007920039791121617, "loss": 0.1372, "step": 518 }, { "epoch": 0.92, "grad_norm": 6.6207499504089355, "learning_rate": 0.0007912467416565861, "loss": 0.22, "step": 519 }, { "epoch": 0.92, "grad_norm": 0.34343230724334717, "learning_rate": 0.0007904884917806173, "loss": 0.1453, "step": 520 }, { "epoch": 0.92, "grad_norm": 0.4290754497051239, "learning_rate": 0.0007897292321200537, "loss": 0.1177, "step": 521 }, { "epoch": 0.92, "grad_norm": 0.24469922482967377, "learning_rate": 0.0007889689653142036, "loss": 0.1369, "step": 522 }, { "epoch": 0.93, "grad_norm": 0.5307168960571289, "learning_rate": 0.0007882076940058763, "loss": 0.1542, "step": 523 }, { "epoch": 0.93, "grad_norm": 0.13802866637706757, "learning_rate": 0.000787445420841373, "loss": 0.1372, "step": 524 }, { "epoch": 0.93, "grad_norm": 0.36055922508239746, "learning_rate": 0.0007866821484704776, "loss": 0.1413, "step": 525 }, { "epoch": 0.93, "grad_norm": 0.36655113101005554, "learning_rate": 0.0007859178795464472, "loss": 0.1438, "step": 526 }, { "epoch": 0.93, "grad_norm": 0.6237390637397766, "learning_rate": 0.0007851526167260034, "loss": 0.1382, "step": 527 }, { "epoch": 0.93, "grad_norm": 0.42217007279396057, "learning_rate": 0.0007843863626693221, "loss": 0.1408, "step": 528 }, { "epoch": 0.94, "grad_norm": 24.023250579833984, "learning_rate": 0.0007836191200400256, "loss": 0.1517, "step": 529 }, { "epoch": 0.94, "grad_norm": 0.31599146127700806, "learning_rate": 0.0007828508915051723, "loss": 0.1353, "step": 530 }, { "epoch": 0.94, "grad_norm": 0.6795622706413269, "learning_rate": 0.0007820816797352479, "loss": 0.1515, "step": 531 }, { "epoch": 0.94, "grad_norm": 0.37493640184402466, "learning_rate": 0.0007813114874041557, "loss": 0.141, "step": 532 }, { "epoch": 0.94, "grad_norm": 1.7365546226501465, "learning_rate": 0.0007805403171892079, "loss": 0.1347, "step": 533 }, { "epoch": 0.94, "grad_norm": 18.393390655517578, "learning_rate": 0.000779768171771116, "loss": 0.1753, "step": 534 }, { "epoch": 0.95, "grad_norm": 2.2978413105010986, "learning_rate": 0.0007789950538339812, "loss": 0.1418, "step": 535 }, { "epoch": 0.95, "grad_norm": 0.495151162147522, "learning_rate": 0.0007782209660652854, "loss": 0.146, "step": 536 }, { "epoch": 0.95, "grad_norm": 7.705572605133057, "learning_rate": 0.0007774459111558821, "loss": 0.2042, "step": 537 }, { "epoch": 0.95, "grad_norm": 0.6036086678504944, "learning_rate": 0.0007766698917999862, "loss": 0.1695, "step": 538 }, { "epoch": 0.95, "grad_norm": 127.21215057373047, "learning_rate": 0.0007758929106951656, "loss": 18.5136, "step": 539 }, { "epoch": 0.96, "grad_norm": 40.58448791503906, "learning_rate": 0.0007751149705423312, "loss": 0.5973, "step": 540 }, { "epoch": 0.96, "grad_norm": 0.6296218633651733, "learning_rate": 0.0007743360740457278, "loss": 0.1849, "step": 541 }, { "epoch": 0.96, "grad_norm": 0.4533160924911499, "learning_rate": 0.0007735562239129247, "loss": 0.1464, "step": 542 }, { "epoch": 0.96, "grad_norm": 0.2379036247730255, "learning_rate": 0.0007727754228548058, "loss": 0.1267, "step": 543 }, { "epoch": 0.96, "grad_norm": 0.8904889225959778, "learning_rate": 0.000771993673585561, "loss": 0.2181, "step": 544 }, { "epoch": 0.96, "grad_norm": 0.8934443593025208, "learning_rate": 0.0007712109788226762, "loss": 0.2158, "step": 545 }, { "epoch": 0.97, "grad_norm": 0.3368353545665741, "learning_rate": 0.0007704273412869238, "loss": 0.1489, "step": 546 }, { "epoch": 0.97, "grad_norm": 0.2570180594921112, "learning_rate": 0.0007696427637023537, "loss": 0.144, "step": 547 }, { "epoch": 0.97, "grad_norm": 2.865034580230713, "learning_rate": 0.0007688572487962834, "loss": 0.1664, "step": 548 }, { "epoch": 0.97, "grad_norm": 0.4369525611400604, "learning_rate": 0.0007680707992992888, "loss": 0.1777, "step": 549 }, { "epoch": 0.97, "grad_norm": 0.2545509934425354, "learning_rate": 0.0007672834179451942, "loss": 0.1536, "step": 550 }, { "epoch": 0.97, "grad_norm": 0.14455465972423553, "learning_rate": 0.0007664951074710638, "loss": 0.1256, "step": 551 }, { "epoch": 0.98, "grad_norm": 0.16001886129379272, "learning_rate": 0.0007657058706171911, "loss": 0.1356, "step": 552 }, { "epoch": 0.98, "grad_norm": 0.2537885308265686, "learning_rate": 0.0007649157101270903, "loss": 0.1393, "step": 553 }, { "epoch": 0.98, "grad_norm": 0.33060047030448914, "learning_rate": 0.0007641246287474854, "loss": 0.148, "step": 554 }, { "epoch": 0.98, "grad_norm": 1.691941499710083, "learning_rate": 0.0007633326292283028, "loss": 0.1764, "step": 555 }, { "epoch": 0.98, "grad_norm": 0.20472805202007294, "learning_rate": 0.0007625397143226595, "loss": 0.1424, "step": 556 }, { "epoch": 0.99, "grad_norm": 0.7124485969543457, "learning_rate": 0.0007617458867868553, "loss": 0.1482, "step": 557 }, { "epoch": 0.99, "grad_norm": 0.09631184488534927, "learning_rate": 0.0007609511493803615, "loss": 0.1392, "step": 558 }, { "epoch": 0.99, "grad_norm": 0.20814809203147888, "learning_rate": 0.0007601555048658133, "loss": 0.1384, "step": 559 }, { "epoch": 0.99, "grad_norm": 0.19566737115383148, "learning_rate": 0.0007593589560089984, "loss": 0.1394, "step": 560 }, { "epoch": 0.99, "grad_norm": 0.13406091928482056, "learning_rate": 0.0007585615055788484, "loss": 0.1389, "step": 561 }, { "epoch": 0.99, "grad_norm": 0.07635807991027832, "learning_rate": 0.0007577631563474291, "loss": 0.1376, "step": 562 }, { "epoch": 1.0, "grad_norm": 0.11265091598033905, "learning_rate": 0.0007569639110899302, "loss": 0.1395, "step": 563 }, { "epoch": 1.0, "grad_norm": 0.31152746081352234, "learning_rate": 0.0007561637725846567, "loss": 0.1407, "step": 564 }, { "epoch": 1.0, "grad_norm": 0.13474373519420624, "learning_rate": 0.0007553627436130183, "loss": 0.1386, "step": 565 } ], "logging_steps": 1, "max_steps": 1695, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 565, "total_flos": 5.169945694856806e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }