diff --git "a/checkpoint-1695/trainer_state.json" "b/checkpoint-1695/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1695/trainer_state.json" @@ -0,0 +1,11982 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9986731534719153, + "eval_steps": 142, + "global_step": 1695, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.705155372619629, + "learning_rate": 0.0001, + "loss": 3.3182, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 3.3362529277801514, + "eval_runtime": 14.4366, + "eval_samples_per_second": 33.041, + "eval_steps_per_second": 8.312, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.644563674926758, + "learning_rate": 0.0002, + "loss": 3.2788, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 4.3825764656066895, + "learning_rate": 0.0003, + "loss": 2.9231, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 2.904296636581421, + "learning_rate": 0.0004, + "loss": 1.1824, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 1.6988284587860107, + "learning_rate": 0.0005, + "loss": 0.3472, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 0.5742101073265076, + "learning_rate": 0.0006, + "loss": 0.1478, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 0.6511944532394409, + "learning_rate": 0.0007, + "loss": 0.1532, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 2.305083751678467, + "learning_rate": 0.0008, + "loss": 0.2397, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 4.7435078620910645, + "learning_rate": 0.0009000000000000001, + "loss": 0.434, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 0.6199779510498047, + "learning_rate": 0.001, + "loss": 0.1743, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 0.14406554400920868, + "learning_rate": 0.0009999991309598973, + "loss": 0.1404, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.1965201050043106, + "learning_rate": 0.0009999965238426103, + "loss": 0.1418, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 26.520109176635742, + "learning_rate": 0.0009999921786572016, + "loss": 0.2689, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 0.0870603695511818, + "learning_rate": 0.0009999860954187755, + "loss": 0.1338, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 0.05200817808508873, + "learning_rate": 0.0009999782741484788, + "loss": 0.1308, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 0.2145700752735138, + "learning_rate": 0.0009999687148734995, + "loss": 0.1375, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 0.19921083748340607, + "learning_rate": 0.0009999574176270667, + "loss": 0.1388, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 1.193419337272644, + "learning_rate": 0.0009999443824484518, + "loss": 0.1978, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.4399484395980835, + "learning_rate": 0.0009999296093829671, + "loss": 0.1518, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 44.88853073120117, + "learning_rate": 0.0009999130984819661, + "loss": 0.9033, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 0.3220385015010834, + "learning_rate": 0.0009998948498028434, + "loss": 0.1234, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 0.5420748591423035, + "learning_rate": 0.0009998748634090344, + "loss": 0.1602, + "step": 22 + }, + { + "epoch": 0.04, + "grad_norm": 0.5249865651130676, + "learning_rate": 0.0009998531393700149, + "loss": 0.1538, + "step": 23 + }, + { + "epoch": 0.04, + "grad_norm": 0.056158341467380524, + "learning_rate": 0.000999829677761301, + "loss": 0.1374, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.19818872213363647, + "learning_rate": 0.0009998044786644492, + "loss": 0.1413, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 0.27901849150657654, + "learning_rate": 0.0009997775421670557, + "loss": 0.1395, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 0.22768354415893555, + "learning_rate": 0.0009997488683627558, + "loss": 0.1241, + "step": 27 + }, + { + "epoch": 0.05, + "grad_norm": 0.14878959953784943, + "learning_rate": 0.0009997184573512245, + "loss": 0.1243, + "step": 28 + }, + { + "epoch": 0.05, + "grad_norm": 1.0589066743850708, + "learning_rate": 0.000999686309238175, + "loss": 0.2499, + "step": 29 + }, + { + "epoch": 0.05, + "grad_norm": 0.11455405503511429, + "learning_rate": 0.00099965242413536, + "loss": 0.1254, + "step": 30 + }, + { + "epoch": 0.05, + "grad_norm": 0.16566088795661926, + "learning_rate": 0.000999616802160569, + "loss": 0.1416, + "step": 31 + }, + { + "epoch": 0.06, + "grad_norm": 1.3691716194152832, + "learning_rate": 0.0009995794434376297, + "loss": 0.1465, + "step": 32 + }, + { + "epoch": 0.06, + "grad_norm": 0.09674070030450821, + "learning_rate": 0.000999540348096407, + "loss": 0.1373, + "step": 33 + }, + { + "epoch": 0.06, + "grad_norm": 0.5034632086753845, + "learning_rate": 0.000999499516272803, + "loss": 0.1471, + "step": 34 + }, + { + "epoch": 0.06, + "grad_norm": 0.26572930812835693, + "learning_rate": 0.0009994569481087553, + "loss": 0.1424, + "step": 35 + }, + { + "epoch": 0.06, + "grad_norm": 0.20631802082061768, + "learning_rate": 0.0009994126437522376, + "loss": 0.1449, + "step": 36 + }, + { + "epoch": 0.07, + "grad_norm": 0.11268749833106995, + "learning_rate": 0.0009993666033572591, + "loss": 0.1403, + "step": 37 + }, + { + "epoch": 0.07, + "grad_norm": 0.6610996723175049, + "learning_rate": 0.0009993188270838635, + "loss": 0.1424, + "step": 38 + }, + { + "epoch": 0.07, + "grad_norm": 98.93838500976562, + "learning_rate": 0.0009992693150981291, + "loss": 2.775, + "step": 39 + }, + { + "epoch": 0.07, + "grad_norm": 32.53168869018555, + "learning_rate": 0.0009992180675721671, + "loss": 0.6932, + "step": 40 + }, + { + "epoch": 0.07, + "grad_norm": 54.8778076171875, + "learning_rate": 0.0009991650846841226, + "loss": 5.7008, + "step": 41 + }, + { + "epoch": 0.07, + "grad_norm": 2.0524775981903076, + "learning_rate": 0.000999110366618172, + "loss": 0.1623, + "step": 42 + }, + { + "epoch": 0.08, + "grad_norm": 0.404278427362442, + "learning_rate": 0.0009990539135645246, + "loss": 0.1427, + "step": 43 + }, + { + "epoch": 0.08, + "grad_norm": 1.7963409423828125, + "learning_rate": 0.0009989957257194198, + "loss": 0.174, + "step": 44 + }, + { + "epoch": 0.08, + "grad_norm": 0.11620022356510162, + "learning_rate": 0.0009989358032851284, + "loss": 0.1339, + "step": 45 + }, + { + "epoch": 0.08, + "grad_norm": 0.5025681853294373, + "learning_rate": 0.00099887414646995, + "loss": 0.1558, + "step": 46 + }, + { + "epoch": 0.08, + "grad_norm": 78.1165771484375, + "learning_rate": 0.0009988107554882138, + "loss": 2.2938, + "step": 47 + }, + { + "epoch": 0.08, + "grad_norm": 0.08389786630868912, + "learning_rate": 0.0009987456305602768, + "loss": 0.1409, + "step": 48 + }, + { + "epoch": 0.09, + "grad_norm": 7.123101711273193, + "learning_rate": 0.0009986787719125242, + "loss": 0.1524, + "step": 49 + }, + { + "epoch": 0.09, + "grad_norm": 0.5341290235519409, + "learning_rate": 0.0009986101797773666, + "loss": 0.1598, + "step": 50 + }, + { + "epoch": 0.09, + "grad_norm": 0.05239284038543701, + "learning_rate": 0.000998539854393242, + "loss": 0.1386, + "step": 51 + }, + { + "epoch": 0.09, + "grad_norm": 0.0722254291176796, + "learning_rate": 0.0009984677960046123, + "loss": 0.1385, + "step": 52 + }, + { + "epoch": 0.09, + "grad_norm": 0.11535236239433289, + "learning_rate": 0.000998394004861964, + "loss": 0.1369, + "step": 53 + }, + { + "epoch": 0.1, + "grad_norm": 0.7584894299507141, + "learning_rate": 0.0009983184812218072, + "loss": 0.108, + "step": 54 + }, + { + "epoch": 0.1, + "grad_norm": 0.8361538052558899, + "learning_rate": 0.000998241225346674, + "loss": 0.1703, + "step": 55 + }, + { + "epoch": 0.1, + "grad_norm": 0.37683162093162537, + "learning_rate": 0.0009981622375051184, + "loss": 0.1368, + "step": 56 + }, + { + "epoch": 0.1, + "grad_norm": 0.5335961580276489, + "learning_rate": 0.0009980815179717144, + "loss": 0.1559, + "step": 57 + }, + { + "epoch": 0.1, + "grad_norm": 0.2806299328804016, + "learning_rate": 0.0009979990670270565, + "loss": 0.1397, + "step": 58 + }, + { + "epoch": 0.1, + "grad_norm": 0.4967437982559204, + "learning_rate": 0.0009979148849577574, + "loss": 0.1543, + "step": 59 + }, + { + "epoch": 0.11, + "grad_norm": 0.10350017994642258, + "learning_rate": 0.0009978289720564471, + "loss": 0.1367, + "step": 60 + }, + { + "epoch": 0.11, + "grad_norm": 78.35698699951172, + "learning_rate": 0.0009977413286217727, + "loss": 2.2474, + "step": 61 + }, + { + "epoch": 0.11, + "grad_norm": 0.15264186263084412, + "learning_rate": 0.0009976519549583973, + "loss": 0.1311, + "step": 62 + }, + { + "epoch": 0.11, + "grad_norm": 0.31865784525871277, + "learning_rate": 0.0009975608513769975, + "loss": 0.1407, + "step": 63 + }, + { + "epoch": 0.11, + "grad_norm": 0.32891547679901123, + "learning_rate": 0.0009974680181942645, + "loss": 0.1423, + "step": 64 + }, + { + "epoch": 0.11, + "grad_norm": 0.15653717517852783, + "learning_rate": 0.0009973734557329008, + "loss": 0.1365, + "step": 65 + }, + { + "epoch": 0.12, + "grad_norm": 0.3237778842449188, + "learning_rate": 0.0009972771643216212, + "loss": 0.1407, + "step": 66 + }, + { + "epoch": 0.12, + "grad_norm": 0.13634416460990906, + "learning_rate": 0.0009971791442951496, + "loss": 0.1378, + "step": 67 + }, + { + "epoch": 0.12, + "grad_norm": 0.3488883376121521, + "learning_rate": 0.0009970793959942197, + "loss": 0.1429, + "step": 68 + }, + { + "epoch": 0.12, + "grad_norm": 0.5150622129440308, + "learning_rate": 0.0009969779197655725, + "loss": 0.1492, + "step": 69 + }, + { + "epoch": 0.12, + "grad_norm": 0.3482552468776703, + "learning_rate": 0.0009968747159619555, + "loss": 0.1415, + "step": 70 + }, + { + "epoch": 0.13, + "grad_norm": 0.22551549971103668, + "learning_rate": 0.000996769784942122, + "loss": 0.1418, + "step": 71 + }, + { + "epoch": 0.13, + "grad_norm": 0.20759086310863495, + "learning_rate": 0.0009966631270708287, + "loss": 0.1366, + "step": 72 + }, + { + "epoch": 0.13, + "grad_norm": 13.050313949584961, + "learning_rate": 0.0009965547427188356, + "loss": 0.1375, + "step": 73 + }, + { + "epoch": 0.13, + "grad_norm": 0.18372055888175964, + "learning_rate": 0.0009964446322629043, + "loss": 0.1285, + "step": 74 + }, + { + "epoch": 0.13, + "grad_norm": 0.4404817819595337, + "learning_rate": 0.000996332796085796, + "loss": 0.1501, + "step": 75 + }, + { + "epoch": 0.13, + "grad_norm": 1.269240379333496, + "learning_rate": 0.0009962192345762716, + "loss": 0.1346, + "step": 76 + }, + { + "epoch": 0.14, + "grad_norm": 32.20164108276367, + "learning_rate": 0.0009961039481290888, + "loss": 0.3348, + "step": 77 + }, + { + "epoch": 0.14, + "grad_norm": 82.92976379394531, + "learning_rate": 0.0009959869371450021, + "loss": 5.8309, + "step": 78 + }, + { + "epoch": 0.14, + "grad_norm": 0.3416314721107483, + "learning_rate": 0.0009958682020307602, + "loss": 0.1418, + "step": 79 + }, + { + "epoch": 0.14, + "grad_norm": 31.961870193481445, + "learning_rate": 0.0009957477431991053, + "loss": 0.1899, + "step": 80 + }, + { + "epoch": 0.14, + "grad_norm": 38.58375930786133, + "learning_rate": 0.000995625561068772, + "loss": 0.5641, + "step": 81 + }, + { + "epoch": 0.15, + "grad_norm": 0.32622194290161133, + "learning_rate": 0.0009955016560644846, + "loss": 0.1144, + "step": 82 + }, + { + "epoch": 0.15, + "grad_norm": 6.264970779418945, + "learning_rate": 0.0009953760286169572, + "loss": 0.4788, + "step": 83 + }, + { + "epoch": 0.15, + "grad_norm": 0.07168668508529663, + "learning_rate": 0.0009952486791628904, + "loss": 0.1326, + "step": 84 + }, + { + "epoch": 0.15, + "grad_norm": 35.18340301513672, + "learning_rate": 0.000995119608144972, + "loss": 0.3884, + "step": 85 + }, + { + "epoch": 0.15, + "grad_norm": 0.03896519914269447, + "learning_rate": 0.000994988816011873, + "loss": 0.1249, + "step": 86 + }, + { + "epoch": 0.15, + "grad_norm": 14.499520301818848, + "learning_rate": 0.000994856303218248, + "loss": 0.3756, + "step": 87 + }, + { + "epoch": 0.16, + "grad_norm": 0.3134947419166565, + "learning_rate": 0.000994722070224733, + "loss": 0.1539, + "step": 88 + }, + { + "epoch": 0.16, + "grad_norm": 117.39696502685547, + "learning_rate": 0.000994586117497943, + "loss": 0.5885, + "step": 89 + }, + { + "epoch": 0.16, + "grad_norm": 37.93465805053711, + "learning_rate": 0.0009944484455104716, + "loss": 0.7709, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 236.63330078125, + "learning_rate": 0.0009943090547408888, + "loss": 6.0182, + "step": 91 + }, + { + "epoch": 0.16, + "grad_norm": 1.1088515520095825, + "learning_rate": 0.0009941679456737394, + "loss": 0.1931, + "step": 92 + }, + { + "epoch": 0.16, + "grad_norm": 0.11310256272554398, + "learning_rate": 0.0009940251187995411, + "loss": 0.1293, + "step": 93 + }, + { + "epoch": 0.17, + "grad_norm": 0.6143047213554382, + "learning_rate": 0.0009938805746147828, + "loss": 0.2364, + "step": 94 + }, + { + "epoch": 0.17, + "grad_norm": 0.2461577206850052, + "learning_rate": 0.0009937343136219232, + "loss": 0.1504, + "step": 95 + }, + { + "epoch": 0.17, + "grad_norm": 97.17162322998047, + "learning_rate": 0.0009935863363293895, + "loss": 5.764, + "step": 96 + }, + { + "epoch": 0.17, + "grad_norm": 0.5417380928993225, + "learning_rate": 0.000993436643251574, + "loss": 0.1576, + "step": 97 + }, + { + "epoch": 0.17, + "grad_norm": 0.2737255096435547, + "learning_rate": 0.0009932852349088341, + "loss": 0.1437, + "step": 98 + }, + { + "epoch": 0.18, + "grad_norm": 138.00778198242188, + "learning_rate": 0.0009931321118274896, + "loss": 4.1331, + "step": 99 + }, + { + "epoch": 0.18, + "grad_norm": 46.4688606262207, + "learning_rate": 0.0009929772745398205, + "loss": 0.6178, + "step": 100 + }, + { + "epoch": 0.18, + "grad_norm": 0.49907386302948, + "learning_rate": 0.0009928207235840663, + "loss": 0.1445, + "step": 101 + }, + { + "epoch": 0.18, + "grad_norm": 0.33814460039138794, + "learning_rate": 0.0009926624595044233, + "loss": 0.139, + "step": 102 + }, + { + "epoch": 0.18, + "grad_norm": 0.3241071403026581, + "learning_rate": 0.0009925024828510427, + "loss": 0.1404, + "step": 103 + }, + { + "epoch": 0.18, + "grad_norm": 78.4036865234375, + "learning_rate": 0.000992340794180029, + "loss": 1.2663, + "step": 104 + }, + { + "epoch": 0.19, + "grad_norm": 1.113776445388794, + "learning_rate": 0.000992177394053438, + "loss": 0.162, + "step": 105 + }, + { + "epoch": 0.19, + "grad_norm": 16.512048721313477, + "learning_rate": 0.0009920122830392748, + "loss": 3.3373, + "step": 106 + }, + { + "epoch": 0.19, + "grad_norm": 111.53176879882812, + "learning_rate": 0.0009918454617114918, + "loss": 2.3969, + "step": 107 + }, + { + "epoch": 0.19, + "grad_norm": 14.91741943359375, + "learning_rate": 0.0009916769306499865, + "loss": 1.8837, + "step": 108 + }, + { + "epoch": 0.19, + "grad_norm": 61.30055618286133, + "learning_rate": 0.0009915066904406, + "loss": 10.4922, + "step": 109 + }, + { + "epoch": 0.19, + "grad_norm": 0.6948704123497009, + "learning_rate": 0.0009913347416751147, + "loss": 0.1536, + "step": 110 + }, + { + "epoch": 0.2, + "grad_norm": 0.7721084356307983, + "learning_rate": 0.000991161084951252, + "loss": 0.1356, + "step": 111 + }, + { + "epoch": 0.2, + "grad_norm": 0.24614596366882324, + "learning_rate": 0.0009909857208726704, + "loss": 0.1339, + "step": 112 + }, + { + "epoch": 0.2, + "grad_norm": 7.189969062805176, + "learning_rate": 0.0009908086500489638, + "loss": 0.2551, + "step": 113 + }, + { + "epoch": 0.2, + "grad_norm": 0.8675662279129028, + "learning_rate": 0.0009906298730956585, + "loss": 0.1668, + "step": 114 + }, + { + "epoch": 0.2, + "grad_norm": 0.605249285697937, + "learning_rate": 0.0009904493906342123, + "loss": 0.1478, + "step": 115 + }, + { + "epoch": 0.21, + "grad_norm": 0.8765722513198853, + "learning_rate": 0.0009902672032920106, + "loss": 0.1598, + "step": 116 + }, + { + "epoch": 0.21, + "grad_norm": 0.6021157503128052, + "learning_rate": 0.0009900833117023665, + "loss": 0.1506, + "step": 117 + }, + { + "epoch": 0.21, + "grad_norm": 0.28180792927742004, + "learning_rate": 0.000989897716504516, + "loss": 0.1389, + "step": 118 + }, + { + "epoch": 0.21, + "grad_norm": 0.21730898320674896, + "learning_rate": 0.0009897104183436184, + "loss": 0.1377, + "step": 119 + }, + { + "epoch": 0.21, + "grad_norm": 0.977118730545044, + "learning_rate": 0.0009895214178707516, + "loss": 0.1698, + "step": 120 + }, + { + "epoch": 0.21, + "grad_norm": 2.674729585647583, + "learning_rate": 0.0009893307157429118, + "loss": 0.1559, + "step": 121 + }, + { + "epoch": 0.22, + "grad_norm": 0.9852035045623779, + "learning_rate": 0.0009891383126230102, + "loss": 0.2027, + "step": 122 + }, + { + "epoch": 0.22, + "grad_norm": 0.36689773201942444, + "learning_rate": 0.0009889442091798712, + "loss": 0.1498, + "step": 123 + }, + { + "epoch": 0.22, + "grad_norm": 0.104621522128582, + "learning_rate": 0.000988748406088229, + "loss": 0.1379, + "step": 124 + }, + { + "epoch": 0.22, + "grad_norm": 74.17496490478516, + "learning_rate": 0.0009885509040287268, + "loss": 0.7724, + "step": 125 + }, + { + "epoch": 0.22, + "grad_norm": 1.2943025827407837, + "learning_rate": 0.0009883517036879132, + "loss": 0.2643, + "step": 126 + }, + { + "epoch": 0.22, + "grad_norm": 0.828774094581604, + "learning_rate": 0.000988150805758241, + "loss": 0.1852, + "step": 127 + }, + { + "epoch": 0.23, + "grad_norm": 0.13165877759456635, + "learning_rate": 0.0009879482109380632, + "loss": 0.1429, + "step": 128 + }, + { + "epoch": 0.23, + "grad_norm": 0.662426769733429, + "learning_rate": 0.0009877439199316323, + "loss": 0.1643, + "step": 129 + }, + { + "epoch": 0.23, + "grad_norm": 0.6256189942359924, + "learning_rate": 0.0009875379334490962, + "loss": 0.157, + "step": 130 + }, + { + "epoch": 0.23, + "grad_norm": 0.5049256086349487, + "learning_rate": 0.0009873302522064972, + "loss": 0.1484, + "step": 131 + }, + { + "epoch": 0.23, + "grad_norm": 1.4133671522140503, + "learning_rate": 0.0009871208769257685, + "loss": 0.1736, + "step": 132 + }, + { + "epoch": 0.24, + "grad_norm": 0.7930824756622314, + "learning_rate": 0.0009869098083347323, + "loss": 0.1543, + "step": 133 + }, + { + "epoch": 0.24, + "grad_norm": 0.5717449188232422, + "learning_rate": 0.0009866970471670965, + "loss": 0.1338, + "step": 134 + }, + { + "epoch": 0.24, + "grad_norm": 0.582081139087677, + "learning_rate": 0.0009864825941624537, + "loss": 0.1692, + "step": 135 + }, + { + "epoch": 0.24, + "grad_norm": 10.226588249206543, + "learning_rate": 0.0009862664500662763, + "loss": 0.2425, + "step": 136 + }, + { + "epoch": 0.24, + "grad_norm": 1.1186953783035278, + "learning_rate": 0.0009860486156299164, + "loss": 0.2052, + "step": 137 + }, + { + "epoch": 0.24, + "grad_norm": 0.2953661382198334, + "learning_rate": 0.000985829091610601, + "loss": 0.1408, + "step": 138 + }, + { + "epoch": 0.25, + "grad_norm": 0.8647088408470154, + "learning_rate": 0.000985607878771431, + "loss": 0.1571, + "step": 139 + }, + { + "epoch": 0.25, + "grad_norm": 0.41964420676231384, + "learning_rate": 0.0009853849778813776, + "loss": 0.1477, + "step": 140 + }, + { + "epoch": 0.25, + "grad_norm": 0.25675931572914124, + "learning_rate": 0.0009851603897152803, + "loss": 0.1398, + "step": 141 + }, + { + "epoch": 0.25, + "grad_norm": 0.2311631143093109, + "learning_rate": 0.0009849341150538434, + "loss": 0.1432, + "step": 142 + }, + { + "epoch": 0.25, + "eval_loss": 1.5366541147232056, + "eval_runtime": 14.6962, + "eval_samples_per_second": 32.457, + "eval_steps_per_second": 8.165, + "step": 142 + }, + { + "epoch": 0.25, + "grad_norm": 41.83562469482422, + "learning_rate": 0.0009847061546836339, + "loss": 1.1525, + "step": 143 + }, + { + "epoch": 0.25, + "grad_norm": 0.27440375089645386, + "learning_rate": 0.0009844765093970787, + "loss": 0.1452, + "step": 144 + }, + { + "epoch": 0.26, + "grad_norm": 0.27643319964408875, + "learning_rate": 0.0009842451799924616, + "loss": 0.1069, + "step": 145 + }, + { + "epoch": 0.26, + "grad_norm": 0.21519601345062256, + "learning_rate": 0.0009840121672739207, + "loss": 0.1358, + "step": 146 + }, + { + "epoch": 0.26, + "grad_norm": 0.4073689877986908, + "learning_rate": 0.0009837774720514456, + "loss": 0.1545, + "step": 147 + }, + { + "epoch": 0.26, + "grad_norm": 0.13685636222362518, + "learning_rate": 0.0009835410951408747, + "loss": 0.1259, + "step": 148 + }, + { + "epoch": 0.26, + "grad_norm": 0.07474564015865326, + "learning_rate": 0.000983303037363892, + "loss": 0.1356, + "step": 149 + }, + { + "epoch": 0.27, + "grad_norm": 0.45116662979125977, + "learning_rate": 0.0009830632995480241, + "loss": 0.1379, + "step": 150 + }, + { + "epoch": 0.27, + "grad_norm": 0.1297813504934311, + "learning_rate": 0.0009828218825266388, + "loss": 0.1343, + "step": 151 + }, + { + "epoch": 0.27, + "grad_norm": 0.5846492052078247, + "learning_rate": 0.00098257878713894, + "loss": 0.1563, + "step": 152 + }, + { + "epoch": 0.27, + "grad_norm": 0.38457778096199036, + "learning_rate": 0.0009823340142299662, + "loss": 0.1477, + "step": 153 + }, + { + "epoch": 0.27, + "grad_norm": 0.09184035658836365, + "learning_rate": 0.0009820875646505873, + "loss": 0.1376, + "step": 154 + }, + { + "epoch": 0.27, + "grad_norm": 0.5166211128234863, + "learning_rate": 0.0009818394392575019, + "loss": 0.1498, + "step": 155 + }, + { + "epoch": 0.28, + "grad_norm": 0.2788640260696411, + "learning_rate": 0.0009815896389132332, + "loss": 0.1434, + "step": 156 + }, + { + "epoch": 0.28, + "grad_norm": 0.3762676417827606, + "learning_rate": 0.0009813381644861276, + "loss": 0.1482, + "step": 157 + }, + { + "epoch": 0.28, + "grad_norm": 0.3615610897541046, + "learning_rate": 0.0009810850168503506, + "loss": 0.1312, + "step": 158 + }, + { + "epoch": 0.28, + "grad_norm": 0.03483320027589798, + "learning_rate": 0.0009808301968858837, + "loss": 0.1239, + "step": 159 + }, + { + "epoch": 0.28, + "grad_norm": 0.5616227984428406, + "learning_rate": 0.0009805737054785222, + "loss": 0.1881, + "step": 160 + }, + { + "epoch": 0.28, + "grad_norm": 0.029542161151766777, + "learning_rate": 0.000980315543519871, + "loss": 0.1254, + "step": 161 + }, + { + "epoch": 0.29, + "grad_norm": 0.142581045627594, + "learning_rate": 0.0009800557119073433, + "loss": 0.1258, + "step": 162 + }, + { + "epoch": 0.29, + "grad_norm": 0.7289375066757202, + "learning_rate": 0.0009797942115441546, + "loss": 0.1526, + "step": 163 + }, + { + "epoch": 0.29, + "grad_norm": 0.6975064873695374, + "learning_rate": 0.0009795310433393224, + "loss": 0.1487, + "step": 164 + }, + { + "epoch": 0.29, + "grad_norm": 1.3072260618209839, + "learning_rate": 0.0009792662082076617, + "loss": 0.1712, + "step": 165 + }, + { + "epoch": 0.29, + "grad_norm": 0.2993917465209961, + "learning_rate": 0.000978999707069782, + "loss": 0.1424, + "step": 166 + }, + { + "epoch": 0.3, + "grad_norm": 0.3258236050605774, + "learning_rate": 0.0009787315408520839, + "loss": 0.135, + "step": 167 + }, + { + "epoch": 0.3, + "grad_norm": 0.26566603779792786, + "learning_rate": 0.000978461710486756, + "loss": 0.1441, + "step": 168 + }, + { + "epoch": 0.3, + "grad_norm": 1.1709599494934082, + "learning_rate": 0.0009781902169117718, + "loss": 0.2084, + "step": 169 + }, + { + "epoch": 0.3, + "grad_norm": 0.6554279923439026, + "learning_rate": 0.000977917061070887, + "loss": 0.1634, + "step": 170 + }, + { + "epoch": 0.3, + "grad_norm": 0.1635073721408844, + "learning_rate": 0.000977642243913635, + "loss": 0.1371, + "step": 171 + }, + { + "epoch": 0.3, + "grad_norm": 0.4419834017753601, + "learning_rate": 0.0009773657663953242, + "loss": 0.1523, + "step": 172 + }, + { + "epoch": 0.31, + "grad_norm": 0.839259147644043, + "learning_rate": 0.000977087629477035, + "loss": 0.1628, + "step": 173 + }, + { + "epoch": 0.31, + "grad_norm": 0.1979222148656845, + "learning_rate": 0.0009768078341256155, + "loss": 0.1367, + "step": 174 + }, + { + "epoch": 0.31, + "grad_norm": 0.2939910888671875, + "learning_rate": 0.0009765263813136795, + "loss": 0.1349, + "step": 175 + }, + { + "epoch": 0.31, + "grad_norm": 0.19882674515247345, + "learning_rate": 0.0009762432720196024, + "loss": 0.1424, + "step": 176 + }, + { + "epoch": 0.31, + "grad_norm": 0.07146954536437988, + "learning_rate": 0.000975958507227517, + "loss": 0.1237, + "step": 177 + }, + { + "epoch": 0.31, + "grad_norm": 0.5031868815422058, + "learning_rate": 0.0009756720879273117, + "loss": 0.1592, + "step": 178 + }, + { + "epoch": 0.32, + "grad_norm": 0.14860151708126068, + "learning_rate": 0.0009753840151146258, + "loss": 0.1396, + "step": 179 + }, + { + "epoch": 0.32, + "grad_norm": 0.10280521959066391, + "learning_rate": 0.0009750942897908468, + "loss": 0.1333, + "step": 180 + }, + { + "epoch": 0.32, + "grad_norm": 0.4652903974056244, + "learning_rate": 0.0009748029129631061, + "loss": 0.1421, + "step": 181 + }, + { + "epoch": 0.32, + "grad_norm": 0.3985591530799866, + "learning_rate": 0.0009745098856442768, + "loss": 0.1459, + "step": 182 + }, + { + "epoch": 0.32, + "grad_norm": 0.20321591198444366, + "learning_rate": 0.0009742152088529683, + "loss": 0.1381, + "step": 183 + }, + { + "epoch": 0.33, + "grad_norm": 0.7694361805915833, + "learning_rate": 0.0009739188836135246, + "loss": 0.1676, + "step": 184 + }, + { + "epoch": 0.33, + "grad_norm": 0.04469340294599533, + "learning_rate": 0.0009736209109560201, + "loss": 0.136, + "step": 185 + }, + { + "epoch": 0.33, + "grad_norm": 0.08576061576604843, + "learning_rate": 0.0009733212919162549, + "loss": 0.1408, + "step": 186 + }, + { + "epoch": 0.33, + "grad_norm": 0.042906519025564194, + "learning_rate": 0.0009730200275357535, + "loss": 0.1364, + "step": 187 + }, + { + "epoch": 0.33, + "grad_norm": 0.30054494738578796, + "learning_rate": 0.0009727171188617588, + "loss": 0.1539, + "step": 188 + }, + { + "epoch": 0.33, + "grad_norm": 0.05149005725979805, + "learning_rate": 0.0009724125669472299, + "loss": 0.1352, + "step": 189 + }, + { + "epoch": 0.34, + "grad_norm": 0.1381620466709137, + "learning_rate": 0.0009721063728508383, + "loss": 0.1409, + "step": 190 + }, + { + "epoch": 0.34, + "grad_norm": 0.37344205379486084, + "learning_rate": 0.0009717985376369639, + "loss": 0.1299, + "step": 191 + }, + { + "epoch": 0.34, + "grad_norm": 0.1037706583738327, + "learning_rate": 0.0009714890623756912, + "loss": 0.1341, + "step": 192 + }, + { + "epoch": 0.34, + "grad_norm": 0.14189712703227997, + "learning_rate": 0.0009711779481428056, + "loss": 0.1418, + "step": 193 + }, + { + "epoch": 0.34, + "grad_norm": 0.15108801424503326, + "learning_rate": 0.0009708651960197903, + "loss": 0.142, + "step": 194 + }, + { + "epoch": 0.34, + "grad_norm": 0.037045519798994064, + "learning_rate": 0.0009705508070938218, + "loss": 0.1315, + "step": 195 + }, + { + "epoch": 0.35, + "grad_norm": 0.23301652073860168, + "learning_rate": 0.0009702347824577666, + "loss": 0.1396, + "step": 196 + }, + { + "epoch": 0.35, + "grad_norm": 0.08476269990205765, + "learning_rate": 0.0009699171232101768, + "loss": 0.1392, + "step": 197 + }, + { + "epoch": 0.35, + "grad_norm": 0.4222690463066101, + "learning_rate": 0.000969597830455287, + "loss": 0.1463, + "step": 198 + }, + { + "epoch": 0.35, + "grad_norm": 0.3234136402606964, + "learning_rate": 0.0009692769053030099, + "loss": 0.1257, + "step": 199 + }, + { + "epoch": 0.35, + "grad_norm": 0.04025443643331528, + "learning_rate": 0.0009689543488689332, + "loss": 0.1303, + "step": 200 + }, + { + "epoch": 0.36, + "grad_norm": 0.07074520736932755, + "learning_rate": 0.0009686301622743144, + "loss": 0.1289, + "step": 201 + }, + { + "epoch": 0.36, + "grad_norm": 0.0788850486278534, + "learning_rate": 0.0009683043466460782, + "loss": 0.1236, + "step": 202 + }, + { + "epoch": 0.36, + "grad_norm": 0.525541365146637, + "learning_rate": 0.000967976903116812, + "loss": 0.1564, + "step": 203 + }, + { + "epoch": 0.36, + "grad_norm": 0.6145509481430054, + "learning_rate": 0.0009676478328247623, + "loss": 0.156, + "step": 204 + }, + { + "epoch": 0.36, + "grad_norm": 0.230132058262825, + "learning_rate": 0.0009673171369138296, + "loss": 0.1425, + "step": 205 + }, + { + "epoch": 0.36, + "grad_norm": 0.03262978792190552, + "learning_rate": 0.0009669848165335666, + "loss": 0.1297, + "step": 206 + }, + { + "epoch": 0.37, + "grad_norm": 0.0462469644844532, + "learning_rate": 0.0009666508728391718, + "loss": 0.1177, + "step": 207 + }, + { + "epoch": 0.37, + "grad_norm": 0.06880385428667068, + "learning_rate": 0.0009663153069914874, + "loss": 0.1207, + "step": 208 + }, + { + "epoch": 0.37, + "grad_norm": 0.4248260259628296, + "learning_rate": 0.000965978120156994, + "loss": 0.1571, + "step": 209 + }, + { + "epoch": 0.37, + "grad_norm": 0.060492075979709625, + "learning_rate": 0.0009656393135078068, + "loss": 0.1219, + "step": 210 + }, + { + "epoch": 0.37, + "grad_norm": 0.12135621905326843, + "learning_rate": 0.0009652988882216725, + "loss": 0.1323, + "step": 211 + }, + { + "epoch": 0.38, + "grad_norm": 0.252119243144989, + "learning_rate": 0.0009649568454819637, + "loss": 0.1366, + "step": 212 + }, + { + "epoch": 0.38, + "grad_norm": 0.5283567905426025, + "learning_rate": 0.0009646131864776761, + "loss": 0.1246, + "step": 213 + }, + { + "epoch": 0.38, + "grad_norm": 2.224665880203247, + "learning_rate": 0.0009642679124034233, + "loss": 0.2582, + "step": 214 + }, + { + "epoch": 0.38, + "grad_norm": 1.9277523756027222, + "learning_rate": 0.0009639210244594335, + "loss": 0.2131, + "step": 215 + }, + { + "epoch": 0.38, + "grad_norm": 0.5668452978134155, + "learning_rate": 0.0009635725238515446, + "loss": 0.141, + "step": 216 + }, + { + "epoch": 0.38, + "grad_norm": 0.13912492990493774, + "learning_rate": 0.000963222411791201, + "loss": 0.1418, + "step": 217 + }, + { + "epoch": 0.39, + "grad_norm": 0.39307814836502075, + "learning_rate": 0.0009628706894954479, + "loss": 0.1477, + "step": 218 + }, + { + "epoch": 0.39, + "grad_norm": 0.26248928904533386, + "learning_rate": 0.000962517358186929, + "loss": 0.1315, + "step": 219 + }, + { + "epoch": 0.39, + "grad_norm": 0.2875257730484009, + "learning_rate": 0.0009621624190938803, + "loss": 0.1321, + "step": 220 + }, + { + "epoch": 0.39, + "grad_norm": 0.6386964917182922, + "learning_rate": 0.0009618058734501269, + "loss": 0.1668, + "step": 221 + }, + { + "epoch": 0.39, + "grad_norm": 0.16165001690387726, + "learning_rate": 0.0009614477224950789, + "loss": 0.1272, + "step": 222 + }, + { + "epoch": 0.39, + "grad_norm": 0.6959558129310608, + "learning_rate": 0.0009610879674737262, + "loss": 0.1381, + "step": 223 + }, + { + "epoch": 0.4, + "grad_norm": 0.1701437532901764, + "learning_rate": 0.0009607266096366352, + "loss": 0.1366, + "step": 224 + }, + { + "epoch": 0.4, + "grad_norm": 0.2511409819126129, + "learning_rate": 0.0009603636502399437, + "loss": 0.126, + "step": 225 + }, + { + "epoch": 0.4, + "grad_norm": 0.04554220288991928, + "learning_rate": 0.0009599990905453566, + "loss": 0.1321, + "step": 226 + }, + { + "epoch": 0.4, + "grad_norm": 0.3964705765247345, + "learning_rate": 0.000959632931820142, + "loss": 0.1383, + "step": 227 + }, + { + "epoch": 0.4, + "grad_norm": 0.10925984382629395, + "learning_rate": 0.0009592651753371264, + "loss": 0.1226, + "step": 228 + }, + { + "epoch": 0.41, + "grad_norm": 0.19012318551540375, + "learning_rate": 0.0009588958223746903, + "loss": 0.1255, + "step": 229 + }, + { + "epoch": 0.41, + "grad_norm": 0.23432157933712006, + "learning_rate": 0.0009585248742167639, + "loss": 0.1152, + "step": 230 + }, + { + "epoch": 0.41, + "grad_norm": 0.1737753301858902, + "learning_rate": 0.0009581523321528223, + "loss": 0.1468, + "step": 231 + }, + { + "epoch": 0.41, + "grad_norm": 0.2625434100627899, + "learning_rate": 0.0009577781974778817, + "loss": 0.1296, + "step": 232 + }, + { + "epoch": 0.41, + "grad_norm": 0.3056884706020355, + "learning_rate": 0.000957402471492494, + "loss": 0.1574, + "step": 233 + }, + { + "epoch": 0.41, + "grad_norm": 0.4111999273300171, + "learning_rate": 0.0009570251555027432, + "loss": 0.1434, + "step": 234 + }, + { + "epoch": 0.42, + "grad_norm": 0.056673482060432434, + "learning_rate": 0.0009566462508202401, + "loss": 0.1337, + "step": 235 + }, + { + "epoch": 0.42, + "grad_norm": 0.3861597180366516, + "learning_rate": 0.0009562657587621184, + "loss": 0.1609, + "step": 236 + }, + { + "epoch": 0.42, + "grad_norm": 0.35893362760543823, + "learning_rate": 0.0009558836806510292, + "loss": 0.1189, + "step": 237 + }, + { + "epoch": 0.42, + "grad_norm": 0.40538331866264343, + "learning_rate": 0.0009555000178151374, + "loss": 0.1504, + "step": 238 + }, + { + "epoch": 0.42, + "grad_norm": 81.36141967773438, + "learning_rate": 0.0009551147715881167, + "loss": 4.7235, + "step": 239 + }, + { + "epoch": 0.42, + "grad_norm": 0.21178042888641357, + "learning_rate": 0.0009547279433091446, + "loss": 0.1139, + "step": 240 + }, + { + "epoch": 0.43, + "grad_norm": 0.27380529046058655, + "learning_rate": 0.0009543395343228983, + "loss": 0.1504, + "step": 241 + }, + { + "epoch": 0.43, + "grad_norm": 41.42683410644531, + "learning_rate": 0.0009539495459795498, + "loss": 1.2477, + "step": 242 + }, + { + "epoch": 0.43, + "grad_norm": 0.14853385090827942, + "learning_rate": 0.0009535579796347612, + "loss": 0.1343, + "step": 243 + }, + { + "epoch": 0.43, + "grad_norm": 0.3484509289264679, + "learning_rate": 0.0009531648366496798, + "loss": 0.15, + "step": 244 + }, + { + "epoch": 0.43, + "grad_norm": 0.20152732729911804, + "learning_rate": 0.0009527701183909336, + "loss": 0.1399, + "step": 245 + }, + { + "epoch": 0.44, + "grad_norm": 80.84031677246094, + "learning_rate": 0.000952373826230627, + "loss": 3.1939, + "step": 246 + }, + { + "epoch": 0.44, + "grad_norm": 15.475607872009277, + "learning_rate": 0.0009519759615463346, + "loss": 3.3935, + "step": 247 + }, + { + "epoch": 0.44, + "grad_norm": 77.19477081298828, + "learning_rate": 0.0009515765257210979, + "loss": 6.5034, + "step": 248 + }, + { + "epoch": 0.44, + "grad_norm": 0.1174071803689003, + "learning_rate": 0.0009511755201434205, + "loss": 0.1212, + "step": 249 + }, + { + "epoch": 0.44, + "grad_norm": 16.503982543945312, + "learning_rate": 0.0009507729462072614, + "loss": 0.3753, + "step": 250 + }, + { + "epoch": 0.44, + "grad_norm": 76.65412902832031, + "learning_rate": 0.0009503688053120326, + "loss": 0.9386, + "step": 251 + }, + { + "epoch": 0.45, + "grad_norm": 94.82160186767578, + "learning_rate": 0.0009499630988625925, + "loss": 4.7449, + "step": 252 + }, + { + "epoch": 0.45, + "grad_norm": 0.2721010148525238, + "learning_rate": 0.0009495558282692421, + "loss": 0.1358, + "step": 253 + }, + { + "epoch": 0.45, + "grad_norm": 0.5150814056396484, + "learning_rate": 0.0009491469949477187, + "loss": 0.1622, + "step": 254 + }, + { + "epoch": 0.45, + "grad_norm": 51.050167083740234, + "learning_rate": 0.0009487366003191931, + "loss": 0.7818, + "step": 255 + }, + { + "epoch": 0.45, + "grad_norm": 11.698090553283691, + "learning_rate": 0.0009483246458102625, + "loss": 0.3862, + "step": 256 + }, + { + "epoch": 0.45, + "grad_norm": 0.648543655872345, + "learning_rate": 0.0009479111328529472, + "loss": 0.1884, + "step": 257 + }, + { + "epoch": 0.46, + "grad_norm": 0.745293140411377, + "learning_rate": 0.0009474960628846843, + "loss": 0.1562, + "step": 258 + }, + { + "epoch": 0.46, + "grad_norm": 0.17890043556690216, + "learning_rate": 0.0009470794373483235, + "loss": 0.1425, + "step": 259 + }, + { + "epoch": 0.46, + "grad_norm": 0.5058090686798096, + "learning_rate": 0.0009466612576921223, + "loss": 0.17, + "step": 260 + }, + { + "epoch": 0.46, + "grad_norm": 1.3177820444107056, + "learning_rate": 0.00094624152536974, + "loss": 0.15, + "step": 261 + }, + { + "epoch": 0.46, + "grad_norm": 0.49652573466300964, + "learning_rate": 0.0009458202418402337, + "loss": 0.145, + "step": 262 + }, + { + "epoch": 0.47, + "grad_norm": 11.423394203186035, + "learning_rate": 0.0009453974085680526, + "loss": 0.349, + "step": 263 + }, + { + "epoch": 0.47, + "grad_norm": 1.5422337055206299, + "learning_rate": 0.0009449730270230326, + "loss": 0.211, + "step": 264 + }, + { + "epoch": 0.47, + "grad_norm": 103.68435668945312, + "learning_rate": 0.0009445470986803921, + "loss": 17.4069, + "step": 265 + }, + { + "epoch": 0.47, + "grad_norm": 54.51758575439453, + "learning_rate": 0.0009441196250207267, + "loss": 15.685, + "step": 266 + }, + { + "epoch": 0.47, + "grad_norm": 14.596623420715332, + "learning_rate": 0.0009436906075300032, + "loss": 0.791, + "step": 267 + }, + { + "epoch": 0.47, + "grad_norm": 3.3164780139923096, + "learning_rate": 0.000943260047699555, + "loss": 0.3611, + "step": 268 + }, + { + "epoch": 0.48, + "grad_norm": 0.3087855577468872, + "learning_rate": 0.0009428279470260776, + "loss": 0.1332, + "step": 269 + }, + { + "epoch": 0.48, + "grad_norm": 1.1544523239135742, + "learning_rate": 0.0009423943070116219, + "loss": 0.2405, + "step": 270 + }, + { + "epoch": 0.48, + "grad_norm": 0.27010253071784973, + "learning_rate": 0.00094195912916359, + "loss": 0.1241, + "step": 271 + }, + { + "epoch": 0.48, + "grad_norm": 0.2287709265947342, + "learning_rate": 0.0009415224149947306, + "loss": 0.1366, + "step": 272 + }, + { + "epoch": 0.48, + "grad_norm": 0.5216432809829712, + "learning_rate": 0.0009410841660231316, + "loss": 0.1641, + "step": 273 + }, + { + "epoch": 0.48, + "grad_norm": 1.3091949224472046, + "learning_rate": 0.0009406443837722167, + "loss": 0.2524, + "step": 274 + }, + { + "epoch": 0.49, + "grad_norm": 0.11813609302043915, + "learning_rate": 0.0009402030697707398, + "loss": 0.1353, + "step": 275 + }, + { + "epoch": 0.49, + "grad_norm": 1.3709551095962524, + "learning_rate": 0.000939760225552779, + "loss": 0.2714, + "step": 276 + }, + { + "epoch": 0.49, + "grad_norm": 8.527563095092773, + "learning_rate": 0.0009393158526577322, + "loss": 0.1955, + "step": 277 + }, + { + "epoch": 0.49, + "grad_norm": 21.874027252197266, + "learning_rate": 0.0009388699526303105, + "loss": 0.2398, + "step": 278 + }, + { + "epoch": 0.49, + "grad_norm": 51.793731689453125, + "learning_rate": 0.0009384225270205339, + "loss": 1.3069, + "step": 279 + }, + { + "epoch": 0.5, + "grad_norm": 0.6711062788963318, + "learning_rate": 0.0009379735773837259, + "loss": 0.1664, + "step": 280 + }, + { + "epoch": 0.5, + "grad_norm": 5.93789005279541, + "learning_rate": 0.0009375231052805072, + "loss": 0.2455, + "step": 281 + }, + { + "epoch": 0.5, + "grad_norm": 62.527198791503906, + "learning_rate": 0.0009370711122767912, + "loss": 6.6447, + "step": 282 + }, + { + "epoch": 0.5, + "grad_norm": 22.35348129272461, + "learning_rate": 0.000936617599943778, + "loss": 2.5015, + "step": 283 + }, + { + "epoch": 0.5, + "grad_norm": 0.7277780175209045, + "learning_rate": 0.0009361625698579493, + "loss": 0.1667, + "step": 284 + }, + { + "epoch": 0.5, + "eval_loss": 0.14179374277591705, + "eval_runtime": 14.7139, + "eval_samples_per_second": 32.418, + "eval_steps_per_second": 8.156, + "step": 284 + }, + { + "epoch": 0.5, + "grad_norm": 0.26271358132362366, + "learning_rate": 0.0009357060236010625, + "loss": 0.1429, + "step": 285 + }, + { + "epoch": 0.51, + "grad_norm": 21.24464988708496, + "learning_rate": 0.0009352479627601457, + "loss": 2.0706, + "step": 286 + }, + { + "epoch": 0.51, + "grad_norm": 6.5764265060424805, + "learning_rate": 0.0009347883889274922, + "loss": 0.3337, + "step": 287 + }, + { + "epoch": 0.51, + "grad_norm": 0.6868380904197693, + "learning_rate": 0.0009343273037006539, + "loss": 0.1994, + "step": 288 + }, + { + "epoch": 0.51, + "grad_norm": 0.9018234610557556, + "learning_rate": 0.0009338647086824372, + "loss": 0.1908, + "step": 289 + }, + { + "epoch": 0.51, + "grad_norm": 1.7751502990722656, + "learning_rate": 0.0009334006054808966, + "loss": 0.2028, + "step": 290 + }, + { + "epoch": 0.51, + "grad_norm": 0.5386408567428589, + "learning_rate": 0.0009329349957093293, + "loss": 0.1853, + "step": 291 + }, + { + "epoch": 0.52, + "grad_norm": 1.4171103239059448, + "learning_rate": 0.0009324678809862695, + "loss": 0.3597, + "step": 292 + }, + { + "epoch": 0.52, + "grad_norm": 0.4105970561504364, + "learning_rate": 0.0009319992629354827, + "loss": 0.1344, + "step": 293 + }, + { + "epoch": 0.52, + "grad_norm": 0.26628127694129944, + "learning_rate": 0.000931529143185961, + "loss": 0.1453, + "step": 294 + }, + { + "epoch": 0.52, + "grad_norm": 14.981964111328125, + "learning_rate": 0.0009310575233719154, + "loss": 0.2563, + "step": 295 + }, + { + "epoch": 0.52, + "grad_norm": 0.6945788264274597, + "learning_rate": 0.0009305844051327725, + "loss": 0.1229, + "step": 296 + }, + { + "epoch": 0.53, + "grad_norm": 31.034496307373047, + "learning_rate": 0.000930109790113167, + "loss": 1.2974, + "step": 297 + }, + { + "epoch": 0.53, + "grad_norm": 1.5794603824615479, + "learning_rate": 0.0009296336799629368, + "loss": 0.22, + "step": 298 + }, + { + "epoch": 0.53, + "grad_norm": 0.33219394087791443, + "learning_rate": 0.0009291560763371172, + "loss": 0.1262, + "step": 299 + }, + { + "epoch": 0.53, + "grad_norm": 2.597118377685547, + "learning_rate": 0.000928676980895935, + "loss": 0.4026, + "step": 300 + }, + { + "epoch": 0.53, + "grad_norm": 13.547090530395508, + "learning_rate": 0.0009281963953048029, + "loss": 1.3086, + "step": 301 + }, + { + "epoch": 0.53, + "grad_norm": 1.289302945137024, + "learning_rate": 0.0009277143212343134, + "loss": 0.2215, + "step": 302 + }, + { + "epoch": 0.54, + "grad_norm": 1.2176313400268555, + "learning_rate": 0.0009272307603602334, + "loss": 0.15, + "step": 303 + }, + { + "epoch": 0.54, + "grad_norm": 4.436944007873535, + "learning_rate": 0.0009267457143634979, + "loss": 0.514, + "step": 304 + }, + { + "epoch": 0.54, + "grad_norm": 29.960241317749023, + "learning_rate": 0.0009262591849302047, + "loss": 3.5389, + "step": 305 + }, + { + "epoch": 0.54, + "grad_norm": 5.514049530029297, + "learning_rate": 0.0009257711737516082, + "loss": 0.2902, + "step": 306 + }, + { + "epoch": 0.54, + "grad_norm": 2.331019401550293, + "learning_rate": 0.0009252816825241135, + "loss": 0.2775, + "step": 307 + }, + { + "epoch": 0.54, + "grad_norm": 0.5708584189414978, + "learning_rate": 0.0009247907129492707, + "loss": 0.1438, + "step": 308 + }, + { + "epoch": 0.55, + "grad_norm": 2.16607928276062, + "learning_rate": 0.0009242982667337685, + "loss": 0.2383, + "step": 309 + }, + { + "epoch": 0.55, + "grad_norm": 1.5346423387527466, + "learning_rate": 0.0009238043455894293, + "loss": 0.1793, + "step": 310 + }, + { + "epoch": 0.55, + "grad_norm": 0.6200052499771118, + "learning_rate": 0.000923308951233202, + "loss": 0.1473, + "step": 311 + }, + { + "epoch": 0.55, + "grad_norm": 64.87612915039062, + "learning_rate": 0.0009228120853871572, + "loss": 0.8875, + "step": 312 + }, + { + "epoch": 0.55, + "grad_norm": 1.077471137046814, + "learning_rate": 0.0009223137497784797, + "loss": 0.2114, + "step": 313 + }, + { + "epoch": 0.56, + "grad_norm": 3.5934722423553467, + "learning_rate": 0.0009218139461394644, + "loss": 0.2852, + "step": 314 + }, + { + "epoch": 0.56, + "grad_norm": 0.10276800394058228, + "learning_rate": 0.0009213126762075088, + "loss": 0.1365, + "step": 315 + }, + { + "epoch": 0.56, + "grad_norm": 3.9422831535339355, + "learning_rate": 0.0009208099417251077, + "loss": 0.2949, + "step": 316 + }, + { + "epoch": 0.56, + "grad_norm": 1.7574914693832397, + "learning_rate": 0.0009203057444398468, + "loss": 0.2621, + "step": 317 + }, + { + "epoch": 0.56, + "grad_norm": 0.29479530453681946, + "learning_rate": 0.0009198000861043967, + "loss": 0.1341, + "step": 318 + }, + { + "epoch": 0.56, + "grad_norm": 0.5362827181816101, + "learning_rate": 0.0009192929684765068, + "loss": 0.1398, + "step": 319 + }, + { + "epoch": 0.57, + "grad_norm": 0.8159481287002563, + "learning_rate": 0.0009187843933189994, + "loss": 0.1863, + "step": 320 + }, + { + "epoch": 0.57, + "grad_norm": 0.9413295388221741, + "learning_rate": 0.0009182743623997634, + "loss": 0.2104, + "step": 321 + }, + { + "epoch": 0.57, + "grad_norm": 0.5306220650672913, + "learning_rate": 0.0009177628774917479, + "loss": 0.1537, + "step": 322 + }, + { + "epoch": 0.57, + "grad_norm": 0.8887706398963928, + "learning_rate": 0.0009172499403729567, + "loss": 0.1963, + "step": 323 + }, + { + "epoch": 0.57, + "grad_norm": 0.8467744588851929, + "learning_rate": 0.0009167355528264414, + "loss": 0.204, + "step": 324 + }, + { + "epoch": 0.57, + "grad_norm": 0.19867151975631714, + "learning_rate": 0.0009162197166402956, + "loss": 0.1407, + "step": 325 + }, + { + "epoch": 0.58, + "grad_norm": 0.13638383150100708, + "learning_rate": 0.0009157024336076487, + "loss": 0.1408, + "step": 326 + }, + { + "epoch": 0.58, + "grad_norm": 0.2027496099472046, + "learning_rate": 0.0009151837055266594, + "loss": 0.1444, + "step": 327 + }, + { + "epoch": 0.58, + "grad_norm": 0.370151549577713, + "learning_rate": 0.0009146635342005098, + "loss": 0.158, + "step": 328 + }, + { + "epoch": 0.58, + "grad_norm": 0.3114052414894104, + "learning_rate": 0.000914141921437399, + "loss": 0.1464, + "step": 329 + }, + { + "epoch": 0.58, + "grad_norm": 0.15394961833953857, + "learning_rate": 0.0009136188690505362, + "loss": 0.1341, + "step": 330 + }, + { + "epoch": 0.59, + "grad_norm": 0.46498528122901917, + "learning_rate": 0.0009130943788581359, + "loss": 0.1426, + "step": 331 + }, + { + "epoch": 0.59, + "grad_norm": 0.28067877888679504, + "learning_rate": 0.00091256845268341, + "loss": 0.1409, + "step": 332 + }, + { + "epoch": 0.59, + "grad_norm": 0.061186857521533966, + "learning_rate": 0.0009120410923545619, + "loss": 0.1401, + "step": 333 + }, + { + "epoch": 0.59, + "grad_norm": 0.26736098527908325, + "learning_rate": 0.0009115122997047811, + "loss": 0.1467, + "step": 334 + }, + { + "epoch": 0.59, + "grad_norm": 0.5139696598052979, + "learning_rate": 0.0009109820765722356, + "loss": 0.1585, + "step": 335 + }, + { + "epoch": 0.59, + "grad_norm": 0.40007275342941284, + "learning_rate": 0.000910450424800066, + "loss": 0.1473, + "step": 336 + }, + { + "epoch": 0.6, + "grad_norm": 0.66825270652771, + "learning_rate": 0.0009099173462363792, + "loss": 0.1572, + "step": 337 + }, + { + "epoch": 0.6, + "grad_norm": 0.5313024520874023, + "learning_rate": 0.0009093828427342418, + "loss": 0.1555, + "step": 338 + }, + { + "epoch": 0.6, + "grad_norm": 0.4224655330181122, + "learning_rate": 0.0009088469161516735, + "loss": 0.1429, + "step": 339 + }, + { + "epoch": 0.6, + "grad_norm": 0.03462248668074608, + "learning_rate": 0.0009083095683516414, + "loss": 0.1325, + "step": 340 + }, + { + "epoch": 0.6, + "grad_norm": 0.542322039604187, + "learning_rate": 0.0009077708012020524, + "loss": 0.1755, + "step": 341 + }, + { + "epoch": 0.61, + "grad_norm": 0.2164747267961502, + "learning_rate": 0.0009072306165757476, + "loss": 0.1458, + "step": 342 + }, + { + "epoch": 0.61, + "grad_norm": 0.27414461970329285, + "learning_rate": 0.0009066890163504955, + "loss": 0.1512, + "step": 343 + }, + { + "epoch": 0.61, + "grad_norm": 0.1911482959985733, + "learning_rate": 0.0009061460024089853, + "loss": 0.1185, + "step": 344 + }, + { + "epoch": 0.61, + "grad_norm": 0.1287711262702942, + "learning_rate": 0.0009056015766388205, + "loss": 0.1372, + "step": 345 + }, + { + "epoch": 0.61, + "grad_norm": 0.18598809838294983, + "learning_rate": 0.0009050557409325125, + "loss": 0.1341, + "step": 346 + }, + { + "epoch": 0.61, + "grad_norm": 0.18694853782653809, + "learning_rate": 0.0009045084971874737, + "loss": 0.141, + "step": 347 + }, + { + "epoch": 0.62, + "grad_norm": 0.06479912996292114, + "learning_rate": 0.0009039598473060113, + "loss": 0.1368, + "step": 348 + }, + { + "epoch": 0.62, + "grad_norm": 0.17768733203411102, + "learning_rate": 0.0009034097931953201, + "loss": 0.1381, + "step": 349 + }, + { + "epoch": 0.62, + "grad_norm": 0.28938984870910645, + "learning_rate": 0.0009028583367674765, + "loss": 0.1365, + "step": 350 + }, + { + "epoch": 0.62, + "grad_norm": 0.2924034893512726, + "learning_rate": 0.0009023054799394316, + "loss": 0.1282, + "step": 351 + }, + { + "epoch": 0.62, + "grad_norm": 0.28439652919769287, + "learning_rate": 0.0009017512246330042, + "loss": 0.151, + "step": 352 + }, + { + "epoch": 0.62, + "grad_norm": 0.14329224824905396, + "learning_rate": 0.0009011955727748749, + "loss": 0.1419, + "step": 353 + }, + { + "epoch": 0.63, + "grad_norm": 0.15245947241783142, + "learning_rate": 0.0009006385262965785, + "loss": 0.1163, + "step": 354 + }, + { + "epoch": 0.63, + "grad_norm": 0.052399642765522, + "learning_rate": 0.000900080087134498, + "loss": 0.1241, + "step": 355 + }, + { + "epoch": 0.63, + "grad_norm": 0.030301153659820557, + "learning_rate": 0.0008995202572298575, + "loss": 0.1232, + "step": 356 + }, + { + "epoch": 0.63, + "grad_norm": 0.41738417744636536, + "learning_rate": 0.0008989590385287155, + "loss": 0.1675, + "step": 357 + }, + { + "epoch": 0.63, + "grad_norm": 0.19307875633239746, + "learning_rate": 0.0008983964329819583, + "loss": 0.1328, + "step": 358 + }, + { + "epoch": 0.64, + "grad_norm": 0.05682377517223358, + "learning_rate": 0.000897832442545293, + "loss": 0.1322, + "step": 359 + }, + { + "epoch": 0.64, + "grad_norm": 0.15418089926242828, + "learning_rate": 0.0008972670691792409, + "loss": 0.1414, + "step": 360 + }, + { + "epoch": 0.64, + "grad_norm": 0.07167459279298782, + "learning_rate": 0.0008967003148491304, + "loss": 0.1414, + "step": 361 + }, + { + "epoch": 0.64, + "grad_norm": 0.2866109609603882, + "learning_rate": 0.0008961321815250904, + "loss": 0.1381, + "step": 362 + }, + { + "epoch": 0.64, + "grad_norm": 0.281264990568161, + "learning_rate": 0.0008955626711820438, + "loss": 0.1365, + "step": 363 + }, + { + "epoch": 0.64, + "grad_norm": 0.19263768196105957, + "learning_rate": 0.0008949917857996997, + "loss": 0.1394, + "step": 364 + }, + { + "epoch": 0.65, + "grad_norm": 0.30531641840934753, + "learning_rate": 0.0008944195273625471, + "loss": 0.1478, + "step": 365 + }, + { + "epoch": 0.65, + "grad_norm": 0.16229306161403656, + "learning_rate": 0.0008938458978598483, + "loss": 0.1412, + "step": 366 + }, + { + "epoch": 0.65, + "grad_norm": 0.09315463900566101, + "learning_rate": 0.0008932708992856315, + "loss": 0.1397, + "step": 367 + }, + { + "epoch": 0.65, + "grad_norm": 0.04228806868195534, + "learning_rate": 0.0008926945336386838, + "loss": 0.1383, + "step": 368 + }, + { + "epoch": 0.65, + "grad_norm": 0.2209407389163971, + "learning_rate": 0.0008921168029225448, + "loss": 0.1434, + "step": 369 + }, + { + "epoch": 0.65, + "grad_norm": 0.04254443198442459, + "learning_rate": 0.0008915377091454992, + "loss": 0.1326, + "step": 370 + }, + { + "epoch": 0.66, + "grad_norm": 0.09651175886392593, + "learning_rate": 0.0008909572543205698, + "loss": 0.134, + "step": 371 + }, + { + "epoch": 0.66, + "grad_norm": 0.2821654975414276, + "learning_rate": 0.0008903754404655105, + "loss": 0.1498, + "step": 372 + }, + { + "epoch": 0.66, + "grad_norm": 0.43042680621147156, + "learning_rate": 0.0008897922696027998, + "loss": 0.1571, + "step": 373 + }, + { + "epoch": 0.66, + "grad_norm": 0.06591568142175674, + "learning_rate": 0.0008892077437596332, + "loss": 0.1391, + "step": 374 + }, + { + "epoch": 0.66, + "grad_norm": 0.08771979063749313, + "learning_rate": 0.0008886218649679161, + "loss": 0.1375, + "step": 375 + }, + { + "epoch": 0.67, + "grad_norm": 0.03339942544698715, + "learning_rate": 0.0008880346352642574, + "loss": 0.1368, + "step": 376 + }, + { + "epoch": 0.67, + "grad_norm": 0.15352453291416168, + "learning_rate": 0.0008874460566899616, + "loss": 0.1447, + "step": 377 + }, + { + "epoch": 0.67, + "grad_norm": 0.1778584122657776, + "learning_rate": 0.0008868561312910222, + "loss": 0.1189, + "step": 378 + }, + { + "epoch": 0.67, + "grad_norm": 0.11893154680728912, + "learning_rate": 0.0008862648611181144, + "loss": 0.1167, + "step": 379 + }, + { + "epoch": 0.67, + "grad_norm": 0.4323861598968506, + "learning_rate": 0.0008856722482265886, + "loss": 0.1691, + "step": 380 + }, + { + "epoch": 0.67, + "grad_norm": 0.28813356161117554, + "learning_rate": 0.0008850782946764618, + "loss": 0.1505, + "step": 381 + }, + { + "epoch": 0.68, + "grad_norm": 0.5008757710456848, + "learning_rate": 0.0008844830025324122, + "loss": 0.1671, + "step": 382 + }, + { + "epoch": 0.68, + "grad_norm": 0.12061876803636551, + "learning_rate": 0.0008838863738637705, + "loss": 0.1375, + "step": 383 + }, + { + "epoch": 0.68, + "grad_norm": 0.6747052073478699, + "learning_rate": 0.0008832884107445138, + "loss": 0.1663, + "step": 384 + }, + { + "epoch": 0.68, + "grad_norm": 0.18846777081489563, + "learning_rate": 0.0008826891152532579, + "loss": 0.1148, + "step": 385 + }, + { + "epoch": 0.68, + "grad_norm": 0.0950111448764801, + "learning_rate": 0.0008820884894732497, + "loss": 0.1138, + "step": 386 + }, + { + "epoch": 0.68, + "grad_norm": 0.42371127009391785, + "learning_rate": 0.0008814865354923613, + "loss": 0.142, + "step": 387 + }, + { + "epoch": 0.69, + "grad_norm": 0.17662374675273895, + "learning_rate": 0.0008808832554030808, + "loss": 0.1255, + "step": 388 + }, + { + "epoch": 0.69, + "grad_norm": 0.7766286134719849, + "learning_rate": 0.0008802786513025068, + "loss": 0.1613, + "step": 389 + }, + { + "epoch": 0.69, + "grad_norm": 0.49581214785575867, + "learning_rate": 0.0008796727252923403, + "loss": 0.1346, + "step": 390 + }, + { + "epoch": 0.69, + "grad_norm": 0.6148929595947266, + "learning_rate": 0.0008790654794788768, + "loss": 0.1426, + "step": 391 + }, + { + "epoch": 0.69, + "grad_norm": 0.15860037505626678, + "learning_rate": 0.0008784569159730007, + "loss": 0.1382, + "step": 392 + }, + { + "epoch": 0.7, + "grad_norm": 0.6793199777603149, + "learning_rate": 0.0008778470368901761, + "loss": 0.1398, + "step": 393 + }, + { + "epoch": 0.7, + "grad_norm": 0.40314817428588867, + "learning_rate": 0.0008772358443504404, + "loss": 0.1428, + "step": 394 + }, + { + "epoch": 0.7, + "grad_norm": 0.6403933167457581, + "learning_rate": 0.0008766233404783974, + "loss": 0.1556, + "step": 395 + }, + { + "epoch": 0.7, + "grad_norm": 0.33554157614707947, + "learning_rate": 0.0008760095274032083, + "loss": 0.1439, + "step": 396 + }, + { + "epoch": 0.7, + "grad_norm": 0.45690324902534485, + "learning_rate": 0.000875394407258586, + "loss": 0.1374, + "step": 397 + }, + { + "epoch": 0.7, + "grad_norm": 0.0541120283305645, + "learning_rate": 0.0008747779821827868, + "loss": 0.1314, + "step": 398 + }, + { + "epoch": 0.71, + "grad_norm": 0.6533159613609314, + "learning_rate": 0.0008741602543186031, + "loss": 0.169, + "step": 399 + }, + { + "epoch": 0.71, + "grad_norm": 0.4919282793998718, + "learning_rate": 0.0008735412258133561, + "loss": 0.1569, + "step": 400 + }, + { + "epoch": 0.71, + "grad_norm": 0.30325594544410706, + "learning_rate": 0.0008729208988188881, + "loss": 0.1471, + "step": 401 + }, + { + "epoch": 0.71, + "grad_norm": 0.3497300148010254, + "learning_rate": 0.0008722992754915554, + "loss": 0.1457, + "step": 402 + }, + { + "epoch": 0.71, + "grad_norm": 0.22892774641513824, + "learning_rate": 0.0008716763579922203, + "loss": 0.1334, + "step": 403 + }, + { + "epoch": 0.71, + "grad_norm": 0.20050272345542908, + "learning_rate": 0.0008710521484862439, + "loss": 0.1446, + "step": 404 + }, + { + "epoch": 0.72, + "grad_norm": 0.5029633641242981, + "learning_rate": 0.0008704266491434787, + "loss": 0.171, + "step": 405 + }, + { + "epoch": 0.72, + "grad_norm": 0.2720576226711273, + "learning_rate": 0.0008697998621382607, + "loss": 0.144, + "step": 406 + }, + { + "epoch": 0.72, + "grad_norm": 0.10961242765188217, + "learning_rate": 0.000869171789649402, + "loss": 0.1349, + "step": 407 + }, + { + "epoch": 0.72, + "grad_norm": 0.13584192097187042, + "learning_rate": 0.0008685424338601833, + "loss": 0.1385, + "step": 408 + }, + { + "epoch": 0.72, + "grad_norm": 0.6586437821388245, + "learning_rate": 0.0008679117969583464, + "loss": 0.1459, + "step": 409 + }, + { + "epoch": 0.73, + "grad_norm": 0.24006032943725586, + "learning_rate": 0.0008672798811360864, + "loss": 0.1344, + "step": 410 + }, + { + "epoch": 0.73, + "grad_norm": 0.1859387755393982, + "learning_rate": 0.0008666466885900438, + "loss": 0.1358, + "step": 411 + }, + { + "epoch": 0.73, + "grad_norm": 0.5095134973526001, + "learning_rate": 0.0008660122215212977, + "loss": 0.1387, + "step": 412 + }, + { + "epoch": 0.73, + "grad_norm": 0.1827729493379593, + "learning_rate": 0.0008653764821353573, + "loss": 0.1377, + "step": 413 + }, + { + "epoch": 0.73, + "grad_norm": 0.14332665503025055, + "learning_rate": 0.0008647394726421547, + "loss": 0.131, + "step": 414 + }, + { + "epoch": 0.73, + "grad_norm": 0.383101224899292, + "learning_rate": 0.0008641011952560371, + "loss": 0.146, + "step": 415 + }, + { + "epoch": 0.74, + "grad_norm": 0.19079791009426117, + "learning_rate": 0.000863461652195759, + "loss": 0.1255, + "step": 416 + }, + { + "epoch": 0.74, + "grad_norm": 0.49537310004234314, + "learning_rate": 0.0008628208456844747, + "loss": 0.1602, + "step": 417 + }, + { + "epoch": 0.74, + "grad_norm": 0.5658069849014282, + "learning_rate": 0.0008621787779497306, + "loss": 0.1518, + "step": 418 + }, + { + "epoch": 0.74, + "grad_norm": 0.2572256326675415, + "learning_rate": 0.0008615354512234569, + "loss": 0.1369, + "step": 419 + }, + { + "epoch": 0.74, + "grad_norm": 1.1088945865631104, + "learning_rate": 0.0008608908677419605, + "loss": 0.1773, + "step": 420 + }, + { + "epoch": 0.74, + "grad_norm": 0.35405099391937256, + "learning_rate": 0.0008602450297459173, + "loss": 0.1441, + "step": 421 + }, + { + "epoch": 0.75, + "grad_norm": 0.39150556921958923, + "learning_rate": 0.0008595979394803633, + "loss": 0.147, + "step": 422 + }, + { + "epoch": 0.75, + "grad_norm": 0.07459918409585953, + "learning_rate": 0.0008589495991946885, + "loss": 0.1338, + "step": 423 + }, + { + "epoch": 0.75, + "grad_norm": 0.2999761402606964, + "learning_rate": 0.0008583000111426276, + "loss": 0.1357, + "step": 424 + }, + { + "epoch": 0.75, + "grad_norm": 0.28417065739631653, + "learning_rate": 0.0008576491775822525, + "loss": 0.1411, + "step": 425 + }, + { + "epoch": 0.75, + "grad_norm": 0.32605019211769104, + "learning_rate": 0.0008569971007759657, + "loss": 0.1329, + "step": 426 + }, + { + "epoch": 0.75, + "eval_loss": 0.13750587403774261, + "eval_runtime": 15.1749, + "eval_samples_per_second": 31.433, + "eval_steps_per_second": 7.908, + "step": 426 + }, + { + "epoch": 0.76, + "grad_norm": 0.047430120408535004, + "learning_rate": 0.0008563437829904903, + "loss": 0.1373, + "step": 427 + }, + { + "epoch": 0.76, + "grad_norm": 0.4616542160511017, + "learning_rate": 0.0008556892264968639, + "loss": 0.1534, + "step": 428 + }, + { + "epoch": 0.76, + "grad_norm": 0.12317585945129395, + "learning_rate": 0.0008550334335704297, + "loss": 0.1338, + "step": 429 + }, + { + "epoch": 0.76, + "grad_norm": 0.39604276418685913, + "learning_rate": 0.0008543764064908295, + "loss": 0.1434, + "step": 430 + }, + { + "epoch": 0.76, + "grad_norm": 0.3490678369998932, + "learning_rate": 0.0008537181475419944, + "loss": 0.1365, + "step": 431 + }, + { + "epoch": 0.76, + "grad_norm": 0.15001270174980164, + "learning_rate": 0.0008530586590121383, + "loss": 0.1358, + "step": 432 + }, + { + "epoch": 0.77, + "grad_norm": 0.33340635895729065, + "learning_rate": 0.0008523979431937492, + "loss": 0.1367, + "step": 433 + }, + { + "epoch": 0.77, + "grad_norm": 0.06029750779271126, + "learning_rate": 0.0008517360023835809, + "loss": 0.1366, + "step": 434 + }, + { + "epoch": 0.77, + "grad_norm": 0.07978738099336624, + "learning_rate": 0.0008510728388826463, + "loss": 0.1345, + "step": 435 + }, + { + "epoch": 0.77, + "grad_norm": 0.27599036693573, + "learning_rate": 0.0008504084549962079, + "loss": 0.1447, + "step": 436 + }, + { + "epoch": 0.77, + "grad_norm": 0.13302059471607208, + "learning_rate": 0.0008497428530337706, + "loss": 0.1407, + "step": 437 + }, + { + "epoch": 0.77, + "grad_norm": 0.20869582891464233, + "learning_rate": 0.0008490760353090737, + "loss": 0.1374, + "step": 438 + }, + { + "epoch": 0.78, + "grad_norm": 0.10881117731332779, + "learning_rate": 0.0008484080041400825, + "loss": 0.1429, + "step": 439 + }, + { + "epoch": 0.78, + "grad_norm": 0.20344361662864685, + "learning_rate": 0.0008477387618489807, + "loss": 0.139, + "step": 440 + }, + { + "epoch": 0.78, + "grad_norm": 0.07153432071208954, + "learning_rate": 0.0008470683107621615, + "loss": 0.1315, + "step": 441 + }, + { + "epoch": 0.78, + "grad_norm": 0.08688751608133316, + "learning_rate": 0.0008463966532102207, + "loss": 0.1346, + "step": 442 + }, + { + "epoch": 0.78, + "grad_norm": 0.06495650112628937, + "learning_rate": 0.0008457237915279476, + "loss": 0.1307, + "step": 443 + }, + { + "epoch": 0.79, + "grad_norm": 0.1892390102148056, + "learning_rate": 0.0008450497280543173, + "loss": 0.12, + "step": 444 + }, + { + "epoch": 0.79, + "grad_norm": 0.2579623758792877, + "learning_rate": 0.0008443744651324827, + "loss": 0.1531, + "step": 445 + }, + { + "epoch": 0.79, + "grad_norm": 0.149379700422287, + "learning_rate": 0.000843698005109766, + "loss": 0.1385, + "step": 446 + }, + { + "epoch": 0.79, + "grad_norm": 0.19281132519245148, + "learning_rate": 0.0008430203503376506, + "loss": 0.1033, + "step": 447 + }, + { + "epoch": 0.79, + "grad_norm": 0.33208444714546204, + "learning_rate": 0.0008423415031717733, + "loss": 0.1525, + "step": 448 + }, + { + "epoch": 0.79, + "grad_norm": 0.15149784088134766, + "learning_rate": 0.0008416614659719157, + "loss": 0.1282, + "step": 449 + }, + { + "epoch": 0.8, + "grad_norm": 0.24646438658237457, + "learning_rate": 0.0008409802411019962, + "loss": 0.1393, + "step": 450 + }, + { + "epoch": 0.8, + "grad_norm": 0.2505553662776947, + "learning_rate": 0.000840297830930062, + "loss": 0.1453, + "step": 451 + }, + { + "epoch": 0.8, + "grad_norm": 0.1632508784532547, + "learning_rate": 0.0008396142378282799, + "loss": 0.1274, + "step": 452 + }, + { + "epoch": 0.8, + "grad_norm": 0.12370573729276657, + "learning_rate": 0.0008389294641729292, + "loss": 0.1201, + "step": 453 + }, + { + "epoch": 0.8, + "grad_norm": 0.08046772330999374, + "learning_rate": 0.0008382435123443934, + "loss": 0.1263, + "step": 454 + }, + { + "epoch": 0.8, + "grad_norm": 0.19015488028526306, + "learning_rate": 0.0008375563847271506, + "loss": 0.1318, + "step": 455 + }, + { + "epoch": 0.81, + "grad_norm": 0.3562954366207123, + "learning_rate": 0.0008368680837097669, + "loss": 0.132, + "step": 456 + }, + { + "epoch": 0.81, + "grad_norm": 0.06315189599990845, + "learning_rate": 0.000836178611684887, + "loss": 0.1113, + "step": 457 + }, + { + "epoch": 0.81, + "grad_norm": 0.43667125701904297, + "learning_rate": 0.0008354879710492264, + "loss": 0.1908, + "step": 458 + }, + { + "epoch": 0.81, + "grad_norm": 0.0708879753947258, + "learning_rate": 0.0008347961642035624, + "loss": 0.1399, + "step": 459 + }, + { + "epoch": 0.81, + "grad_norm": 0.04855835437774658, + "learning_rate": 0.0008341031935527267, + "loss": 0.1258, + "step": 460 + }, + { + "epoch": 0.82, + "grad_norm": 0.1364990919828415, + "learning_rate": 0.0008334090615055965, + "loss": 0.1344, + "step": 461 + }, + { + "epoch": 0.82, + "grad_norm": 0.08166524022817612, + "learning_rate": 0.0008327137704750862, + "loss": 0.134, + "step": 462 + }, + { + "epoch": 0.82, + "grad_norm": 0.09308458864688873, + "learning_rate": 0.0008320173228781389, + "loss": 0.1507, + "step": 463 + }, + { + "epoch": 0.82, + "grad_norm": 0.07796576619148254, + "learning_rate": 0.000831319721135718, + "loss": 0.1284, + "step": 464 + }, + { + "epoch": 0.82, + "grad_norm": 0.12168626487255096, + "learning_rate": 0.0008306209676727993, + "loss": 0.148, + "step": 465 + }, + { + "epoch": 0.82, + "grad_norm": 0.18862847983837128, + "learning_rate": 0.000829921064918362, + "loss": 0.1229, + "step": 466 + }, + { + "epoch": 0.83, + "grad_norm": 0.23615515232086182, + "learning_rate": 0.00082922001530538, + "loss": 0.1322, + "step": 467 + }, + { + "epoch": 0.83, + "grad_norm": 0.34108766913414, + "learning_rate": 0.0008285178212708142, + "loss": 0.1338, + "step": 468 + }, + { + "epoch": 0.83, + "grad_norm": 0.39579400420188904, + "learning_rate": 0.0008278144852556042, + "loss": 0.1341, + "step": 469 + }, + { + "epoch": 0.83, + "grad_norm": 0.2620592713356018, + "learning_rate": 0.0008271100097046585, + "loss": 0.1395, + "step": 470 + }, + { + "epoch": 0.83, + "grad_norm": 0.08778171986341476, + "learning_rate": 0.0008264043970668469, + "loss": 0.1328, + "step": 471 + }, + { + "epoch": 0.84, + "grad_norm": 0.6086364388465881, + "learning_rate": 0.0008256976497949924, + "loss": 0.1271, + "step": 472 + }, + { + "epoch": 0.84, + "grad_norm": 0.08982394635677338, + "learning_rate": 0.0008249897703458619, + "loss": 0.1346, + "step": 473 + }, + { + "epoch": 0.84, + "grad_norm": 0.054080091416835785, + "learning_rate": 0.0008242807611801578, + "loss": 0.1218, + "step": 474 + }, + { + "epoch": 0.84, + "grad_norm": 0.5981457829475403, + "learning_rate": 0.0008235706247625098, + "loss": 0.1715, + "step": 475 + }, + { + "epoch": 0.84, + "grad_norm": 0.9139420986175537, + "learning_rate": 0.0008228593635614659, + "loss": 0.1983, + "step": 476 + }, + { + "epoch": 0.84, + "grad_norm": 0.05938498303294182, + "learning_rate": 0.0008221469800494841, + "loss": 0.1308, + "step": 477 + }, + { + "epoch": 0.85, + "grad_norm": 0.11526026576757431, + "learning_rate": 0.0008214334767029239, + "loss": 0.1422, + "step": 478 + }, + { + "epoch": 0.85, + "grad_norm": 0.3049907386302948, + "learning_rate": 0.0008207188560020373, + "loss": 0.1419, + "step": 479 + }, + { + "epoch": 0.85, + "grad_norm": 0.04782035946846008, + "learning_rate": 0.0008200031204309604, + "loss": 0.138, + "step": 480 + }, + { + "epoch": 0.85, + "grad_norm": 0.12950918078422546, + "learning_rate": 0.000819286272477705, + "loss": 0.1315, + "step": 481 + }, + { + "epoch": 0.85, + "grad_norm": 0.0429329015314579, + "learning_rate": 0.0008185683146341496, + "loss": 0.1354, + "step": 482 + }, + { + "epoch": 0.85, + "grad_norm": 0.4792588949203491, + "learning_rate": 0.0008178492493960308, + "loss": 0.1476, + "step": 483 + }, + { + "epoch": 0.86, + "grad_norm": 0.19784927368164062, + "learning_rate": 0.0008171290792629346, + "loss": 0.1394, + "step": 484 + }, + { + "epoch": 0.86, + "grad_norm": 0.1172945499420166, + "learning_rate": 0.000816407806738288, + "loss": 0.1302, + "step": 485 + }, + { + "epoch": 0.86, + "grad_norm": 0.3732689917087555, + "learning_rate": 0.0008156854343293501, + "loss": 0.1416, + "step": 486 + }, + { + "epoch": 0.86, + "grad_norm": 0.5152392983436584, + "learning_rate": 0.0008149619645472031, + "loss": 0.1403, + "step": 487 + }, + { + "epoch": 0.86, + "grad_norm": 0.15429601073265076, + "learning_rate": 0.000814237399906744, + "loss": 0.1322, + "step": 488 + }, + { + "epoch": 0.87, + "grad_norm": 1.0002127885818481, + "learning_rate": 0.0008135117429266756, + "loss": 0.1303, + "step": 489 + }, + { + "epoch": 0.87, + "grad_norm": 0.7232715487480164, + "learning_rate": 0.0008127849961294984, + "loss": 0.143, + "step": 490 + }, + { + "epoch": 0.87, + "grad_norm": 0.13510456681251526, + "learning_rate": 0.0008120571620415006, + "loss": 0.1536, + "step": 491 + }, + { + "epoch": 0.87, + "grad_norm": 0.5168789625167847, + "learning_rate": 0.0008113282431927503, + "loss": 0.1312, + "step": 492 + }, + { + "epoch": 0.87, + "grad_norm": 0.7039850950241089, + "learning_rate": 0.000810598242117086, + "loss": 0.118, + "step": 493 + }, + { + "epoch": 0.87, + "grad_norm": 1.5126641988754272, + "learning_rate": 0.0008098671613521089, + "loss": 0.2343, + "step": 494 + }, + { + "epoch": 0.88, + "grad_norm": 0.6958308815956116, + "learning_rate": 0.0008091350034391731, + "loss": 0.1648, + "step": 495 + }, + { + "epoch": 0.88, + "grad_norm": 6.979303359985352, + "learning_rate": 0.0008084017709233766, + "loss": 0.2261, + "step": 496 + }, + { + "epoch": 0.88, + "grad_norm": 0.3389752507209778, + "learning_rate": 0.0008076674663535537, + "loss": 0.146, + "step": 497 + }, + { + "epoch": 0.88, + "grad_norm": 0.19990071654319763, + "learning_rate": 0.0008069320922822643, + "loss": 0.1429, + "step": 498 + }, + { + "epoch": 0.88, + "grad_norm": 0.33689868450164795, + "learning_rate": 0.0008061956512657871, + "loss": 0.147, + "step": 499 + }, + { + "epoch": 0.88, + "grad_norm": 0.09925112873315811, + "learning_rate": 0.000805458145864109, + "loss": 0.1342, + "step": 500 + }, + { + "epoch": 0.89, + "grad_norm": 1.961702585220337, + "learning_rate": 0.0008047195786409172, + "loss": 0.1361, + "step": 501 + }, + { + "epoch": 0.89, + "grad_norm": 0.4342229962348938, + "learning_rate": 0.0008039799521635895, + "loss": 0.1485, + "step": 502 + }, + { + "epoch": 0.89, + "grad_norm": 0.1798858642578125, + "learning_rate": 0.0008032392690031867, + "loss": 0.1314, + "step": 503 + }, + { + "epoch": 0.89, + "grad_norm": 1.3653756380081177, + "learning_rate": 0.0008024975317344421, + "loss": 0.1388, + "step": 504 + }, + { + "epoch": 0.89, + "grad_norm": 9.677605628967285, + "learning_rate": 0.0008017547429357531, + "loss": 0.4186, + "step": 505 + }, + { + "epoch": 0.9, + "grad_norm": 8.348475456237793, + "learning_rate": 0.0008010109051891731, + "loss": 0.3806, + "step": 506 + }, + { + "epoch": 0.9, + "grad_norm": 35.19770050048828, + "learning_rate": 0.0008002660210804011, + "loss": 3.6145, + "step": 507 + }, + { + "epoch": 0.9, + "grad_norm": 9.18663501739502, + "learning_rate": 0.0007995200931987743, + "loss": 0.6162, + "step": 508 + }, + { + "epoch": 0.9, + "grad_norm": 0.05997322499752045, + "learning_rate": 0.0007987731241372571, + "loss": 0.1129, + "step": 509 + }, + { + "epoch": 0.9, + "grad_norm": 0.41408172249794006, + "learning_rate": 0.000798025116492434, + "loss": 0.1512, + "step": 510 + }, + { + "epoch": 0.9, + "grad_norm": 0.4445393979549408, + "learning_rate": 0.0007972760728644996, + "loss": 0.1463, + "step": 511 + }, + { + "epoch": 0.91, + "grad_norm": 0.19678063690662384, + "learning_rate": 0.0007965259958572495, + "loss": 0.1386, + "step": 512 + }, + { + "epoch": 0.91, + "grad_norm": 0.45497119426727295, + "learning_rate": 0.0007957748880780721, + "loss": 0.1373, + "step": 513 + }, + { + "epoch": 0.91, + "grad_norm": 0.6455509066581726, + "learning_rate": 0.0007950227521379381, + "loss": 0.1584, + "step": 514 + }, + { + "epoch": 0.91, + "grad_norm": 0.3793765604496002, + "learning_rate": 0.0007942695906513929, + "loss": 0.1236, + "step": 515 + }, + { + "epoch": 0.91, + "grad_norm": 0.20562775433063507, + "learning_rate": 0.0007935154062365467, + "loss": 0.1364, + "step": 516 + }, + { + "epoch": 0.91, + "grad_norm": 1.3131325244903564, + "learning_rate": 0.0007927602015150655, + "loss": 0.1556, + "step": 517 + }, + { + "epoch": 0.92, + "grad_norm": 0.1705670803785324, + "learning_rate": 0.0007920039791121617, + "loss": 0.1372, + "step": 518 + }, + { + "epoch": 0.92, + "grad_norm": 6.6207499504089355, + "learning_rate": 0.0007912467416565861, + "loss": 0.22, + "step": 519 + }, + { + "epoch": 0.92, + "grad_norm": 0.34343230724334717, + "learning_rate": 0.0007904884917806173, + "loss": 0.1453, + "step": 520 + }, + { + "epoch": 0.92, + "grad_norm": 0.4290754497051239, + "learning_rate": 0.0007897292321200537, + "loss": 0.1177, + "step": 521 + }, + { + "epoch": 0.92, + "grad_norm": 0.24469922482967377, + "learning_rate": 0.0007889689653142036, + "loss": 0.1369, + "step": 522 + }, + { + "epoch": 0.93, + "grad_norm": 0.5307168960571289, + "learning_rate": 0.0007882076940058763, + "loss": 0.1542, + "step": 523 + }, + { + "epoch": 0.93, + "grad_norm": 0.13802866637706757, + "learning_rate": 0.000787445420841373, + "loss": 0.1372, + "step": 524 + }, + { + "epoch": 0.93, + "grad_norm": 0.36055922508239746, + "learning_rate": 0.0007866821484704776, + "loss": 0.1413, + "step": 525 + }, + { + "epoch": 0.93, + "grad_norm": 0.36655113101005554, + "learning_rate": 0.0007859178795464472, + "loss": 0.1438, + "step": 526 + }, + { + "epoch": 0.93, + "grad_norm": 0.6237390637397766, + "learning_rate": 0.0007851526167260034, + "loss": 0.1382, + "step": 527 + }, + { + "epoch": 0.93, + "grad_norm": 0.42217007279396057, + "learning_rate": 0.0007843863626693221, + "loss": 0.1408, + "step": 528 + }, + { + "epoch": 0.94, + "grad_norm": 24.023250579833984, + "learning_rate": 0.0007836191200400256, + "loss": 0.1517, + "step": 529 + }, + { + "epoch": 0.94, + "grad_norm": 0.31599146127700806, + "learning_rate": 0.0007828508915051723, + "loss": 0.1353, + "step": 530 + }, + { + "epoch": 0.94, + "grad_norm": 0.6795622706413269, + "learning_rate": 0.0007820816797352479, + "loss": 0.1515, + "step": 531 + }, + { + "epoch": 0.94, + "grad_norm": 0.37493640184402466, + "learning_rate": 0.0007813114874041557, + "loss": 0.141, + "step": 532 + }, + { + "epoch": 0.94, + "grad_norm": 1.7365546226501465, + "learning_rate": 0.0007805403171892079, + "loss": 0.1347, + "step": 533 + }, + { + "epoch": 0.94, + "grad_norm": 18.393390655517578, + "learning_rate": 0.000779768171771116, + "loss": 0.1753, + "step": 534 + }, + { + "epoch": 0.95, + "grad_norm": 2.2978413105010986, + "learning_rate": 0.0007789950538339812, + "loss": 0.1418, + "step": 535 + }, + { + "epoch": 0.95, + "grad_norm": 0.495151162147522, + "learning_rate": 0.0007782209660652854, + "loss": 0.146, + "step": 536 + }, + { + "epoch": 0.95, + "grad_norm": 7.705572605133057, + "learning_rate": 0.0007774459111558821, + "loss": 0.2042, + "step": 537 + }, + { + "epoch": 0.95, + "grad_norm": 0.6036086678504944, + "learning_rate": 0.0007766698917999862, + "loss": 0.1695, + "step": 538 + }, + { + "epoch": 0.95, + "grad_norm": 127.21215057373047, + "learning_rate": 0.0007758929106951656, + "loss": 18.5136, + "step": 539 + }, + { + "epoch": 0.96, + "grad_norm": 40.58448791503906, + "learning_rate": 0.0007751149705423312, + "loss": 0.5973, + "step": 540 + }, + { + "epoch": 0.96, + "grad_norm": 0.6296218633651733, + "learning_rate": 0.0007743360740457278, + "loss": 0.1849, + "step": 541 + }, + { + "epoch": 0.96, + "grad_norm": 0.4533160924911499, + "learning_rate": 0.0007735562239129247, + "loss": 0.1464, + "step": 542 + }, + { + "epoch": 0.96, + "grad_norm": 0.2379036247730255, + "learning_rate": 0.0007727754228548058, + "loss": 0.1267, + "step": 543 + }, + { + "epoch": 0.96, + "grad_norm": 0.8904889225959778, + "learning_rate": 0.000771993673585561, + "loss": 0.2181, + "step": 544 + }, + { + "epoch": 0.96, + "grad_norm": 0.8934443593025208, + "learning_rate": 0.0007712109788226762, + "loss": 0.2158, + "step": 545 + }, + { + "epoch": 0.97, + "grad_norm": 0.3368353545665741, + "learning_rate": 0.0007704273412869238, + "loss": 0.1489, + "step": 546 + }, + { + "epoch": 0.97, + "grad_norm": 0.2570180594921112, + "learning_rate": 0.0007696427637023537, + "loss": 0.144, + "step": 547 + }, + { + "epoch": 0.97, + "grad_norm": 2.865034580230713, + "learning_rate": 0.0007688572487962834, + "loss": 0.1664, + "step": 548 + }, + { + "epoch": 0.97, + "grad_norm": 0.4369525611400604, + "learning_rate": 0.0007680707992992888, + "loss": 0.1777, + "step": 549 + }, + { + "epoch": 0.97, + "grad_norm": 0.2545509934425354, + "learning_rate": 0.0007672834179451942, + "loss": 0.1536, + "step": 550 + }, + { + "epoch": 0.97, + "grad_norm": 0.14455465972423553, + "learning_rate": 0.0007664951074710638, + "loss": 0.1256, + "step": 551 + }, + { + "epoch": 0.98, + "grad_norm": 0.16001886129379272, + "learning_rate": 0.0007657058706171911, + "loss": 0.1356, + "step": 552 + }, + { + "epoch": 0.98, + "grad_norm": 0.2537885308265686, + "learning_rate": 0.0007649157101270903, + "loss": 0.1393, + "step": 553 + }, + { + "epoch": 0.98, + "grad_norm": 0.33060047030448914, + "learning_rate": 0.0007641246287474854, + "loss": 0.148, + "step": 554 + }, + { + "epoch": 0.98, + "grad_norm": 1.691941499710083, + "learning_rate": 0.0007633326292283028, + "loss": 0.1764, + "step": 555 + }, + { + "epoch": 0.98, + "grad_norm": 0.20472805202007294, + "learning_rate": 0.0007625397143226595, + "loss": 0.1424, + "step": 556 + }, + { + "epoch": 0.99, + "grad_norm": 0.7124485969543457, + "learning_rate": 0.0007617458867868553, + "loss": 0.1482, + "step": 557 + }, + { + "epoch": 0.99, + "grad_norm": 0.09631184488534927, + "learning_rate": 0.0007609511493803615, + "loss": 0.1392, + "step": 558 + }, + { + "epoch": 0.99, + "grad_norm": 0.20814809203147888, + "learning_rate": 0.0007601555048658133, + "loss": 0.1384, + "step": 559 + }, + { + "epoch": 0.99, + "grad_norm": 0.19566737115383148, + "learning_rate": 0.0007593589560089984, + "loss": 0.1394, + "step": 560 + }, + { + "epoch": 0.99, + "grad_norm": 0.13406091928482056, + "learning_rate": 0.0007585615055788484, + "loss": 0.1389, + "step": 561 + }, + { + "epoch": 0.99, + "grad_norm": 0.07635807991027832, + "learning_rate": 0.0007577631563474291, + "loss": 0.1376, + "step": 562 + }, + { + "epoch": 1.0, + "grad_norm": 0.11265091598033905, + "learning_rate": 0.0007569639110899302, + "loss": 0.1395, + "step": 563 + }, + { + "epoch": 1.0, + "grad_norm": 0.31152746081352234, + "learning_rate": 0.0007561637725846567, + "loss": 0.1407, + "step": 564 + }, + { + "epoch": 1.0, + "grad_norm": 0.13474373519420624, + "learning_rate": 0.0007553627436130183, + "loss": 0.1386, + "step": 565 + }, + { + "epoch": 1.0, + "grad_norm": 0.23706336319446564, + "learning_rate": 0.0007545608269595201, + "loss": 0.1417, + "step": 566 + }, + { + "epoch": 1.0, + "grad_norm": 0.30558836460113525, + "learning_rate": 0.0007537580254117531, + "loss": 0.1452, + "step": 567 + }, + { + "epoch": 1.0, + "grad_norm": 0.106146439909935, + "learning_rate": 0.0007529543417603843, + "loss": 0.1372, + "step": 568 + }, + { + "epoch": 1.0, + "eval_loss": 0.13968442380428314, + "eval_runtime": 15.1558, + "eval_samples_per_second": 31.473, + "eval_steps_per_second": 7.918, + "step": 568 + }, + { + "epoch": 1.01, + "grad_norm": 0.3243511915206909, + "learning_rate": 0.0007521497787991472, + "loss": 0.1424, + "step": 569 + }, + { + "epoch": 1.01, + "grad_norm": 0.19688986241817474, + "learning_rate": 0.0007513443393248312, + "loss": 0.1403, + "step": 570 + }, + { + "epoch": 1.01, + "grad_norm": 0.1128445565700531, + "learning_rate": 0.0007505380261372734, + "loss": 0.1397, + "step": 571 + }, + { + "epoch": 1.01, + "grad_norm": 0.11025507003068924, + "learning_rate": 0.0007497308420393477, + "loss": 0.1391, + "step": 572 + }, + { + "epoch": 1.01, + "grad_norm": 0.19862700998783112, + "learning_rate": 0.0007489227898369558, + "loss": 0.1345, + "step": 573 + }, + { + "epoch": 1.02, + "grad_norm": 0.11129032075405121, + "learning_rate": 0.0007481138723390164, + "loss": 0.1342, + "step": 574 + }, + { + "epoch": 1.02, + "grad_norm": 0.21451863646507263, + "learning_rate": 0.0007473040923574567, + "loss": 0.132, + "step": 575 + }, + { + "epoch": 1.02, + "grad_norm": 0.6781334280967712, + "learning_rate": 0.0007464934527072016, + "loss": 0.1688, + "step": 576 + }, + { + "epoch": 1.02, + "grad_norm": 0.3881673812866211, + "learning_rate": 0.0007456819562061648, + "loss": 0.15, + "step": 577 + }, + { + "epoch": 1.02, + "grad_norm": 0.0530267171561718, + "learning_rate": 0.0007448696056752383, + "loss": 0.139, + "step": 578 + }, + { + "epoch": 1.02, + "grad_norm": 0.2782767415046692, + "learning_rate": 0.0007440564039382827, + "loss": 0.1334, + "step": 579 + }, + { + "epoch": 1.03, + "grad_norm": 0.693821370601654, + "learning_rate": 0.0007432423538221178, + "loss": 0.1673, + "step": 580 + }, + { + "epoch": 1.03, + "grad_norm": 0.27020275592803955, + "learning_rate": 0.0007424274581565122, + "loss": 0.1464, + "step": 581 + }, + { + "epoch": 1.03, + "grad_norm": 0.42129820585250854, + "learning_rate": 0.0007416117197741742, + "loss": 0.1507, + "step": 582 + }, + { + "epoch": 1.03, + "grad_norm": 0.21161474287509918, + "learning_rate": 0.0007407951415107412, + "loss": 0.1398, + "step": 583 + }, + { + "epoch": 1.03, + "grad_norm": 0.13954728841781616, + "learning_rate": 0.00073997772620477, + "loss": 0.1387, + "step": 584 + }, + { + "epoch": 1.03, + "grad_norm": 0.08771730959415436, + "learning_rate": 0.0007391594766977276, + "loss": 0.1419, + "step": 585 + }, + { + "epoch": 1.04, + "grad_norm": 0.2630119025707245, + "learning_rate": 0.0007383403958339806, + "loss": 0.1483, + "step": 586 + }, + { + "epoch": 1.04, + "grad_norm": 0.07496945559978485, + "learning_rate": 0.0007375204864607851, + "loss": 0.1339, + "step": 587 + }, + { + "epoch": 1.04, + "grad_norm": 0.25115033984184265, + "learning_rate": 0.0007366997514282782, + "loss": 0.129, + "step": 588 + }, + { + "epoch": 1.04, + "grad_norm": 0.24599210917949677, + "learning_rate": 0.0007358781935894659, + "loss": 0.1491, + "step": 589 + }, + { + "epoch": 1.04, + "grad_norm": 0.14762777090072632, + "learning_rate": 0.0007350558158002153, + "loss": 0.1287, + "step": 590 + }, + { + "epoch": 1.05, + "grad_norm": 0.03114377148449421, + "learning_rate": 0.0007342326209192435, + "loss": 0.1379, + "step": 591 + }, + { + "epoch": 1.05, + "grad_norm": 0.5076407194137573, + "learning_rate": 0.000733408611808108, + "loss": 0.1122, + "step": 592 + }, + { + "epoch": 1.05, + "grad_norm": 0.10492309182882309, + "learning_rate": 0.0007325837913311966, + "loss": 0.1284, + "step": 593 + }, + { + "epoch": 1.05, + "grad_norm": 0.1740669459104538, + "learning_rate": 0.0007317581623557177, + "loss": 0.1458, + "step": 594 + }, + { + "epoch": 1.05, + "grad_norm": 0.20419681072235107, + "learning_rate": 0.00073093172775169, + "loss": 0.1511, + "step": 595 + }, + { + "epoch": 1.05, + "grad_norm": 0.1906755119562149, + "learning_rate": 0.0007301044903919325, + "loss": 0.1052, + "step": 596 + }, + { + "epoch": 1.06, + "grad_norm": 0.10201478004455566, + "learning_rate": 0.0007292764531520552, + "loss": 0.1147, + "step": 597 + }, + { + "epoch": 1.06, + "grad_norm": 0.4594266712665558, + "learning_rate": 0.0007284476189104485, + "loss": 0.1739, + "step": 598 + }, + { + "epoch": 1.06, + "grad_norm": 0.1697234809398651, + "learning_rate": 0.0007276179905482729, + "loss": 0.1049, + "step": 599 + }, + { + "epoch": 1.06, + "grad_norm": 0.09107261896133423, + "learning_rate": 0.0007267875709494499, + "loss": 0.1319, + "step": 600 + }, + { + "epoch": 1.06, + "grad_norm": 0.07888934016227722, + "learning_rate": 0.0007259563630006512, + "loss": 0.1323, + "step": 601 + }, + { + "epoch": 1.07, + "grad_norm": 0.06149132549762726, + "learning_rate": 0.0007251243695912886, + "loss": 0.1239, + "step": 602 + }, + { + "epoch": 1.07, + "grad_norm": 0.3240460157394409, + "learning_rate": 0.0007242915936135052, + "loss": 0.1614, + "step": 603 + }, + { + "epoch": 1.07, + "grad_norm": 0.04239710047841072, + "learning_rate": 0.0007234580379621635, + "loss": 0.1336, + "step": 604 + }, + { + "epoch": 1.07, + "grad_norm": 0.04415787383913994, + "learning_rate": 0.000722623705534837, + "loss": 0.1336, + "step": 605 + }, + { + "epoch": 1.07, + "grad_norm": 0.13500471413135529, + "learning_rate": 0.0007217885992317985, + "loss": 0.1429, + "step": 606 + }, + { + "epoch": 1.07, + "grad_norm": 0.09405327588319778, + "learning_rate": 0.0007209527219560119, + "loss": 0.1399, + "step": 607 + }, + { + "epoch": 1.08, + "grad_norm": 0.16369308531284332, + "learning_rate": 0.0007201160766131207, + "loss": 0.1389, + "step": 608 + }, + { + "epoch": 1.08, + "grad_norm": 0.24509336054325104, + "learning_rate": 0.0007192786661114383, + "loss": 0.1376, + "step": 609 + }, + { + "epoch": 1.08, + "grad_norm": 0.29961100220680237, + "learning_rate": 0.0007184404933619377, + "loss": 0.133, + "step": 610 + }, + { + "epoch": 1.08, + "grad_norm": 0.4641360640525818, + "learning_rate": 0.0007176015612782421, + "loss": 0.1491, + "step": 611 + }, + { + "epoch": 1.08, + "grad_norm": 0.059663355350494385, + "learning_rate": 0.0007167618727766138, + "loss": 0.1365, + "step": 612 + }, + { + "epoch": 1.08, + "grad_norm": 0.16221192479133606, + "learning_rate": 0.0007159214307759448, + "loss": 0.1395, + "step": 613 + }, + { + "epoch": 1.09, + "grad_norm": 0.04930780455470085, + "learning_rate": 0.0007150802381977463, + "loss": 0.1368, + "step": 614 + }, + { + "epoch": 1.09, + "grad_norm": 0.6152715086936951, + "learning_rate": 0.0007142382979661386, + "loss": 0.1291, + "step": 615 + }, + { + "epoch": 1.09, + "grad_norm": 0.15247471630573273, + "learning_rate": 0.0007133956130078411, + "loss": 0.1404, + "step": 616 + }, + { + "epoch": 1.09, + "grad_norm": 0.7167736887931824, + "learning_rate": 0.000712552186252162, + "loss": 0.1642, + "step": 617 + }, + { + "epoch": 1.09, + "grad_norm": 0.2419363558292389, + "learning_rate": 0.0007117080206309878, + "loss": 0.1317, + "step": 618 + }, + { + "epoch": 1.1, + "grad_norm": 0.5636677742004395, + "learning_rate": 0.0007108631190787735, + "loss": 0.147, + "step": 619 + }, + { + "epoch": 1.1, + "grad_norm": 0.26012521982192993, + "learning_rate": 0.0007100174845325327, + "loss": 0.1344, + "step": 620 + }, + { + "epoch": 1.1, + "grad_norm": 0.14845141768455505, + "learning_rate": 0.0007091711199318265, + "loss": 0.1299, + "step": 621 + }, + { + "epoch": 1.1, + "grad_norm": 0.1344316601753235, + "learning_rate": 0.0007083240282187542, + "loss": 0.1401, + "step": 622 + }, + { + "epoch": 1.1, + "grad_norm": 0.08974921703338623, + "learning_rate": 0.0007074762123379423, + "loss": 0.1257, + "step": 623 + }, + { + "epoch": 1.1, + "grad_norm": 0.3263636529445648, + "learning_rate": 0.0007066276752365351, + "loss": 0.1587, + "step": 624 + }, + { + "epoch": 1.11, + "grad_norm": 0.12418147176504135, + "learning_rate": 0.0007057784198641835, + "loss": 0.1361, + "step": 625 + }, + { + "epoch": 1.11, + "grad_norm": 0.3086402714252472, + "learning_rate": 0.0007049284491730353, + "loss": 0.1496, + "step": 626 + }, + { + "epoch": 1.11, + "grad_norm": 0.3529713749885559, + "learning_rate": 0.000704077766117725, + "loss": 0.1523, + "step": 627 + }, + { + "epoch": 1.11, + "grad_norm": 0.11717434972524643, + "learning_rate": 0.0007032263736553634, + "loss": 0.1402, + "step": 628 + }, + { + "epoch": 1.11, + "grad_norm": 0.19153334200382233, + "learning_rate": 0.0007023742747455275, + "loss": 0.1407, + "step": 629 + }, + { + "epoch": 1.11, + "grad_norm": 0.22798961400985718, + "learning_rate": 0.0007015214723502495, + "loss": 0.1262, + "step": 630 + }, + { + "epoch": 1.12, + "grad_norm": 0.4415830373764038, + "learning_rate": 0.0007006679694340073, + "loss": 0.1494, + "step": 631 + }, + { + "epoch": 1.12, + "grad_norm": 0.554614782333374, + "learning_rate": 0.0006998137689637142, + "loss": 0.1324, + "step": 632 + }, + { + "epoch": 1.12, + "grad_norm": 0.32604262232780457, + "learning_rate": 0.0006989588739087078, + "loss": 0.1433, + "step": 633 + }, + { + "epoch": 1.12, + "grad_norm": 0.5145484209060669, + "learning_rate": 0.0006981032872407406, + "loss": 0.152, + "step": 634 + }, + { + "epoch": 1.12, + "grad_norm": 0.3538295030593872, + "learning_rate": 0.0006972470119339691, + "loss": 0.137, + "step": 635 + }, + { + "epoch": 1.13, + "grad_norm": 0.2451559156179428, + "learning_rate": 0.0006963900509649435, + "loss": 0.1418, + "step": 636 + }, + { + "epoch": 1.13, + "grad_norm": 0.2240092009305954, + "learning_rate": 0.0006955324073125978, + "loss": 0.1406, + "step": 637 + }, + { + "epoch": 1.13, + "grad_norm": 0.5672935843467712, + "learning_rate": 0.0006946740839582387, + "loss": 0.1682, + "step": 638 + }, + { + "epoch": 1.13, + "grad_norm": 0.3396548628807068, + "learning_rate": 0.000693815083885536, + "loss": 0.1538, + "step": 639 + }, + { + "epoch": 1.13, + "grad_norm": 0.2464788407087326, + "learning_rate": 0.0006929554100805117, + "loss": 0.145, + "step": 640 + }, + { + "epoch": 1.13, + "grad_norm": 0.08380208164453506, + "learning_rate": 0.0006920950655315297, + "loss": 0.1333, + "step": 641 + }, + { + "epoch": 1.14, + "grad_norm": 0.04563472419977188, + "learning_rate": 0.000691234053229286, + "loss": 0.1371, + "step": 642 + }, + { + "epoch": 1.14, + "grad_norm": 0.0336502380669117, + "learning_rate": 0.0006903723761667972, + "loss": 0.1383, + "step": 643 + }, + { + "epoch": 1.14, + "grad_norm": 0.11504160612821579, + "learning_rate": 0.0006895100373393912, + "loss": 0.1366, + "step": 644 + }, + { + "epoch": 1.14, + "grad_norm": 0.4302406311035156, + "learning_rate": 0.0006886470397446957, + "loss": 0.1464, + "step": 645 + }, + { + "epoch": 1.14, + "grad_norm": 0.13670873641967773, + "learning_rate": 0.0006877833863826295, + "loss": 0.1399, + "step": 646 + }, + { + "epoch": 1.14, + "grad_norm": 0.11441440135240555, + "learning_rate": 0.0006869190802553894, + "loss": 0.1389, + "step": 647 + }, + { + "epoch": 1.15, + "grad_norm": 0.07245034724473953, + "learning_rate": 0.0006860541243674426, + "loss": 0.1376, + "step": 648 + }, + { + "epoch": 1.15, + "grad_norm": 0.12628068029880524, + "learning_rate": 0.0006851885217255144, + "loss": 0.1314, + "step": 649 + }, + { + "epoch": 1.15, + "grad_norm": 0.345865935087204, + "learning_rate": 0.0006843222753385784, + "loss": 0.1469, + "step": 650 + }, + { + "epoch": 1.15, + "grad_norm": 0.18721798062324524, + "learning_rate": 0.0006834553882178463, + "loss": 0.129, + "step": 651 + }, + { + "epoch": 1.15, + "grad_norm": 0.1566080003976822, + "learning_rate": 0.0006825878633767564, + "loss": 0.1296, + "step": 652 + }, + { + "epoch": 1.16, + "grad_norm": 0.13990430533885956, + "learning_rate": 0.0006817197038309643, + "loss": 0.1245, + "step": 653 + }, + { + "epoch": 1.16, + "grad_norm": 0.26073744893074036, + "learning_rate": 0.000680850912598332, + "loss": 0.1437, + "step": 654 + }, + { + "epoch": 1.16, + "grad_norm": 0.05034814029932022, + "learning_rate": 0.0006799814926989171, + "loss": 0.1209, + "step": 655 + }, + { + "epoch": 1.16, + "grad_norm": 0.29498428106307983, + "learning_rate": 0.0006791114471549626, + "loss": 0.1476, + "step": 656 + }, + { + "epoch": 1.16, + "grad_norm": 0.24109311401844025, + "learning_rate": 0.0006782407789908863, + "loss": 0.1421, + "step": 657 + }, + { + "epoch": 1.16, + "grad_norm": 0.2070060819387436, + "learning_rate": 0.0006773694912332707, + "loss": 0.1174, + "step": 658 + }, + { + "epoch": 1.17, + "grad_norm": 0.05099210515618324, + "learning_rate": 0.0006764975869108514, + "loss": 0.1325, + "step": 659 + }, + { + "epoch": 1.17, + "grad_norm": 0.03778371214866638, + "learning_rate": 0.0006756250690545078, + "loss": 0.1326, + "step": 660 + }, + { + "epoch": 1.17, + "grad_norm": 0.23074184358119965, + "learning_rate": 0.0006747519406972524, + "loss": 0.1417, + "step": 661 + }, + { + "epoch": 1.17, + "grad_norm": 0.162948340177536, + "learning_rate": 0.0006738782048742187, + "loss": 0.1422, + "step": 662 + }, + { + "epoch": 1.17, + "grad_norm": 0.1257455050945282, + "learning_rate": 0.0006730038646226531, + "loss": 0.1352, + "step": 663 + }, + { + "epoch": 1.17, + "grad_norm": 0.1732119917869568, + "learning_rate": 0.0006721289229819024, + "loss": 0.1313, + "step": 664 + }, + { + "epoch": 1.18, + "grad_norm": 0.15348908305168152, + "learning_rate": 0.0006712533829934043, + "loss": 0.139, + "step": 665 + }, + { + "epoch": 1.18, + "grad_norm": 0.06923094391822815, + "learning_rate": 0.0006703772477006757, + "loss": 0.1381, + "step": 666 + }, + { + "epoch": 1.18, + "grad_norm": 0.307449609041214, + "learning_rate": 0.0006695005201493037, + "loss": 0.1365, + "step": 667 + }, + { + "epoch": 1.18, + "grad_norm": 0.09788268059492111, + "learning_rate": 0.0006686232033869343, + "loss": 0.1358, + "step": 668 + }, + { + "epoch": 1.18, + "grad_norm": 0.23847998678684235, + "learning_rate": 0.0006677453004632608, + "loss": 0.1399, + "step": 669 + }, + { + "epoch": 1.19, + "grad_norm": 0.08175510168075562, + "learning_rate": 0.0006668668144300149, + "loss": 0.1372, + "step": 670 + }, + { + "epoch": 1.19, + "grad_norm": 0.18189309537410736, + "learning_rate": 0.0006659877483409545, + "loss": 0.1401, + "step": 671 + }, + { + "epoch": 1.19, + "grad_norm": 0.08665986359119415, + "learning_rate": 0.000665108105251855, + "loss": 0.1345, + "step": 672 + }, + { + "epoch": 1.19, + "grad_norm": 0.40454381704330444, + "learning_rate": 0.0006642278882204963, + "loss": 0.1342, + "step": 673 + }, + { + "epoch": 1.19, + "grad_norm": 0.27606263756752014, + "learning_rate": 0.0006633471003066543, + "loss": 0.1363, + "step": 674 + }, + { + "epoch": 1.19, + "grad_norm": 0.06796804070472717, + "learning_rate": 0.000662465744572089, + "loss": 0.1353, + "step": 675 + }, + { + "epoch": 1.2, + "grad_norm": 0.4458450376987457, + "learning_rate": 0.0006615838240805343, + "loss": 0.1521, + "step": 676 + }, + { + "epoch": 1.2, + "grad_norm": 0.3369523286819458, + "learning_rate": 0.0006607013418976873, + "loss": 0.1489, + "step": 677 + }, + { + "epoch": 1.2, + "grad_norm": 0.20170435309410095, + "learning_rate": 0.0006598183010911978, + "loss": 0.1263, + "step": 678 + }, + { + "epoch": 1.2, + "grad_norm": 0.11186213046312332, + "learning_rate": 0.0006589347047306571, + "loss": 0.1344, + "step": 679 + }, + { + "epoch": 1.2, + "grad_norm": 0.12327159941196442, + "learning_rate": 0.0006580505558875878, + "loss": 0.1354, + "step": 680 + }, + { + "epoch": 1.2, + "grad_norm": 0.05389246717095375, + "learning_rate": 0.0006571658576354334, + "loss": 0.1333, + "step": 681 + }, + { + "epoch": 1.21, + "grad_norm": 0.20890717208385468, + "learning_rate": 0.0006562806130495466, + "loss": 0.1428, + "step": 682 + }, + { + "epoch": 1.21, + "grad_norm": 0.12948615849018097, + "learning_rate": 0.0006553948252071799, + "loss": 0.1372, + "step": 683 + }, + { + "epoch": 1.21, + "grad_norm": 0.16449519991874695, + "learning_rate": 0.0006545084971874737, + "loss": 0.1418, + "step": 684 + }, + { + "epoch": 1.21, + "grad_norm": 0.04887047037482262, + "learning_rate": 0.0006536216320714466, + "loss": 0.139, + "step": 685 + }, + { + "epoch": 1.21, + "grad_norm": 0.1712215691804886, + "learning_rate": 0.0006527342329419836, + "loss": 0.1389, + "step": 686 + }, + { + "epoch": 1.22, + "grad_norm": 0.14935021102428436, + "learning_rate": 0.000651846302883827, + "loss": 0.1369, + "step": 687 + }, + { + "epoch": 1.22, + "grad_norm": 0.16822853684425354, + "learning_rate": 0.0006509578449835636, + "loss": 0.1393, + "step": 688 + }, + { + "epoch": 1.22, + "grad_norm": 0.04274258390069008, + "learning_rate": 0.0006500688623296158, + "loss": 0.1339, + "step": 689 + }, + { + "epoch": 1.22, + "grad_norm": 0.20485758781433105, + "learning_rate": 0.00064917935801223, + "loss": 0.1232, + "step": 690 + }, + { + "epoch": 1.22, + "grad_norm": 0.16438162326812744, + "learning_rate": 0.0006482893351234658, + "loss": 0.1272, + "step": 691 + }, + { + "epoch": 1.22, + "grad_norm": 0.0820753276348114, + "learning_rate": 0.0006473987967571855, + "loss": 0.1368, + "step": 692 + }, + { + "epoch": 1.23, + "grad_norm": 0.5247365832328796, + "learning_rate": 0.000646507746009043, + "loss": 0.1702, + "step": 693 + }, + { + "epoch": 1.23, + "grad_norm": 0.21259160339832306, + "learning_rate": 0.0006456161859764745, + "loss": 0.1384, + "step": 694 + }, + { + "epoch": 1.23, + "grad_norm": 0.10756111145019531, + "learning_rate": 0.0006447241197586847, + "loss": 0.1316, + "step": 695 + }, + { + "epoch": 1.23, + "grad_norm": 0.32431429624557495, + "learning_rate": 0.0006438315504566397, + "loss": 0.1505, + "step": 696 + }, + { + "epoch": 1.23, + "grad_norm": 0.09354525059461594, + "learning_rate": 0.0006429384811730528, + "loss": 0.1338, + "step": 697 + }, + { + "epoch": 1.23, + "grad_norm": 0.25492650270462036, + "learning_rate": 0.0006420449150123767, + "loss": 0.1391, + "step": 698 + }, + { + "epoch": 1.24, + "grad_norm": 0.28658854961395264, + "learning_rate": 0.0006411508550807905, + "loss": 0.1336, + "step": 699 + }, + { + "epoch": 1.24, + "grad_norm": 0.21230942010879517, + "learning_rate": 0.0006402563044861899, + "loss": 0.1369, + "step": 700 + }, + { + "epoch": 1.24, + "grad_norm": 0.13693292438983917, + "learning_rate": 0.0006393612663381763, + "loss": 0.1347, + "step": 701 + }, + { + "epoch": 1.24, + "grad_norm": 0.20328965783119202, + "learning_rate": 0.0006384657437480457, + "loss": 0.1349, + "step": 702 + }, + { + "epoch": 1.24, + "grad_norm": 0.1463640034198761, + "learning_rate": 0.0006375697398287788, + "loss": 0.1316, + "step": 703 + }, + { + "epoch": 1.25, + "grad_norm": 0.47083455324172974, + "learning_rate": 0.0006366732576950283, + "loss": 0.1538, + "step": 704 + }, + { + "epoch": 1.25, + "grad_norm": 0.18148604035377502, + "learning_rate": 0.0006357763004631105, + "loss": 0.1264, + "step": 705 + }, + { + "epoch": 1.25, + "grad_norm": 0.10440527647733688, + "learning_rate": 0.000634878871250992, + "loss": 0.1209, + "step": 706 + }, + { + "epoch": 1.25, + "grad_norm": 0.42732179164886475, + "learning_rate": 0.000633980973178281, + "loss": 0.1581, + "step": 707 + }, + { + "epoch": 1.25, + "grad_norm": 0.09864400327205658, + "learning_rate": 0.0006330826093662157, + "loss": 0.1398, + "step": 708 + }, + { + "epoch": 1.25, + "grad_norm": 0.2839510142803192, + "learning_rate": 0.000632183782937652, + "loss": 0.1448, + "step": 709 + }, + { + "epoch": 1.26, + "grad_norm": 0.18296539783477783, + "learning_rate": 0.0006312844970170551, + "loss": 0.1369, + "step": 710 + }, + { + "epoch": 1.26, + "eval_loss": 0.13823845982551575, + "eval_runtime": 15.0864, + "eval_samples_per_second": 31.618, + "eval_steps_per_second": 7.954, + "step": 710 + }, + { + "epoch": 1.26, + "grad_norm": 0.2305176705121994, + "learning_rate": 0.0006303847547304872, + "loss": 0.1253, + "step": 711 + }, + { + "epoch": 1.26, + "grad_norm": 0.2792215049266815, + "learning_rate": 0.0006294845592055967, + "loss": 0.1292, + "step": 712 + }, + { + "epoch": 1.26, + "grad_norm": 0.3560260236263275, + "learning_rate": 0.0006285839135716078, + "loss": 0.142, + "step": 713 + }, + { + "epoch": 1.26, + "grad_norm": 0.5769198536872864, + "learning_rate": 0.000627682820959309, + "loss": 0.1629, + "step": 714 + }, + { + "epoch": 1.26, + "grad_norm": 0.24223147332668304, + "learning_rate": 0.000626781284501043, + "loss": 0.1436, + "step": 715 + }, + { + "epoch": 1.27, + "grad_norm": 0.20025451481342316, + "learning_rate": 0.0006258793073306948, + "loss": 0.1271, + "step": 716 + }, + { + "epoch": 1.27, + "grad_norm": 0.5235323905944824, + "learning_rate": 0.0006249768925836822, + "loss": 0.1362, + "step": 717 + }, + { + "epoch": 1.27, + "grad_norm": 0.12457533925771713, + "learning_rate": 0.0006240740433969432, + "loss": 0.1267, + "step": 718 + }, + { + "epoch": 1.27, + "grad_norm": 0.22851230204105377, + "learning_rate": 0.0006231707629089263, + "loss": 0.1368, + "step": 719 + }, + { + "epoch": 1.27, + "grad_norm": 0.12162783741950989, + "learning_rate": 0.0006222670542595799, + "loss": 0.1348, + "step": 720 + }, + { + "epoch": 1.28, + "grad_norm": 0.21811127662658691, + "learning_rate": 0.0006213629205903399, + "loss": 0.1302, + "step": 721 + }, + { + "epoch": 1.28, + "grad_norm": 0.1042797714471817, + "learning_rate": 0.0006204583650441201, + "loss": 0.1227, + "step": 722 + }, + { + "epoch": 1.28, + "grad_norm": 0.5917842388153076, + "learning_rate": 0.0006195533907653003, + "loss": 0.1218, + "step": 723 + }, + { + "epoch": 1.28, + "grad_norm": 0.6369093656539917, + "learning_rate": 0.000618648000899717, + "loss": 0.1309, + "step": 724 + }, + { + "epoch": 1.28, + "grad_norm": 0.298677921295166, + "learning_rate": 0.0006177421985946498, + "loss": 0.1329, + "step": 725 + }, + { + "epoch": 1.28, + "grad_norm": 0.5087531208992004, + "learning_rate": 0.0006168359869988133, + "loss": 0.1619, + "step": 726 + }, + { + "epoch": 1.29, + "grad_norm": 0.5805624723434448, + "learning_rate": 0.0006159293692623443, + "loss": 0.1388, + "step": 727 + }, + { + "epoch": 1.29, + "grad_norm": 0.595432698726654, + "learning_rate": 0.0006150223485367914, + "loss": 0.1363, + "step": 728 + }, + { + "epoch": 1.29, + "grad_norm": 2.0664656162261963, + "learning_rate": 0.0006141149279751042, + "loss": 0.1373, + "step": 729 + }, + { + "epoch": 1.29, + "grad_norm": 1.3190929889678955, + "learning_rate": 0.0006132071107316221, + "loss": 0.1434, + "step": 730 + }, + { + "epoch": 1.29, + "grad_norm": 0.19045250117778778, + "learning_rate": 0.0006122988999620634, + "loss": 0.1177, + "step": 731 + }, + { + "epoch": 1.3, + "grad_norm": 1.3130842447280884, + "learning_rate": 0.0006113902988235145, + "loss": 0.1542, + "step": 732 + }, + { + "epoch": 1.3, + "grad_norm": 0.5767085552215576, + "learning_rate": 0.0006104813104744187, + "loss": 0.1627, + "step": 733 + }, + { + "epoch": 1.3, + "grad_norm": 0.45873621106147766, + "learning_rate": 0.0006095719380745653, + "loss": 0.1369, + "step": 734 + }, + { + "epoch": 1.3, + "grad_norm": 0.4458267092704773, + "learning_rate": 0.0006086621847850788, + "loss": 0.1207, + "step": 735 + }, + { + "epoch": 1.3, + "grad_norm": 0.13178426027297974, + "learning_rate": 0.0006077520537684072, + "loss": 0.1263, + "step": 736 + }, + { + "epoch": 1.3, + "grad_norm": 0.19360630214214325, + "learning_rate": 0.0006068415481883121, + "loss": 0.1366, + "step": 737 + }, + { + "epoch": 1.31, + "grad_norm": 0.12965673208236694, + "learning_rate": 0.0006059306712098571, + "loss": 0.1436, + "step": 738 + }, + { + "epoch": 1.31, + "grad_norm": 0.13222691416740417, + "learning_rate": 0.0006050194259993966, + "loss": 0.1294, + "step": 739 + }, + { + "epoch": 1.31, + "grad_norm": 0.14453792572021484, + "learning_rate": 0.0006041078157245648, + "loss": 0.1273, + "step": 740 + }, + { + "epoch": 1.31, + "grad_norm": 0.27612432837486267, + "learning_rate": 0.0006031958435542659, + "loss": 0.1145, + "step": 741 + }, + { + "epoch": 1.31, + "grad_norm": 0.6110266447067261, + "learning_rate": 0.0006022835126586609, + "loss": 0.1299, + "step": 742 + }, + { + "epoch": 1.31, + "grad_norm": 0.29062649607658386, + "learning_rate": 0.0006013708262091586, + "loss": 0.132, + "step": 743 + }, + { + "epoch": 1.32, + "grad_norm": 0.44451919198036194, + "learning_rate": 0.0006004577873784034, + "loss": 0.1235, + "step": 744 + }, + { + "epoch": 1.32, + "grad_norm": 0.15329335629940033, + "learning_rate": 0.0005995443993402648, + "loss": 0.1462, + "step": 745 + }, + { + "epoch": 1.32, + "grad_norm": 0.7718572616577148, + "learning_rate": 0.000598630665269826, + "loss": 0.1309, + "step": 746 + }, + { + "epoch": 1.32, + "grad_norm": 0.427112340927124, + "learning_rate": 0.0005977165883433733, + "loss": 0.1565, + "step": 747 + }, + { + "epoch": 1.32, + "grad_norm": 0.18447764217853546, + "learning_rate": 0.0005968021717383849, + "loss": 0.1431, + "step": 748 + }, + { + "epoch": 1.33, + "grad_norm": 0.2808150053024292, + "learning_rate": 0.0005958874186335193, + "loss": 0.1429, + "step": 749 + }, + { + "epoch": 1.33, + "grad_norm": 0.29161331057548523, + "learning_rate": 0.0005949723322086053, + "loss": 0.1238, + "step": 750 + }, + { + "epoch": 1.33, + "grad_norm": 0.3760308027267456, + "learning_rate": 0.0005940569156446298, + "loss": 0.1416, + "step": 751 + }, + { + "epoch": 1.33, + "grad_norm": 0.24679379165172577, + "learning_rate": 0.0005931411721237279, + "loss": 0.1366, + "step": 752 + }, + { + "epoch": 1.33, + "grad_norm": 0.8587498664855957, + "learning_rate": 0.0005922251048291707, + "loss": 0.1525, + "step": 753 + }, + { + "epoch": 1.33, + "grad_norm": 0.17934030294418335, + "learning_rate": 0.0005913087169453553, + "loss": 0.1287, + "step": 754 + }, + { + "epoch": 1.34, + "grad_norm": 0.43589112162590027, + "learning_rate": 0.0005903920116577931, + "loss": 0.1472, + "step": 755 + }, + { + "epoch": 1.34, + "grad_norm": 0.061591099947690964, + "learning_rate": 0.0005894749921530983, + "loss": 0.129, + "step": 756 + }, + { + "epoch": 1.34, + "grad_norm": 2.9088215827941895, + "learning_rate": 0.0005885576616189781, + "loss": 0.1379, + "step": 757 + }, + { + "epoch": 1.34, + "grad_norm": 0.5655280351638794, + "learning_rate": 0.0005876400232442205, + "loss": 0.138, + "step": 758 + }, + { + "epoch": 1.34, + "grad_norm": 0.05325045809149742, + "learning_rate": 0.0005867220802186837, + "loss": 0.1269, + "step": 759 + }, + { + "epoch": 1.34, + "grad_norm": 0.08003886044025421, + "learning_rate": 0.000585803835733285, + "loss": 0.1386, + "step": 760 + }, + { + "epoch": 1.35, + "grad_norm": 0.09640295058488846, + "learning_rate": 0.0005848852929799894, + "loss": 0.1364, + "step": 761 + }, + { + "epoch": 1.35, + "grad_norm": 0.5506378412246704, + "learning_rate": 0.0005839664551517988, + "loss": 0.1424, + "step": 762 + }, + { + "epoch": 1.35, + "grad_norm": 0.06252831220626831, + "learning_rate": 0.000583047325442741, + "loss": 0.1384, + "step": 763 + }, + { + "epoch": 1.35, + "grad_norm": 0.11976012587547302, + "learning_rate": 0.0005821279070478583, + "loss": 0.1422, + "step": 764 + }, + { + "epoch": 1.35, + "grad_norm": 0.13022871315479279, + "learning_rate": 0.0005812082031631966, + "loss": 0.1357, + "step": 765 + }, + { + "epoch": 1.36, + "grad_norm": 0.12146733701229095, + "learning_rate": 0.0005802882169857938, + "loss": 0.14, + "step": 766 + }, + { + "epoch": 1.36, + "grad_norm": 0.17471055686473846, + "learning_rate": 0.00057936795171367, + "loss": 0.1432, + "step": 767 + }, + { + "epoch": 1.36, + "grad_norm": 0.07833831012248993, + "learning_rate": 0.0005784474105458143, + "loss": 0.1425, + "step": 768 + }, + { + "epoch": 1.36, + "grad_norm": 0.17084555327892303, + "learning_rate": 0.000577526596682176, + "loss": 0.1437, + "step": 769 + }, + { + "epoch": 1.36, + "grad_norm": 0.112625353038311, + "learning_rate": 0.0005766055133236513, + "loss": 0.1429, + "step": 770 + }, + { + "epoch": 1.36, + "grad_norm": 0.15991152822971344, + "learning_rate": 0.000575684163672074, + "loss": 0.1387, + "step": 771 + }, + { + "epoch": 1.37, + "grad_norm": 0.18027663230895996, + "learning_rate": 0.0005747625509302033, + "loss": 0.1439, + "step": 772 + }, + { + "epoch": 1.37, + "grad_norm": 0.4315040111541748, + "learning_rate": 0.0005738406783017127, + "loss": 0.1524, + "step": 773 + }, + { + "epoch": 1.37, + "grad_norm": 0.3343091905117035, + "learning_rate": 0.0005729185489911797, + "loss": 0.1481, + "step": 774 + }, + { + "epoch": 1.37, + "grad_norm": 0.16762638092041016, + "learning_rate": 0.0005719961662040733, + "loss": 0.1389, + "step": 775 + }, + { + "epoch": 1.37, + "grad_norm": 0.11396286636590958, + "learning_rate": 0.0005710735331467444, + "loss": 0.1351, + "step": 776 + }, + { + "epoch": 1.37, + "grad_norm": 0.11262958496809006, + "learning_rate": 0.0005701506530264132, + "loss": 0.1343, + "step": 777 + }, + { + "epoch": 1.38, + "grad_norm": 0.06293229013681412, + "learning_rate": 0.0005692275290511592, + "loss": 0.1322, + "step": 778 + }, + { + "epoch": 1.38, + "grad_norm": 0.037539321929216385, + "learning_rate": 0.0005683041644299093, + "loss": 0.13, + "step": 779 + }, + { + "epoch": 1.38, + "grad_norm": 0.2424931526184082, + "learning_rate": 0.0005673805623724272, + "loss": 0.1333, + "step": 780 + }, + { + "epoch": 1.38, + "grad_norm": 0.2825300991535187, + "learning_rate": 0.0005664567260893019, + "loss": 0.15, + "step": 781 + }, + { + "epoch": 1.38, + "grad_norm": 0.12494263052940369, + "learning_rate": 0.000565532658791936, + "loss": 0.1317, + "step": 782 + }, + { + "epoch": 1.39, + "grad_norm": 0.29031720757484436, + "learning_rate": 0.0005646083636925362, + "loss": 0.1593, + "step": 783 + }, + { + "epoch": 1.39, + "grad_norm": 0.3038758933544159, + "learning_rate": 0.0005636838440041004, + "loss": 0.1551, + "step": 784 + }, + { + "epoch": 1.39, + "grad_norm": 0.14983759820461273, + "learning_rate": 0.0005627591029404071, + "loss": 0.1402, + "step": 785 + }, + { + "epoch": 1.39, + "grad_norm": 0.25971877574920654, + "learning_rate": 0.0005618341437160049, + "loss": 0.1389, + "step": 786 + }, + { + "epoch": 1.39, + "grad_norm": 0.2425714135169983, + "learning_rate": 0.0005609089695462002, + "loss": 0.1399, + "step": 787 + }, + { + "epoch": 1.39, + "grad_norm": 0.11737050861120224, + "learning_rate": 0.0005599835836470469, + "loss": 0.1237, + "step": 788 + }, + { + "epoch": 1.4, + "grad_norm": 0.6737673878669739, + "learning_rate": 0.0005590579892353348, + "loss": 0.1837, + "step": 789 + }, + { + "epoch": 1.4, + "grad_norm": 0.2361481785774231, + "learning_rate": 0.0005581321895285787, + "loss": 0.146, + "step": 790 + }, + { + "epoch": 1.4, + "grad_norm": 0.47753140330314636, + "learning_rate": 0.0005572061877450068, + "loss": 0.1664, + "step": 791 + }, + { + "epoch": 1.4, + "grad_norm": 0.2968634068965912, + "learning_rate": 0.0005562799871035495, + "loss": 0.1511, + "step": 792 + }, + { + "epoch": 1.4, + "grad_norm": 0.20170801877975464, + "learning_rate": 0.0005553535908238294, + "loss": 0.1408, + "step": 793 + }, + { + "epoch": 1.4, + "grad_norm": 0.11540532112121582, + "learning_rate": 0.0005544270021261482, + "loss": 0.1415, + "step": 794 + }, + { + "epoch": 1.41, + "grad_norm": 0.10350099951028824, + "learning_rate": 0.0005535002242314772, + "loss": 0.1393, + "step": 795 + }, + { + "epoch": 1.41, + "grad_norm": 0.06757602840662003, + "learning_rate": 0.0005525732603614444, + "loss": 0.1335, + "step": 796 + }, + { + "epoch": 1.41, + "grad_norm": 0.11407013237476349, + "learning_rate": 0.0005516461137383254, + "loss": 0.1342, + "step": 797 + }, + { + "epoch": 1.41, + "grad_norm": 0.40271708369255066, + "learning_rate": 0.0005507187875850305, + "loss": 0.1536, + "step": 798 + }, + { + "epoch": 1.41, + "grad_norm": 0.35031354427337646, + "learning_rate": 0.000549791285125094, + "loss": 0.1489, + "step": 799 + }, + { + "epoch": 1.42, + "grad_norm": 0.347901850938797, + "learning_rate": 0.0005488636095826636, + "loss": 0.1463, + "step": 800 + }, + { + "epoch": 1.42, + "grad_norm": 0.17497143149375916, + "learning_rate": 0.0005479357641824877, + "loss": 0.1385, + "step": 801 + }, + { + "epoch": 1.42, + "grad_norm": 0.42803797125816345, + "learning_rate": 0.0005470077521499062, + "loss": 0.1438, + "step": 802 + }, + { + "epoch": 1.42, + "grad_norm": 0.77762371301651, + "learning_rate": 0.0005460795767108378, + "loss": 0.1616, + "step": 803 + }, + { + "epoch": 1.42, + "grad_norm": 0.27612486481666565, + "learning_rate": 0.0005451512410917691, + "loss": 0.1424, + "step": 804 + }, + { + "epoch": 1.42, + "grad_norm": 0.10936840623617172, + "learning_rate": 0.0005442227485197435, + "loss": 0.1379, + "step": 805 + }, + { + "epoch": 1.43, + "grad_norm": 0.19322127103805542, + "learning_rate": 0.0005432941022223503, + "loss": 0.1279, + "step": 806 + }, + { + "epoch": 1.43, + "grad_norm": 0.14601223170757294, + "learning_rate": 0.000542365305427713, + "loss": 0.1394, + "step": 807 + }, + { + "epoch": 1.43, + "grad_norm": 0.05485713109374046, + "learning_rate": 0.0005414363613644781, + "loss": 0.1245, + "step": 808 + }, + { + "epoch": 1.43, + "grad_norm": 3.4448142051696777, + "learning_rate": 0.0005405072732618043, + "loss": 0.3245, + "step": 809 + }, + { + "epoch": 1.43, + "grad_norm": 0.3802805542945862, + "learning_rate": 0.0005395780443493508, + "loss": 0.1617, + "step": 810 + }, + { + "epoch": 1.43, + "grad_norm": 2.19525146484375, + "learning_rate": 0.0005386486778572665, + "loss": 0.3246, + "step": 811 + }, + { + "epoch": 1.44, + "grad_norm": 0.1237780973315239, + "learning_rate": 0.0005377191770161783, + "loss": 0.1348, + "step": 812 + }, + { + "epoch": 1.44, + "grad_norm": 0.17096708714962006, + "learning_rate": 0.0005367895450571801, + "loss": 0.1417, + "step": 813 + }, + { + "epoch": 1.44, + "grad_norm": 0.033038243651390076, + "learning_rate": 0.0005358597852118219, + "loss": 0.1308, + "step": 814 + }, + { + "epoch": 1.44, + "grad_norm": 0.2681437134742737, + "learning_rate": 0.000534929900712098, + "loss": 0.1465, + "step": 815 + }, + { + "epoch": 1.44, + "grad_norm": 0.11117050051689148, + "learning_rate": 0.0005339998947904363, + "loss": 0.1383, + "step": 816 + }, + { + "epoch": 1.45, + "grad_norm": 0.25842952728271484, + "learning_rate": 0.0005330697706796861, + "loss": 0.1397, + "step": 817 + }, + { + "epoch": 1.45, + "grad_norm": 0.08293187618255615, + "learning_rate": 0.0005321395316131083, + "loss": 0.1356, + "step": 818 + }, + { + "epoch": 1.45, + "grad_norm": 0.14762446284294128, + "learning_rate": 0.0005312091808243631, + "loss": 0.1416, + "step": 819 + }, + { + "epoch": 1.45, + "grad_norm": 0.4471145570278168, + "learning_rate": 0.0005302787215474991, + "loss": 0.1461, + "step": 820 + }, + { + "epoch": 1.45, + "grad_norm": 0.2443719059228897, + "learning_rate": 0.0005293481570169421, + "loss": 0.1458, + "step": 821 + }, + { + "epoch": 1.45, + "grad_norm": 0.22830860316753387, + "learning_rate": 0.0005284174904674835, + "loss": 0.139, + "step": 822 + }, + { + "epoch": 1.46, + "grad_norm": 0.5184169411659241, + "learning_rate": 0.0005274867251342694, + "loss": 0.1417, + "step": 823 + }, + { + "epoch": 1.46, + "grad_norm": 0.3812021017074585, + "learning_rate": 0.0005265558642527897, + "loss": 0.1346, + "step": 824 + }, + { + "epoch": 1.46, + "grad_norm": 0.2922486662864685, + "learning_rate": 0.0005256249110588659, + "loss": 0.1294, + "step": 825 + }, + { + "epoch": 1.46, + "grad_norm": 0.29819944500923157, + "learning_rate": 0.0005246938687886409, + "loss": 0.1401, + "step": 826 + }, + { + "epoch": 1.46, + "grad_norm": 0.07050393521785736, + "learning_rate": 0.0005237627406785666, + "loss": 0.1307, + "step": 827 + }, + { + "epoch": 1.46, + "grad_norm": 0.1988169550895691, + "learning_rate": 0.0005228315299653941, + "loss": 0.1359, + "step": 828 + }, + { + "epoch": 1.47, + "grad_norm": 0.31983789801597595, + "learning_rate": 0.0005219002398861611, + "loss": 0.1459, + "step": 829 + }, + { + "epoch": 1.47, + "grad_norm": 0.28960883617401123, + "learning_rate": 0.000520968873678181, + "loss": 0.1447, + "step": 830 + }, + { + "epoch": 1.47, + "grad_norm": 0.36790764331817627, + "learning_rate": 0.0005200374345790325, + "loss": 0.1287, + "step": 831 + }, + { + "epoch": 1.47, + "grad_norm": 0.055655404925346375, + "learning_rate": 0.0005191059258265471, + "loss": 0.1346, + "step": 832 + }, + { + "epoch": 1.47, + "grad_norm": 0.42271995544433594, + "learning_rate": 0.0005181743506587989, + "loss": 0.1445, + "step": 833 + }, + { + "epoch": 1.48, + "grad_norm": 0.4579026997089386, + "learning_rate": 0.0005172427123140923, + "loss": 0.1397, + "step": 834 + }, + { + "epoch": 1.48, + "grad_norm": 0.30265891551971436, + "learning_rate": 0.0005163110140309518, + "loss": 0.1389, + "step": 835 + }, + { + "epoch": 1.48, + "grad_norm": 0.23372715711593628, + "learning_rate": 0.0005153792590481101, + "loss": 0.1426, + "step": 836 + }, + { + "epoch": 1.48, + "grad_norm": 0.22771018743515015, + "learning_rate": 0.0005144474506044969, + "loss": 0.1412, + "step": 837 + }, + { + "epoch": 1.48, + "grad_norm": 0.32557952404022217, + "learning_rate": 0.000513515591939228, + "loss": 0.1382, + "step": 838 + }, + { + "epoch": 1.48, + "grad_norm": 0.4409979283809662, + "learning_rate": 0.0005125836862915934, + "loss": 0.1382, + "step": 839 + }, + { + "epoch": 1.49, + "grad_norm": 112.177978515625, + "learning_rate": 0.0005116517369010466, + "loss": 1.093, + "step": 840 + }, + { + "epoch": 1.49, + "grad_norm": 0.13140363991260529, + "learning_rate": 0.0005107197470071933, + "loss": 0.1344, + "step": 841 + }, + { + "epoch": 1.49, + "grad_norm": 0.0935206264257431, + "learning_rate": 0.00050978771984978, + "loss": 0.1315, + "step": 842 + }, + { + "epoch": 1.49, + "grad_norm": 0.5304569602012634, + "learning_rate": 0.0005088556586686822, + "loss": 0.1549, + "step": 843 + }, + { + "epoch": 1.49, + "grad_norm": 0.07438669353723526, + "learning_rate": 0.0005079235667038944, + "loss": 0.1311, + "step": 844 + }, + { + "epoch": 1.49, + "grad_norm": 0.17763537168502808, + "learning_rate": 0.0005069914471955179, + "loss": 0.1342, + "step": 845 + }, + { + "epoch": 1.5, + "grad_norm": 0.3256682753562927, + "learning_rate": 0.0005060593033837493, + "loss": 0.1435, + "step": 846 + }, + { + "epoch": 1.5, + "grad_norm": 0.3771526515483856, + "learning_rate": 0.0005051271385088701, + "loss": 0.1434, + "step": 847 + }, + { + "epoch": 1.5, + "grad_norm": 0.3716539740562439, + "learning_rate": 0.0005041949558112351, + "loss": 0.1329, + "step": 848 + }, + { + "epoch": 1.5, + "grad_norm": 0.13685204088687897, + "learning_rate": 0.0005032627585312608, + "loss": 0.1415, + "step": 849 + }, + { + "epoch": 1.5, + "grad_norm": 0.21241213381290436, + "learning_rate": 0.0005023305499094144, + "loss": 0.1384, + "step": 850 + }, + { + "epoch": 1.51, + "grad_norm": 0.05780967324972153, + "learning_rate": 0.0005013983331862026, + "loss": 0.1366, + "step": 851 + }, + { + "epoch": 1.51, + "grad_norm": 0.5117526650428772, + "learning_rate": 0.0005004661116021605, + "loss": 0.1537, + "step": 852 + }, + { + "epoch": 1.51, + "eval_loss": 0.14083661139011383, + "eval_runtime": 14.5613, + "eval_samples_per_second": 32.758, + "eval_steps_per_second": 8.241, + "step": 852 + }, + { + "epoch": 1.51, + "grad_norm": 0.6577463150024414, + "learning_rate": 0.0004995338883978395, + "loss": 0.1461, + "step": 853 + }, + { + "epoch": 1.51, + "grad_norm": 0.35039347410202026, + "learning_rate": 0.0004986016668137974, + "loss": 0.1345, + "step": 854 + }, + { + "epoch": 1.51, + "grad_norm": 0.1379460096359253, + "learning_rate": 0.0004976694500905857, + "loss": 0.1425, + "step": 855 + }, + { + "epoch": 1.51, + "grad_norm": 0.23959362506866455, + "learning_rate": 0.0004967372414687393, + "loss": 0.1535, + "step": 856 + }, + { + "epoch": 1.52, + "grad_norm": 0.387977659702301, + "learning_rate": 0.000495805044188765, + "loss": 0.1709, + "step": 857 + }, + { + "epoch": 1.52, + "grad_norm": 0.10591788589954376, + "learning_rate": 0.0004948728614911299, + "loss": 0.137, + "step": 858 + }, + { + "epoch": 1.52, + "grad_norm": 0.1370454728603363, + "learning_rate": 0.0004939406966162507, + "loss": 0.1413, + "step": 859 + }, + { + "epoch": 1.52, + "grad_norm": 0.10982546955347061, + "learning_rate": 0.0004930085528044823, + "loss": 0.1422, + "step": 860 + }, + { + "epoch": 1.52, + "grad_norm": 0.1631615161895752, + "learning_rate": 0.0004920764332961055, + "loss": 0.1439, + "step": 861 + }, + { + "epoch": 1.52, + "grad_norm": 0.4625565707683563, + "learning_rate": 0.0004911443413313179, + "loss": 0.13, + "step": 862 + }, + { + "epoch": 1.53, + "grad_norm": 0.09019370377063751, + "learning_rate": 0.0004902122801502201, + "loss": 0.1367, + "step": 863 + }, + { + "epoch": 1.53, + "grad_norm": 0.058873746544122696, + "learning_rate": 0.0004892802529928067, + "loss": 0.1388, + "step": 864 + }, + { + "epoch": 1.53, + "grad_norm": 0.16651901602745056, + "learning_rate": 0.0004883482630989535, + "loss": 0.1383, + "step": 865 + }, + { + "epoch": 1.53, + "grad_norm": 0.1222059577703476, + "learning_rate": 0.00048741631370840676, + "loss": 0.1391, + "step": 866 + }, + { + "epoch": 1.53, + "grad_norm": 0.15417732298374176, + "learning_rate": 0.00048648440806077226, + "loss": 0.1368, + "step": 867 + }, + { + "epoch": 1.54, + "grad_norm": 0.19719868898391724, + "learning_rate": 0.00048555254939550326, + "loss": 0.1423, + "step": 868 + }, + { + "epoch": 1.54, + "grad_norm": 0.1811167150735855, + "learning_rate": 0.0004846207409518899, + "loss": 0.1382, + "step": 869 + }, + { + "epoch": 1.54, + "grad_norm": 0.12746267020702362, + "learning_rate": 0.0004836889859690483, + "loss": 0.1375, + "step": 870 + }, + { + "epoch": 1.54, + "grad_norm": 0.18294665217399597, + "learning_rate": 0.00048275728768590776, + "loss": 0.1376, + "step": 871 + }, + { + "epoch": 1.54, + "grad_norm": 0.14346922934055328, + "learning_rate": 0.0004818256493412011, + "loss": 0.137, + "step": 872 + }, + { + "epoch": 1.54, + "grad_norm": 0.07995035499334335, + "learning_rate": 0.00048089407417345296, + "loss": 0.1356, + "step": 873 + }, + { + "epoch": 1.55, + "grad_norm": 0.14909203350543976, + "learning_rate": 0.0004799625654209675, + "loss": 0.1374, + "step": 874 + }, + { + "epoch": 1.55, + "grad_norm": 0.06708569079637527, + "learning_rate": 0.00047903112632181904, + "loss": 0.1381, + "step": 875 + }, + { + "epoch": 1.55, + "grad_norm": 0.22370800375938416, + "learning_rate": 0.00047809976011383906, + "loss": 0.1445, + "step": 876 + }, + { + "epoch": 1.55, + "grad_norm": 0.05151727795600891, + "learning_rate": 0.0004771684700346059, + "loss": 0.1371, + "step": 877 + }, + { + "epoch": 1.55, + "grad_norm": 0.12744151055812836, + "learning_rate": 0.0004762372593214335, + "loss": 0.1369, + "step": 878 + }, + { + "epoch": 1.56, + "grad_norm": 0.13104400038719177, + "learning_rate": 0.0004753061312113592, + "loss": 0.1346, + "step": 879 + }, + { + "epoch": 1.56, + "grad_norm": 0.11967893689870834, + "learning_rate": 0.00047437508894113416, + "loss": 0.1318, + "step": 880 + }, + { + "epoch": 1.56, + "grad_norm": 0.035317592322826385, + "learning_rate": 0.00047344413574721046, + "loss": 0.1352, + "step": 881 + }, + { + "epoch": 1.56, + "grad_norm": 0.15099988877773285, + "learning_rate": 0.0004725132748657307, + "loss": 0.1401, + "step": 882 + }, + { + "epoch": 1.56, + "grad_norm": 0.24859024584293365, + "learning_rate": 0.0004715825095325168, + "loss": 0.1277, + "step": 883 + }, + { + "epoch": 1.56, + "grad_norm": 0.11024681478738785, + "learning_rate": 0.00047065184298305797, + "loss": 0.1375, + "step": 884 + }, + { + "epoch": 1.57, + "grad_norm": 0.031196558848023415, + "learning_rate": 0.00046972127845250084, + "loss": 0.133, + "step": 885 + }, + { + "epoch": 1.57, + "grad_norm": 0.05172949284315109, + "learning_rate": 0.00046879081917563695, + "loss": 0.1324, + "step": 886 + }, + { + "epoch": 1.57, + "grad_norm": 0.04595587030053139, + "learning_rate": 0.0004678604683868918, + "loss": 0.1361, + "step": 887 + }, + { + "epoch": 1.57, + "grad_norm": 0.054625846445560455, + "learning_rate": 0.00046693022932031415, + "loss": 0.1334, + "step": 888 + }, + { + "epoch": 1.57, + "grad_norm": 0.18956537544727325, + "learning_rate": 0.0004660001052095639, + "loss": 0.1419, + "step": 889 + }, + { + "epoch": 1.57, + "grad_norm": 0.12293694168329239, + "learning_rate": 0.00046507009928790195, + "loss": 0.1234, + "step": 890 + }, + { + "epoch": 1.58, + "grad_norm": 0.12140147387981415, + "learning_rate": 0.00046414021478817817, + "loss": 0.1282, + "step": 891 + }, + { + "epoch": 1.58, + "grad_norm": 0.20622491836547852, + "learning_rate": 0.00046321045494282, + "loss": 0.1238, + "step": 892 + }, + { + "epoch": 1.58, + "grad_norm": 0.1974942833185196, + "learning_rate": 0.00046228082298382196, + "loss": 0.1511, + "step": 893 + }, + { + "epoch": 1.58, + "grad_norm": 0.13894042372703552, + "learning_rate": 0.0004613513221427337, + "loss": 0.1349, + "step": 894 + }, + { + "epoch": 1.58, + "grad_norm": 0.023365622386336327, + "learning_rate": 0.00046042195565064914, + "loss": 0.1371, + "step": 895 + }, + { + "epoch": 1.59, + "grad_norm": 0.16795076429843903, + "learning_rate": 0.0004594927267381958, + "loss": 0.1399, + "step": 896 + }, + { + "epoch": 1.59, + "grad_norm": 0.06654185056686401, + "learning_rate": 0.00045856363863552195, + "loss": 0.1365, + "step": 897 + }, + { + "epoch": 1.59, + "grad_norm": 0.21354056894779205, + "learning_rate": 0.00045763469457228695, + "loss": 0.1431, + "step": 898 + }, + { + "epoch": 1.59, + "grad_norm": 0.1247892901301384, + "learning_rate": 0.0004567058977776498, + "loss": 0.1391, + "step": 899 + }, + { + "epoch": 1.59, + "grad_norm": 0.1229679062962532, + "learning_rate": 0.00045577725148025647, + "loss": 0.1324, + "step": 900 + }, + { + "epoch": 1.59, + "grad_norm": 0.0285334512591362, + "learning_rate": 0.000454848758908231, + "loss": 0.1417, + "step": 901 + }, + { + "epoch": 1.6, + "grad_norm": 0.11522159725427628, + "learning_rate": 0.0004539204232891622, + "loss": 0.1349, + "step": 902 + }, + { + "epoch": 1.6, + "grad_norm": 0.04999208077788353, + "learning_rate": 0.00045299224785009374, + "loss": 0.1395, + "step": 903 + }, + { + "epoch": 1.6, + "grad_norm": 0.19387230277061462, + "learning_rate": 0.00045206423581751245, + "loss": 0.1367, + "step": 904 + }, + { + "epoch": 1.6, + "grad_norm": 0.030587391927838326, + "learning_rate": 0.0004511363904173366, + "loss": 0.1392, + "step": 905 + }, + { + "epoch": 1.6, + "grad_norm": 0.031090332195162773, + "learning_rate": 0.0004502087148749061, + "loss": 0.137, + "step": 906 + }, + { + "epoch": 1.6, + "grad_norm": 0.07691047340631485, + "learning_rate": 0.0004492812124149696, + "loss": 0.144, + "step": 907 + }, + { + "epoch": 1.61, + "grad_norm": 0.0668129101395607, + "learning_rate": 0.0004483538862616747, + "loss": 0.1337, + "step": 908 + }, + { + "epoch": 1.61, + "grad_norm": 0.3449445962905884, + "learning_rate": 0.00044742673963855576, + "loss": 0.1526, + "step": 909 + }, + { + "epoch": 1.61, + "grad_norm": 0.19746670126914978, + "learning_rate": 0.000446499775768523, + "loss": 0.1319, + "step": 910 + }, + { + "epoch": 1.61, + "grad_norm": 0.11988267302513123, + "learning_rate": 0.0004455729978738517, + "loss": 0.1411, + "step": 911 + }, + { + "epoch": 1.61, + "grad_norm": 0.17063240706920624, + "learning_rate": 0.00044464640917617063, + "loss": 0.1354, + "step": 912 + }, + { + "epoch": 1.62, + "grad_norm": 0.26187554001808167, + "learning_rate": 0.00044372001289645044, + "loss": 0.136, + "step": 913 + }, + { + "epoch": 1.62, + "grad_norm": 0.05965143442153931, + "learning_rate": 0.00044279381225499344, + "loss": 0.1398, + "step": 914 + }, + { + "epoch": 1.62, + "grad_norm": 0.07176820188760757, + "learning_rate": 0.00044186781047142134, + "loss": 0.1388, + "step": 915 + }, + { + "epoch": 1.62, + "grad_norm": 0.038787998259067535, + "learning_rate": 0.0004409420107646652, + "loss": 0.1383, + "step": 916 + }, + { + "epoch": 1.62, + "grad_norm": 0.03987140208482742, + "learning_rate": 0.0004400164163529532, + "loss": 0.1366, + "step": 917 + }, + { + "epoch": 1.62, + "grad_norm": 0.12179240584373474, + "learning_rate": 0.00043909103045379987, + "loss": 0.1306, + "step": 918 + }, + { + "epoch": 1.63, + "grad_norm": 0.09804455190896988, + "learning_rate": 0.0004381658562839953, + "loss": 0.128, + "step": 919 + }, + { + "epoch": 1.63, + "grad_norm": 0.08840085566043854, + "learning_rate": 0.00043724089705959304, + "loss": 0.1364, + "step": 920 + }, + { + "epoch": 1.63, + "grad_norm": 0.18564368784427643, + "learning_rate": 0.00043631615599589964, + "loss": 0.1485, + "step": 921 + }, + { + "epoch": 1.63, + "grad_norm": 0.3653159737586975, + "learning_rate": 0.00043539163630746384, + "loss": 0.1486, + "step": 922 + }, + { + "epoch": 1.63, + "grad_norm": 0.08679798990488052, + "learning_rate": 0.000434467341208064, + "loss": 0.1291, + "step": 923 + }, + { + "epoch": 1.63, + "grad_norm": 0.1024034321308136, + "learning_rate": 0.00043354327391069826, + "loss": 0.1275, + "step": 924 + }, + { + "epoch": 1.64, + "grad_norm": 0.041372958570718765, + "learning_rate": 0.0004326194376275729, + "loss": 0.1328, + "step": 925 + }, + { + "epoch": 1.64, + "grad_norm": 0.06509742885828018, + "learning_rate": 0.0004316958355700906, + "loss": 0.1324, + "step": 926 + }, + { + "epoch": 1.64, + "grad_norm": 0.09408631920814514, + "learning_rate": 0.0004307724709488409, + "loss": 0.1405, + "step": 927 + }, + { + "epoch": 1.64, + "grad_norm": 0.1963924914598465, + "learning_rate": 0.0004298493469735869, + "loss": 0.1436, + "step": 928 + }, + { + "epoch": 1.64, + "grad_norm": 0.10209079831838608, + "learning_rate": 0.0004289264668532557, + "loss": 0.1277, + "step": 929 + }, + { + "epoch": 1.65, + "grad_norm": 0.026920847594738007, + "learning_rate": 0.00042800383379592677, + "loss": 0.1295, + "step": 930 + }, + { + "epoch": 1.65, + "grad_norm": 0.03551056608557701, + "learning_rate": 0.00042708145100882035, + "loss": 0.1281, + "step": 931 + }, + { + "epoch": 1.65, + "grad_norm": 0.14194993674755096, + "learning_rate": 0.00042615932169828743, + "loss": 0.1398, + "step": 932 + }, + { + "epoch": 1.65, + "grad_norm": 0.2725144326686859, + "learning_rate": 0.00042523744906979683, + "loss": 0.1217, + "step": 933 + }, + { + "epoch": 1.65, + "grad_norm": 0.22893387079238892, + "learning_rate": 0.00042431583632792605, + "loss": 0.1517, + "step": 934 + }, + { + "epoch": 1.65, + "grad_norm": 0.20985311269760132, + "learning_rate": 0.00042339448667634886, + "loss": 0.1433, + "step": 935 + }, + { + "epoch": 1.66, + "grad_norm": 0.053967542946338654, + "learning_rate": 0.00042247340331782416, + "loss": 0.12, + "step": 936 + }, + { + "epoch": 1.66, + "grad_norm": 0.22838272154331207, + "learning_rate": 0.0004215525894541856, + "loss": 0.1176, + "step": 937 + }, + { + "epoch": 1.66, + "grad_norm": 0.3237338066101074, + "learning_rate": 0.0004206320482863301, + "loss": 0.1476, + "step": 938 + }, + { + "epoch": 1.66, + "grad_norm": 0.09525377303361893, + "learning_rate": 0.0004197117830142062, + "loss": 0.1342, + "step": 939 + }, + { + "epoch": 1.66, + "grad_norm": 0.05312574282288551, + "learning_rate": 0.0004187917968368036, + "loss": 0.1311, + "step": 940 + }, + { + "epoch": 1.66, + "grad_norm": 0.11625714600086212, + "learning_rate": 0.00041787209295214177, + "loss": 0.1133, + "step": 941 + }, + { + "epoch": 1.67, + "grad_norm": 0.04892723262310028, + "learning_rate": 0.000416952674557259, + "loss": 0.106, + "step": 942 + }, + { + "epoch": 1.67, + "grad_norm": 0.44417399168014526, + "learning_rate": 0.00041603354484820134, + "loss": 0.1653, + "step": 943 + }, + { + "epoch": 1.67, + "grad_norm": 0.07979090511798859, + "learning_rate": 0.00041511470702001074, + "loss": 0.1144, + "step": 944 + }, + { + "epoch": 1.67, + "grad_norm": 3.163567304611206, + "learning_rate": 0.00041419616426671517, + "loss": 0.1576, + "step": 945 + }, + { + "epoch": 1.67, + "grad_norm": 0.11929760128259659, + "learning_rate": 0.0004132779197813164, + "loss": 0.1329, + "step": 946 + }, + { + "epoch": 1.68, + "grad_norm": 0.21021872758865356, + "learning_rate": 0.0004123599767557795, + "loss": 0.0956, + "step": 947 + }, + { + "epoch": 1.68, + "grad_norm": 0.4803867042064667, + "learning_rate": 0.00041144233838102197, + "loss": 0.2027, + "step": 948 + }, + { + "epoch": 1.68, + "grad_norm": 0.0795937329530716, + "learning_rate": 0.0004105250078469018, + "loss": 0.1226, + "step": 949 + }, + { + "epoch": 1.68, + "grad_norm": 0.19914481043815613, + "learning_rate": 0.00040960798834220705, + "loss": 0.1457, + "step": 950 + }, + { + "epoch": 1.68, + "grad_norm": 1.662695288658142, + "learning_rate": 0.00040869128305464475, + "loss": 0.1465, + "step": 951 + }, + { + "epoch": 1.68, + "grad_norm": 0.1512700617313385, + "learning_rate": 0.00040777489517082924, + "loss": 0.1391, + "step": 952 + }, + { + "epoch": 1.69, + "grad_norm": 0.3317195773124695, + "learning_rate": 0.00040685882787627227, + "loss": 0.1397, + "step": 953 + }, + { + "epoch": 1.69, + "grad_norm": 0.2609153985977173, + "learning_rate": 0.00040594308435537026, + "loss": 0.1217, + "step": 954 + }, + { + "epoch": 1.69, + "grad_norm": 0.40559151768684387, + "learning_rate": 0.00040502766779139485, + "loss": 0.1317, + "step": 955 + }, + { + "epoch": 1.69, + "grad_norm": 0.21796320378780365, + "learning_rate": 0.0004041125813664808, + "loss": 0.1364, + "step": 956 + }, + { + "epoch": 1.69, + "grad_norm": 0.6307505369186401, + "learning_rate": 0.0004031978282616151, + "loss": 0.1553, + "step": 957 + }, + { + "epoch": 1.69, + "grad_norm": 0.28565332293510437, + "learning_rate": 0.00040228341165662683, + "loss": 0.1344, + "step": 958 + }, + { + "epoch": 1.7, + "grad_norm": 0.4165158271789551, + "learning_rate": 0.0004013693347301741, + "loss": 0.1455, + "step": 959 + }, + { + "epoch": 1.7, + "grad_norm": 0.821273922920227, + "learning_rate": 0.0004004556006597353, + "loss": 0.1287, + "step": 960 + }, + { + "epoch": 1.7, + "grad_norm": 0.11364096403121948, + "learning_rate": 0.0003995422126215967, + "loss": 0.1177, + "step": 961 + }, + { + "epoch": 1.7, + "grad_norm": 0.349627822637558, + "learning_rate": 0.0003986291737908414, + "loss": 0.1217, + "step": 962 + }, + { + "epoch": 1.7, + "grad_norm": 0.10462171584367752, + "learning_rate": 0.0003977164873413391, + "loss": 0.1168, + "step": 963 + }, + { + "epoch": 1.71, + "grad_norm": 0.11335984617471695, + "learning_rate": 0.0003968041564457342, + "loss": 0.1313, + "step": 964 + }, + { + "epoch": 1.71, + "grad_norm": 0.37488850951194763, + "learning_rate": 0.0003958921842754351, + "loss": 0.133, + "step": 965 + }, + { + "epoch": 1.71, + "grad_norm": 0.09337367117404938, + "learning_rate": 0.00039498057400060363, + "loss": 0.1464, + "step": 966 + }, + { + "epoch": 1.71, + "grad_norm": 0.27405792474746704, + "learning_rate": 0.000394069328790143, + "loss": 0.1322, + "step": 967 + }, + { + "epoch": 1.71, + "grad_norm": 0.5987095832824707, + "learning_rate": 0.00039315845181168784, + "loss": 0.1307, + "step": 968 + }, + { + "epoch": 1.71, + "grad_norm": 0.3096538484096527, + "learning_rate": 0.00039224794623159294, + "loss": 0.1349, + "step": 969 + }, + { + "epoch": 1.72, + "grad_norm": 0.5547122359275818, + "learning_rate": 0.0003913378152149214, + "loss": 0.1455, + "step": 970 + }, + { + "epoch": 1.72, + "grad_norm": 0.4886229634284973, + "learning_rate": 0.0003904280619254348, + "loss": 0.1251, + "step": 971 + }, + { + "epoch": 1.72, + "grad_norm": 0.18052807450294495, + "learning_rate": 0.0003895186895255814, + "loss": 0.1407, + "step": 972 + }, + { + "epoch": 1.72, + "grad_norm": 0.09001462161540985, + "learning_rate": 0.0003886097011764855, + "loss": 0.1143, + "step": 973 + }, + { + "epoch": 1.72, + "grad_norm": 0.3248112201690674, + "learning_rate": 0.0003877011000379367, + "loss": 0.1212, + "step": 974 + }, + { + "epoch": 1.72, + "grad_norm": 0.11648620665073395, + "learning_rate": 0.000386792889268378, + "loss": 0.1167, + "step": 975 + }, + { + "epoch": 1.73, + "grad_norm": 1.3816261291503906, + "learning_rate": 0.00038588507202489585, + "loss": 0.1518, + "step": 976 + }, + { + "epoch": 1.73, + "grad_norm": 0.6389634013175964, + "learning_rate": 0.00038497765146320873, + "loss": 0.1372, + "step": 977 + }, + { + "epoch": 1.73, + "grad_norm": 0.6133326888084412, + "learning_rate": 0.0003840706307376557, + "loss": 0.1252, + "step": 978 + }, + { + "epoch": 1.73, + "grad_norm": 0.20734143257141113, + "learning_rate": 0.00038316401300118674, + "loss": 0.1115, + "step": 979 + }, + { + "epoch": 1.73, + "grad_norm": 0.06368093192577362, + "learning_rate": 0.0003822578014053502, + "loss": 0.145, + "step": 980 + }, + { + "epoch": 1.74, + "grad_norm": 0.07665737718343735, + "learning_rate": 0.0003813519991002831, + "loss": 0.1557, + "step": 981 + }, + { + "epoch": 1.74, + "grad_norm": 0.09085717052221298, + "learning_rate": 0.00038044660923469963, + "loss": 0.1251, + "step": 982 + }, + { + "epoch": 1.74, + "grad_norm": 0.08342912048101425, + "learning_rate": 0.00037954163495588, + "loss": 0.1256, + "step": 983 + }, + { + "epoch": 1.74, + "grad_norm": 0.6068560481071472, + "learning_rate": 0.00037863707940966024, + "loss": 0.1506, + "step": 984 + }, + { + "epoch": 1.74, + "grad_norm": 0.22720251977443695, + "learning_rate": 0.00037773294574042015, + "loss": 0.1151, + "step": 985 + }, + { + "epoch": 1.74, + "grad_norm": 0.5528678297996521, + "learning_rate": 0.00037682923709107363, + "loss": 0.1436, + "step": 986 + }, + { + "epoch": 1.75, + "grad_norm": 0.6791836619377136, + "learning_rate": 0.00037592595660305707, + "loss": 0.1405, + "step": 987 + }, + { + "epoch": 1.75, + "grad_norm": 0.07115644961595535, + "learning_rate": 0.0003750231074163179, + "loss": 0.0997, + "step": 988 + }, + { + "epoch": 1.75, + "grad_norm": 0.36401447653770447, + "learning_rate": 0.00037412069266930514, + "loss": 0.1471, + "step": 989 + }, + { + "epoch": 1.75, + "grad_norm": 0.7831732630729675, + "learning_rate": 0.00037321871549895715, + "loss": 0.1314, + "step": 990 + }, + { + "epoch": 1.75, + "grad_norm": 0.12779557704925537, + "learning_rate": 0.00037231717904069096, + "loss": 0.1446, + "step": 991 + }, + { + "epoch": 1.75, + "grad_norm": 0.41478636860847473, + "learning_rate": 0.0003714160864283923, + "loss": 0.1429, + "step": 992 + }, + { + "epoch": 1.76, + "grad_norm": 0.1117364913225174, + "learning_rate": 0.00037051544079440334, + "loss": 0.1148, + "step": 993 + }, + { + "epoch": 1.76, + "grad_norm": 0.5920963287353516, + "learning_rate": 0.00036961524526951277, + "loss": 0.1204, + "step": 994 + }, + { + "epoch": 1.76, + "eval_loss": 0.13537168502807617, + "eval_runtime": 14.0251, + "eval_samples_per_second": 34.01, + "eval_steps_per_second": 8.556, + "step": 994 + }, + { + "epoch": 1.76, + "grad_norm": 0.43262025713920593, + "learning_rate": 0.000368715502982945, + "loss": 0.1302, + "step": 995 + }, + { + "epoch": 1.76, + "grad_norm": 0.10716990381479263, + "learning_rate": 0.00036781621706234816, + "loss": 0.133, + "step": 996 + }, + { + "epoch": 1.76, + "grad_norm": 0.0918804183602333, + "learning_rate": 0.0003669173906337846, + "loss": 0.1354, + "step": 997 + }, + { + "epoch": 1.77, + "grad_norm": 0.13421286642551422, + "learning_rate": 0.0003660190268217189, + "loss": 0.121, + "step": 998 + }, + { + "epoch": 1.77, + "grad_norm": 0.15904036164283752, + "learning_rate": 0.00036512112874900797, + "loss": 0.1341, + "step": 999 + }, + { + "epoch": 1.77, + "grad_norm": 0.3201177418231964, + "learning_rate": 0.00036422369953688973, + "loss": 0.1454, + "step": 1000 + }, + { + "epoch": 1.77, + "grad_norm": 0.3361368477344513, + "learning_rate": 0.0003633267423049717, + "loss": 0.152, + "step": 1001 + }, + { + "epoch": 1.77, + "grad_norm": 0.09073235839605331, + "learning_rate": 0.0003624302601712213, + "loss": 0.1331, + "step": 1002 + }, + { + "epoch": 1.77, + "grad_norm": 0.2717398405075073, + "learning_rate": 0.0003615342562519542, + "loss": 0.1373, + "step": 1003 + }, + { + "epoch": 1.78, + "grad_norm": 0.0666472539305687, + "learning_rate": 0.0003606387336618237, + "loss": 0.1496, + "step": 1004 + }, + { + "epoch": 1.78, + "grad_norm": 0.15450868010520935, + "learning_rate": 0.0003597436955138102, + "loss": 0.1464, + "step": 1005 + }, + { + "epoch": 1.78, + "grad_norm": 0.236429825425148, + "learning_rate": 0.0003588491449192096, + "loss": 0.1342, + "step": 1006 + }, + { + "epoch": 1.78, + "grad_norm": 0.06421036273241043, + "learning_rate": 0.0003579550849876233, + "loss": 0.1352, + "step": 1007 + }, + { + "epoch": 1.78, + "grad_norm": 0.07432877272367477, + "learning_rate": 0.00035706151882694727, + "loss": 0.131, + "step": 1008 + }, + { + "epoch": 1.79, + "grad_norm": 0.10126742720603943, + "learning_rate": 0.00035616844954336046, + "loss": 0.1471, + "step": 1009 + }, + { + "epoch": 1.79, + "grad_norm": 0.15761522948741913, + "learning_rate": 0.0003552758802413154, + "loss": 0.1358, + "step": 1010 + }, + { + "epoch": 1.79, + "grad_norm": 0.02369426190853119, + "learning_rate": 0.0003543838140235257, + "loss": 0.1296, + "step": 1011 + }, + { + "epoch": 1.79, + "grad_norm": 0.27005845308303833, + "learning_rate": 0.0003534922539909569, + "loss": 0.1412, + "step": 1012 + }, + { + "epoch": 1.79, + "grad_norm": 0.0638512596487999, + "learning_rate": 0.00035260120324281474, + "loss": 0.1366, + "step": 1013 + }, + { + "epoch": 1.79, + "grad_norm": 0.5134268403053284, + "learning_rate": 0.00035171066487653423, + "loss": 0.146, + "step": 1014 + }, + { + "epoch": 1.8, + "grad_norm": 0.2569257915019989, + "learning_rate": 0.00035082064198776997, + "loss": 0.1507, + "step": 1015 + }, + { + "epoch": 1.8, + "grad_norm": 0.16015255451202393, + "learning_rate": 0.0003499311376703842, + "loss": 0.1297, + "step": 1016 + }, + { + "epoch": 1.8, + "grad_norm": 0.22499139606952667, + "learning_rate": 0.0003490421550164364, + "loss": 0.1357, + "step": 1017 + }, + { + "epoch": 1.8, + "grad_norm": 0.5117542743682861, + "learning_rate": 0.0003481536971161732, + "loss": 0.1418, + "step": 1018 + }, + { + "epoch": 1.8, + "grad_norm": 0.27242857217788696, + "learning_rate": 0.00034726576705801636, + "loss": 0.1358, + "step": 1019 + }, + { + "epoch": 1.8, + "grad_norm": 0.3779907822608948, + "learning_rate": 0.0003463783679285535, + "loss": 0.1512, + "step": 1020 + }, + { + "epoch": 1.81, + "grad_norm": 0.2148500680923462, + "learning_rate": 0.00034549150281252633, + "loss": 0.1445, + "step": 1021 + }, + { + "epoch": 1.81, + "grad_norm": 0.08610748499631882, + "learning_rate": 0.0003446051747928202, + "loss": 0.1333, + "step": 1022 + }, + { + "epoch": 1.81, + "grad_norm": 0.07877200841903687, + "learning_rate": 0.0003437193869504535, + "loss": 0.1333, + "step": 1023 + }, + { + "epoch": 1.81, + "grad_norm": 0.1410919725894928, + "learning_rate": 0.0003428341423645668, + "loss": 0.1303, + "step": 1024 + }, + { + "epoch": 1.81, + "grad_norm": 0.1181708350777626, + "learning_rate": 0.00034194944411241213, + "loss": 0.1234, + "step": 1025 + }, + { + "epoch": 1.82, + "grad_norm": 0.13002073764801025, + "learning_rate": 0.00034106529526934303, + "loss": 0.1405, + "step": 1026 + }, + { + "epoch": 1.82, + "grad_norm": 0.14099909365177155, + "learning_rate": 0.00034018169890880225, + "loss": 0.1342, + "step": 1027 + }, + { + "epoch": 1.82, + "grad_norm": 0.24455852806568146, + "learning_rate": 0.00033929865810231264, + "loss": 0.1482, + "step": 1028 + }, + { + "epoch": 1.82, + "grad_norm": 0.23751892149448395, + "learning_rate": 0.0003384161759194658, + "loss": 0.1509, + "step": 1029 + }, + { + "epoch": 1.82, + "grad_norm": 0.40307796001434326, + "learning_rate": 0.00033753425542791104, + "loss": 0.1542, + "step": 1030 + }, + { + "epoch": 1.82, + "grad_norm": 0.123422771692276, + "learning_rate": 0.0003366528996933458, + "loss": 0.129, + "step": 1031 + }, + { + "epoch": 1.83, + "grad_norm": 0.15652324259281158, + "learning_rate": 0.00033577211177950386, + "loss": 0.1277, + "step": 1032 + }, + { + "epoch": 1.83, + "grad_norm": 0.2987327575683594, + "learning_rate": 0.0003348918947481452, + "loss": 0.1395, + "step": 1033 + }, + { + "epoch": 1.83, + "grad_norm": 0.2202194184064865, + "learning_rate": 0.00033401225165904556, + "loss": 0.1287, + "step": 1034 + }, + { + "epoch": 1.83, + "grad_norm": 0.10783470422029495, + "learning_rate": 0.0003331331855699852, + "loss": 0.1423, + "step": 1035 + }, + { + "epoch": 1.83, + "grad_norm": 0.26680612564086914, + "learning_rate": 0.0003322546995367394, + "loss": 0.1307, + "step": 1036 + }, + { + "epoch": 1.83, + "grad_norm": 0.14138604700565338, + "learning_rate": 0.00033137679661306575, + "loss": 0.1273, + "step": 1037 + }, + { + "epoch": 1.84, + "grad_norm": 0.2755129039287567, + "learning_rate": 0.0003304994798506962, + "loss": 0.1485, + "step": 1038 + }, + { + "epoch": 1.84, + "grad_norm": 0.15637388825416565, + "learning_rate": 0.00032962275229932446, + "loss": 0.1233, + "step": 1039 + }, + { + "epoch": 1.84, + "grad_norm": 0.10934972018003464, + "learning_rate": 0.00032874661700659587, + "loss": 0.1438, + "step": 1040 + }, + { + "epoch": 1.84, + "grad_norm": 0.16785453259944916, + "learning_rate": 0.00032787107701809755, + "loss": 0.1257, + "step": 1041 + }, + { + "epoch": 1.84, + "grad_norm": 0.2813141644001007, + "learning_rate": 0.0003269961353773469, + "loss": 0.1594, + "step": 1042 + }, + { + "epoch": 1.85, + "grad_norm": 0.321236789226532, + "learning_rate": 0.00032612179512578126, + "loss": 0.1476, + "step": 1043 + }, + { + "epoch": 1.85, + "grad_norm": 0.35957199335098267, + "learning_rate": 0.0003252480593027478, + "loss": 0.1614, + "step": 1044 + }, + { + "epoch": 1.85, + "grad_norm": 0.2529314160346985, + "learning_rate": 0.0003243749309454922, + "loss": 0.1436, + "step": 1045 + }, + { + "epoch": 1.85, + "grad_norm": 0.36090412735939026, + "learning_rate": 0.00032350241308914864, + "loss": 0.1578, + "step": 1046 + }, + { + "epoch": 1.85, + "grad_norm": 0.47541487216949463, + "learning_rate": 0.0003226305087667295, + "loss": 0.1247, + "step": 1047 + }, + { + "epoch": 1.85, + "grad_norm": 0.23453806340694427, + "learning_rate": 0.0003217592210091137, + "loss": 0.1435, + "step": 1048 + }, + { + "epoch": 1.86, + "grad_norm": 0.12492989748716354, + "learning_rate": 0.0003208885528450376, + "loss": 0.1229, + "step": 1049 + }, + { + "epoch": 1.86, + "grad_norm": 0.19712020456790924, + "learning_rate": 0.00032001850730108307, + "loss": 0.1292, + "step": 1050 + }, + { + "epoch": 1.86, + "grad_norm": 0.09731408208608627, + "learning_rate": 0.00031914908740166795, + "loss": 0.1333, + "step": 1051 + }, + { + "epoch": 1.86, + "grad_norm": 0.06944354623556137, + "learning_rate": 0.0003182802961690357, + "loss": 0.1292, + "step": 1052 + }, + { + "epoch": 1.86, + "grad_norm": 0.07448045909404755, + "learning_rate": 0.00031741213662324363, + "loss": 0.1349, + "step": 1053 + }, + { + "epoch": 1.86, + "grad_norm": 0.28523683547973633, + "learning_rate": 0.0003165446117821538, + "loss": 0.1452, + "step": 1054 + }, + { + "epoch": 1.87, + "grad_norm": 0.09108186513185501, + "learning_rate": 0.0003156777246614215, + "loss": 0.1361, + "step": 1055 + }, + { + "epoch": 1.87, + "grad_norm": 0.13375020027160645, + "learning_rate": 0.0003148114782744855, + "loss": 0.1381, + "step": 1056 + }, + { + "epoch": 1.87, + "grad_norm": 0.06716307252645493, + "learning_rate": 0.00031394587563255755, + "loss": 0.1383, + "step": 1057 + }, + { + "epoch": 1.87, + "grad_norm": 0.11596639454364777, + "learning_rate": 0.00031308091974461064, + "loss": 0.1313, + "step": 1058 + }, + { + "epoch": 1.87, + "grad_norm": 0.39337942004203796, + "learning_rate": 0.00031221661361737065, + "loss": 0.1359, + "step": 1059 + }, + { + "epoch": 1.88, + "grad_norm": 0.07525162398815155, + "learning_rate": 0.00031135296025530424, + "loss": 0.1326, + "step": 1060 + }, + { + "epoch": 1.88, + "grad_norm": 0.020530417561531067, + "learning_rate": 0.0003104899626606088, + "loss": 0.1368, + "step": 1061 + }, + { + "epoch": 1.88, + "grad_norm": 0.17400570213794708, + "learning_rate": 0.00030962762383320285, + "loss": 0.1309, + "step": 1062 + }, + { + "epoch": 1.88, + "grad_norm": 0.1574063003063202, + "learning_rate": 0.00030876594677071404, + "loss": 0.1365, + "step": 1063 + }, + { + "epoch": 1.88, + "grad_norm": 0.21009187400341034, + "learning_rate": 0.0003079049344684702, + "loss": 0.1382, + "step": 1064 + }, + { + "epoch": 1.88, + "grad_norm": 0.047014713287353516, + "learning_rate": 0.00030704458991948844, + "loss": 0.1327, + "step": 1065 + }, + { + "epoch": 1.89, + "grad_norm": 0.11032029241323471, + "learning_rate": 0.0003061849161144641, + "loss": 0.1252, + "step": 1066 + }, + { + "epoch": 1.89, + "grad_norm": 0.08818018436431885, + "learning_rate": 0.0003053259160417613, + "loss": 0.1495, + "step": 1067 + }, + { + "epoch": 1.89, + "grad_norm": 0.08112979680299759, + "learning_rate": 0.0003044675926874023, + "loss": 0.1408, + "step": 1068 + }, + { + "epoch": 1.89, + "grad_norm": 0.11870189756155014, + "learning_rate": 0.00030360994903505653, + "loss": 0.1342, + "step": 1069 + }, + { + "epoch": 1.89, + "grad_norm": 0.16511432826519012, + "learning_rate": 0.000302752988066031, + "loss": 0.148, + "step": 1070 + }, + { + "epoch": 1.89, + "grad_norm": 0.31104427576065063, + "learning_rate": 0.0003018967127592595, + "loss": 0.15, + "step": 1071 + }, + { + "epoch": 1.9, + "grad_norm": 0.1434139758348465, + "learning_rate": 0.0003010411260912922, + "loss": 0.142, + "step": 1072 + }, + { + "epoch": 1.9, + "grad_norm": 0.29530733823776245, + "learning_rate": 0.00030018623103628594, + "loss": 0.1284, + "step": 1073 + }, + { + "epoch": 1.9, + "grad_norm": 0.06294587254524231, + "learning_rate": 0.00029933203056599274, + "loss": 0.1379, + "step": 1074 + }, + { + "epoch": 1.9, + "grad_norm": 0.0581539049744606, + "learning_rate": 0.0002984785276497507, + "loss": 0.1278, + "step": 1075 + }, + { + "epoch": 1.9, + "grad_norm": 0.09157228469848633, + "learning_rate": 0.0002976257252544726, + "loss": 0.1348, + "step": 1076 + }, + { + "epoch": 1.91, + "grad_norm": 0.10196894407272339, + "learning_rate": 0.00029677362634463643, + "loss": 0.1386, + "step": 1077 + }, + { + "epoch": 1.91, + "grad_norm": 0.1152050793170929, + "learning_rate": 0.00029592223388227504, + "loss": 0.1337, + "step": 1078 + }, + { + "epoch": 1.91, + "grad_norm": 0.34997934103012085, + "learning_rate": 0.0002950715508269648, + "loss": 0.1346, + "step": 1079 + }, + { + "epoch": 1.91, + "grad_norm": 0.21548382937908173, + "learning_rate": 0.00029422158013581656, + "loss": 0.1309, + "step": 1080 + }, + { + "epoch": 1.91, + "grad_norm": 0.3693360686302185, + "learning_rate": 0.000293372324763465, + "loss": 0.1587, + "step": 1081 + }, + { + "epoch": 1.91, + "grad_norm": 0.2655669152736664, + "learning_rate": 0.0002925237876620576, + "loss": 0.1285, + "step": 1082 + }, + { + "epoch": 1.92, + "grad_norm": 0.08570755273103714, + "learning_rate": 0.00029167597178124583, + "loss": 0.1359, + "step": 1083 + }, + { + "epoch": 1.92, + "grad_norm": 0.2622168958187103, + "learning_rate": 0.00029082888006817364, + "loss": 0.1315, + "step": 1084 + }, + { + "epoch": 1.92, + "grad_norm": 0.020985718816518784, + "learning_rate": 0.0002899825154674674, + "loss": 0.1308, + "step": 1085 + }, + { + "epoch": 1.92, + "grad_norm": 0.15370753407478333, + "learning_rate": 0.00028913688092122665, + "loss": 0.1269, + "step": 1086 + }, + { + "epoch": 1.92, + "grad_norm": 0.1030414029955864, + "learning_rate": 0.0002882919793690123, + "loss": 0.1143, + "step": 1087 + }, + { + "epoch": 1.92, + "grad_norm": 0.31633636355400085, + "learning_rate": 0.00028744781374783813, + "loss": 0.1563, + "step": 1088 + }, + { + "epoch": 1.93, + "grad_norm": 0.03694160282611847, + "learning_rate": 0.00028660438699215895, + "loss": 0.1276, + "step": 1089 + }, + { + "epoch": 1.93, + "grad_norm": 0.14471565186977386, + "learning_rate": 0.0002857617020338614, + "loss": 0.137, + "step": 1090 + }, + { + "epoch": 1.93, + "grad_norm": 0.08222481608390808, + "learning_rate": 0.0002849197618022539, + "loss": 0.1279, + "step": 1091 + }, + { + "epoch": 1.93, + "grad_norm": 0.11603690683841705, + "learning_rate": 0.00028407856922405526, + "loss": 0.138, + "step": 1092 + }, + { + "epoch": 1.93, + "grad_norm": 0.18144792318344116, + "learning_rate": 0.0002832381272233864, + "loss": 0.1481, + "step": 1093 + }, + { + "epoch": 1.94, + "grad_norm": 0.05054265260696411, + "learning_rate": 0.00028239843872175814, + "loss": 0.1363, + "step": 1094 + }, + { + "epoch": 1.94, + "grad_norm": 2.8396074771881104, + "learning_rate": 0.00028155950663806236, + "loss": 0.1345, + "step": 1095 + }, + { + "epoch": 1.94, + "grad_norm": 0.30984073877334595, + "learning_rate": 0.0002807213338885619, + "loss": 0.1338, + "step": 1096 + }, + { + "epoch": 1.94, + "grad_norm": 0.34585434198379517, + "learning_rate": 0.00027988392338687925, + "loss": 0.1389, + "step": 1097 + }, + { + "epoch": 1.94, + "grad_norm": 0.0312834158539772, + "learning_rate": 0.0002790472780439881, + "loss": 0.1338, + "step": 1098 + }, + { + "epoch": 1.94, + "grad_norm": 0.5297988057136536, + "learning_rate": 0.0002782114007682016, + "loss": 0.1321, + "step": 1099 + }, + { + "epoch": 1.95, + "grad_norm": 0.10174748301506042, + "learning_rate": 0.0002773762944651632, + "loss": 0.1202, + "step": 1100 + }, + { + "epoch": 1.95, + "grad_norm": 0.0418776273727417, + "learning_rate": 0.0002765419620378366, + "loss": 0.1257, + "step": 1101 + }, + { + "epoch": 1.95, + "grad_norm": 0.3508782386779785, + "learning_rate": 0.0002757084063864949, + "loss": 0.1612, + "step": 1102 + }, + { + "epoch": 1.95, + "grad_norm": 0.2867041230201721, + "learning_rate": 0.00027487563040871145, + "loss": 0.1574, + "step": 1103 + }, + { + "epoch": 1.95, + "grad_norm": 0.24160172045230865, + "learning_rate": 0.00027404363699934907, + "loss": 0.1175, + "step": 1104 + }, + { + "epoch": 1.95, + "grad_norm": 0.03382538631558418, + "learning_rate": 0.0002732124290505501, + "loss": 0.1363, + "step": 1105 + }, + { + "epoch": 1.96, + "grad_norm": 0.05089818313717842, + "learning_rate": 0.000272382009451727, + "loss": 0.1384, + "step": 1106 + }, + { + "epoch": 1.96, + "grad_norm": 0.08688928186893463, + "learning_rate": 0.0002715523810895515, + "loss": 0.1415, + "step": 1107 + }, + { + "epoch": 1.96, + "grad_norm": 0.03926026448607445, + "learning_rate": 0.00027072354684794486, + "loss": 0.1357, + "step": 1108 + }, + { + "epoch": 1.96, + "grad_norm": 0.05058757960796356, + "learning_rate": 0.0002698955096080677, + "loss": 0.1356, + "step": 1109 + }, + { + "epoch": 1.96, + "grad_norm": 0.0489107221364975, + "learning_rate": 0.00026906827224831023, + "loss": 0.1394, + "step": 1110 + }, + { + "epoch": 1.97, + "grad_norm": 0.09765997529029846, + "learning_rate": 0.00026824183764428223, + "loss": 0.1388, + "step": 1111 + }, + { + "epoch": 1.97, + "grad_norm": 0.06232646107673645, + "learning_rate": 0.00026741620866880335, + "loss": 0.1322, + "step": 1112 + }, + { + "epoch": 1.97, + "grad_norm": 0.08469201624393463, + "learning_rate": 0.0002665913881918921, + "loss": 0.1369, + "step": 1113 + }, + { + "epoch": 1.97, + "grad_norm": 0.08312228322029114, + "learning_rate": 0.00026576737908075667, + "loss": 0.137, + "step": 1114 + }, + { + "epoch": 1.97, + "grad_norm": 0.13206493854522705, + "learning_rate": 0.00026494418419978485, + "loss": 0.127, + "step": 1115 + }, + { + "epoch": 1.97, + "grad_norm": 0.050156012177467346, + "learning_rate": 0.0002641218064105341, + "loss": 0.1304, + "step": 1116 + }, + { + "epoch": 1.98, + "grad_norm": 0.22207175195217133, + "learning_rate": 0.0002633002485717219, + "loss": 0.1406, + "step": 1117 + }, + { + "epoch": 1.98, + "grad_norm": 0.25118494033813477, + "learning_rate": 0.0002624795135392148, + "loss": 0.1373, + "step": 1118 + }, + { + "epoch": 1.98, + "grad_norm": 0.09831628948450089, + "learning_rate": 0.00026165960416601943, + "loss": 0.1459, + "step": 1119 + }, + { + "epoch": 1.98, + "grad_norm": 0.037262722849845886, + "learning_rate": 0.00026084052330227237, + "loss": 0.1314, + "step": 1120 + }, + { + "epoch": 1.98, + "grad_norm": 0.05748564377427101, + "learning_rate": 0.0002600222737952299, + "loss": 0.1439, + "step": 1121 + }, + { + "epoch": 1.98, + "grad_norm": 0.06702205538749695, + "learning_rate": 0.00025920485848925914, + "loss": 0.1265, + "step": 1122 + }, + { + "epoch": 1.99, + "grad_norm": 0.11122670769691467, + "learning_rate": 0.00025838828022582596, + "loss": 0.1275, + "step": 1123 + }, + { + "epoch": 1.99, + "grad_norm": 0.1843162626028061, + "learning_rate": 0.0002575725418434878, + "loss": 0.1419, + "step": 1124 + }, + { + "epoch": 1.99, + "grad_norm": 0.11396101117134094, + "learning_rate": 0.00025675764617788234, + "loss": 0.1342, + "step": 1125 + }, + { + "epoch": 1.99, + "grad_norm": 0.043184638023376465, + "learning_rate": 0.00025594359606171725, + "loss": 0.1295, + "step": 1126 + }, + { + "epoch": 1.99, + "grad_norm": 0.15845736861228943, + "learning_rate": 0.0002551303943247619, + "loss": 0.1429, + "step": 1127 + }, + { + "epoch": 2.0, + "grad_norm": 0.1975352019071579, + "learning_rate": 0.0002543180437938352, + "loss": 0.1153, + "step": 1128 + }, + { + "epoch": 2.0, + "grad_norm": 0.13739655911922455, + "learning_rate": 0.00025350654729279834, + "loss": 0.1347, + "step": 1129 + }, + { + "epoch": 2.0, + "grad_norm": 0.2761506140232086, + "learning_rate": 0.0002526959076425434, + "loss": 0.147, + "step": 1130 + }, + { + "epoch": 2.0, + "grad_norm": 0.09016864746809006, + "learning_rate": 0.0002518861276609837, + "loss": 0.1298, + "step": 1131 + }, + { + "epoch": 2.0, + "grad_norm": 0.15213081240653992, + "learning_rate": 0.00025107721016304424, + "loss": 0.126, + "step": 1132 + }, + { + "epoch": 2.0, + "grad_norm": 0.10915911197662354, + "learning_rate": 0.00025026915796065233, + "loss": 0.1533, + "step": 1133 + }, + { + "epoch": 2.01, + "grad_norm": 0.21597544848918915, + "learning_rate": 0.0002494619738627266, + "loss": 0.1352, + "step": 1134 + }, + { + "epoch": 2.01, + "grad_norm": 0.14955481886863708, + "learning_rate": 0.00024865566067516895, + "loss": 0.115, + "step": 1135 + }, + { + "epoch": 2.01, + "grad_norm": 0.07370211184024811, + "learning_rate": 0.000247850221200853, + "loss": 0.1262, + "step": 1136 + }, + { + "epoch": 2.01, + "eval_loss": 0.13425856828689575, + "eval_runtime": 14.0067, + "eval_samples_per_second": 34.055, + "eval_steps_per_second": 8.567, + "step": 1136 + }, + { + "epoch": 2.01, + "grad_norm": 0.22561423480510712, + "learning_rate": 0.00024704565823961564, + "loss": 0.1442, + "step": 1137 + }, + { + "epoch": 2.01, + "grad_norm": 0.1496383398771286, + "learning_rate": 0.0002462419745882469, + "loss": 0.1367, + "step": 1138 + }, + { + "epoch": 2.02, + "grad_norm": 0.1709650307893753, + "learning_rate": 0.00024543917304047986, + "loss": 0.1334, + "step": 1139 + }, + { + "epoch": 2.02, + "grad_norm": 0.2110009342432022, + "learning_rate": 0.0002446372563869818, + "loss": 0.1387, + "step": 1140 + }, + { + "epoch": 2.02, + "grad_norm": 0.26138541102409363, + "learning_rate": 0.00024383622741534343, + "loss": 0.1366, + "step": 1141 + }, + { + "epoch": 2.02, + "grad_norm": 0.13308700919151306, + "learning_rate": 0.00024303608891006984, + "loss": 0.1272, + "step": 1142 + }, + { + "epoch": 2.02, + "grad_norm": 0.31121209263801575, + "learning_rate": 0.0002422368436525711, + "loss": 0.158, + "step": 1143 + }, + { + "epoch": 2.02, + "grad_norm": 0.09018420428037643, + "learning_rate": 0.00024143849442115158, + "loss": 0.1333, + "step": 1144 + }, + { + "epoch": 2.03, + "grad_norm": 0.19516992568969727, + "learning_rate": 0.00024064104399100167, + "loss": 0.1276, + "step": 1145 + }, + { + "epoch": 2.03, + "grad_norm": 0.22115693986415863, + "learning_rate": 0.00023984449513418687, + "loss": 0.1184, + "step": 1146 + }, + { + "epoch": 2.03, + "grad_norm": 0.08551298081874847, + "learning_rate": 0.00023904885061963843, + "loss": 0.1318, + "step": 1147 + }, + { + "epoch": 2.03, + "grad_norm": 0.27814337611198425, + "learning_rate": 0.00023825411321314489, + "loss": 0.1454, + "step": 1148 + }, + { + "epoch": 2.03, + "grad_norm": 0.39333808422088623, + "learning_rate": 0.0002374602856773404, + "loss": 0.1491, + "step": 1149 + }, + { + "epoch": 2.03, + "grad_norm": 0.1820652335882187, + "learning_rate": 0.00023666737077169726, + "loss": 0.1464, + "step": 1150 + }, + { + "epoch": 2.04, + "grad_norm": 0.21041136980056763, + "learning_rate": 0.00023587537125251468, + "loss": 0.1437, + "step": 1151 + }, + { + "epoch": 2.04, + "grad_norm": 0.22204861044883728, + "learning_rate": 0.00023508428987290987, + "loss": 0.1375, + "step": 1152 + }, + { + "epoch": 2.04, + "grad_norm": 0.1012771725654602, + "learning_rate": 0.00023429412938280898, + "loss": 0.1271, + "step": 1153 + }, + { + "epoch": 2.04, + "grad_norm": 0.23188555240631104, + "learning_rate": 0.0002335048925289362, + "loss": 0.1254, + "step": 1154 + }, + { + "epoch": 2.04, + "grad_norm": 0.22195938229560852, + "learning_rate": 0.00023271658205480588, + "loss": 0.1374, + "step": 1155 + }, + { + "epoch": 2.05, + "grad_norm": 0.24288509786128998, + "learning_rate": 0.00023192920070071144, + "loss": 0.1386, + "step": 1156 + }, + { + "epoch": 2.05, + "grad_norm": 0.0804426372051239, + "learning_rate": 0.00023114275120371657, + "loss": 0.128, + "step": 1157 + }, + { + "epoch": 2.05, + "grad_norm": 0.2349734604358673, + "learning_rate": 0.00023035723629764615, + "loss": 0.139, + "step": 1158 + }, + { + "epoch": 2.05, + "grad_norm": 0.21826253831386566, + "learning_rate": 0.0002295726587130761, + "loss": 0.1483, + "step": 1159 + }, + { + "epoch": 2.05, + "grad_norm": 0.12499336153268814, + "learning_rate": 0.0002287890211773238, + "loss": 0.1446, + "step": 1160 + }, + { + "epoch": 2.05, + "grad_norm": 0.22501634061336517, + "learning_rate": 0.00022800632641443902, + "loss": 0.1288, + "step": 1161 + }, + { + "epoch": 2.06, + "grad_norm": 0.123602956533432, + "learning_rate": 0.00022722457714519418, + "loss": 0.1287, + "step": 1162 + }, + { + "epoch": 2.06, + "grad_norm": 0.1110440120100975, + "learning_rate": 0.00022644377608707522, + "loss": 0.1375, + "step": 1163 + }, + { + "epoch": 2.06, + "grad_norm": 0.10080868005752563, + "learning_rate": 0.00022566392595427216, + "loss": 0.1236, + "step": 1164 + }, + { + "epoch": 2.06, + "grad_norm": 0.11740677803754807, + "learning_rate": 0.00022488502945766892, + "loss": 0.1362, + "step": 1165 + }, + { + "epoch": 2.06, + "grad_norm": 0.08558549731969833, + "learning_rate": 0.00022410708930483463, + "loss": 0.1525, + "step": 1166 + }, + { + "epoch": 2.06, + "grad_norm": 0.4037264883518219, + "learning_rate": 0.00022333010820001394, + "loss": 0.1366, + "step": 1167 + }, + { + "epoch": 2.07, + "grad_norm": 0.12114354968070984, + "learning_rate": 0.00022255408884411793, + "loss": 0.1386, + "step": 1168 + }, + { + "epoch": 2.07, + "grad_norm": 0.13077768683433533, + "learning_rate": 0.0002217790339347146, + "loss": 0.1281, + "step": 1169 + }, + { + "epoch": 2.07, + "grad_norm": 0.15883688628673553, + "learning_rate": 0.0002210049461660189, + "loss": 0.129, + "step": 1170 + }, + { + "epoch": 2.07, + "grad_norm": 0.1601915806531906, + "learning_rate": 0.00022023182822888398, + "loss": 0.1119, + "step": 1171 + }, + { + "epoch": 2.07, + "grad_norm": 0.17367267608642578, + "learning_rate": 0.0002194596828107921, + "loss": 0.147, + "step": 1172 + }, + { + "epoch": 2.08, + "grad_norm": 0.2555027902126312, + "learning_rate": 0.00021868851259584427, + "loss": 0.1495, + "step": 1173 + }, + { + "epoch": 2.08, + "grad_norm": 0.10203690081834793, + "learning_rate": 0.00021791832026475238, + "loss": 0.1348, + "step": 1174 + }, + { + "epoch": 2.08, + "grad_norm": 0.19506026804447174, + "learning_rate": 0.00021714910849482776, + "loss": 0.1332, + "step": 1175 + }, + { + "epoch": 2.08, + "grad_norm": 0.1914072483778, + "learning_rate": 0.00021638087995997442, + "loss": 0.1545, + "step": 1176 + }, + { + "epoch": 2.08, + "grad_norm": 0.1700693666934967, + "learning_rate": 0.00021561363733067795, + "loss": 0.1435, + "step": 1177 + }, + { + "epoch": 2.08, + "grad_norm": 0.11551807075738907, + "learning_rate": 0.00021484738327399682, + "loss": 0.1335, + "step": 1178 + }, + { + "epoch": 2.09, + "grad_norm": 0.039073869585990906, + "learning_rate": 0.0002140821204535529, + "loss": 0.1292, + "step": 1179 + }, + { + "epoch": 2.09, + "grad_norm": 0.1664426028728485, + "learning_rate": 0.00021331785152952244, + "loss": 0.1322, + "step": 1180 + }, + { + "epoch": 2.09, + "grad_norm": 0.10394078493118286, + "learning_rate": 0.00021255457915862692, + "loss": 0.1355, + "step": 1181 + }, + { + "epoch": 2.09, + "grad_norm": 0.23373393714427948, + "learning_rate": 0.00021179230599412374, + "loss": 0.1345, + "step": 1182 + }, + { + "epoch": 2.09, + "grad_norm": 0.19787871837615967, + "learning_rate": 0.00021103103468579653, + "loss": 0.1341, + "step": 1183 + }, + { + "epoch": 2.09, + "grad_norm": 0.1043769121170044, + "learning_rate": 0.00021027076787994632, + "loss": 0.1305, + "step": 1184 + }, + { + "epoch": 2.1, + "grad_norm": 0.14168131351470947, + "learning_rate": 0.00020951150821938276, + "loss": 0.1399, + "step": 1185 + }, + { + "epoch": 2.1, + "grad_norm": 0.2587750554084778, + "learning_rate": 0.0002087532583434139, + "loss": 0.1381, + "step": 1186 + }, + { + "epoch": 2.1, + "grad_norm": 0.13129281997680664, + "learning_rate": 0.00020799602088783837, + "loss": 0.1191, + "step": 1187 + }, + { + "epoch": 2.1, + "grad_norm": 0.1005392074584961, + "learning_rate": 0.00020723979848493473, + "loss": 0.1342, + "step": 1188 + }, + { + "epoch": 2.1, + "grad_norm": 0.10267619788646698, + "learning_rate": 0.0002064845937634533, + "loss": 0.1355, + "step": 1189 + }, + { + "epoch": 2.11, + "grad_norm": 0.06988910585641861, + "learning_rate": 0.00020573040934860715, + "loss": 0.1301, + "step": 1190 + }, + { + "epoch": 2.11, + "grad_norm": 0.10177844017744064, + "learning_rate": 0.00020497724786206183, + "loss": 0.1309, + "step": 1191 + }, + { + "epoch": 2.11, + "grad_norm": 0.10639530420303345, + "learning_rate": 0.000204225111921928, + "loss": 0.1201, + "step": 1192 + }, + { + "epoch": 2.11, + "grad_norm": 0.176835834980011, + "learning_rate": 0.00020347400414275058, + "loss": 0.134, + "step": 1193 + }, + { + "epoch": 2.11, + "grad_norm": 0.0938374251127243, + "learning_rate": 0.00020272392713550048, + "loss": 0.1424, + "step": 1194 + }, + { + "epoch": 2.11, + "grad_norm": 0.0616929791867733, + "learning_rate": 0.00020197488350756615, + "loss": 0.1422, + "step": 1195 + }, + { + "epoch": 2.12, + "grad_norm": 0.07180429995059967, + "learning_rate": 0.00020122687586274297, + "loss": 0.1355, + "step": 1196 + }, + { + "epoch": 2.12, + "grad_norm": 0.1905076801776886, + "learning_rate": 0.00020047990680122584, + "loss": 0.1155, + "step": 1197 + }, + { + "epoch": 2.12, + "grad_norm": 0.3793567717075348, + "learning_rate": 0.00019973397891959893, + "loss": 0.1582, + "step": 1198 + }, + { + "epoch": 2.12, + "grad_norm": 0.27994590997695923, + "learning_rate": 0.000198989094810827, + "loss": 0.1186, + "step": 1199 + }, + { + "epoch": 2.12, + "grad_norm": 0.0879906564950943, + "learning_rate": 0.000198245257064247, + "loss": 0.1313, + "step": 1200 + }, + { + "epoch": 2.12, + "grad_norm": 0.20328675210475922, + "learning_rate": 0.000197502468265558, + "loss": 0.1494, + "step": 1201 + }, + { + "epoch": 2.13, + "grad_norm": 0.12098411470651627, + "learning_rate": 0.00019676073099681335, + "loss": 0.1201, + "step": 1202 + }, + { + "epoch": 2.13, + "grad_norm": 0.1231808289885521, + "learning_rate": 0.00019602004783641054, + "loss": 0.1374, + "step": 1203 + }, + { + "epoch": 2.13, + "grad_norm": 0.14193227887153625, + "learning_rate": 0.00019528042135908292, + "loss": 0.14, + "step": 1204 + }, + { + "epoch": 2.13, + "grad_norm": 0.2550235688686371, + "learning_rate": 0.0001945418541358911, + "loss": 0.1289, + "step": 1205 + }, + { + "epoch": 2.13, + "grad_norm": 0.05161747708916664, + "learning_rate": 0.00019380434873421292, + "loss": 0.1208, + "step": 1206 + }, + { + "epoch": 2.14, + "grad_norm": 0.40697818994522095, + "learning_rate": 0.00019306790771773575, + "loss": 0.107, + "step": 1207 + }, + { + "epoch": 2.14, + "grad_norm": 0.09957734495401382, + "learning_rate": 0.00019233253364644653, + "loss": 0.1258, + "step": 1208 + }, + { + "epoch": 2.14, + "grad_norm": 0.15506863594055176, + "learning_rate": 0.00019159822907662333, + "loss": 0.137, + "step": 1209 + }, + { + "epoch": 2.14, + "grad_norm": 0.11825218796730042, + "learning_rate": 0.00019086499656082684, + "loss": 0.1299, + "step": 1210 + }, + { + "epoch": 2.14, + "grad_norm": 0.3029904365539551, + "learning_rate": 0.00019013283864789104, + "loss": 0.1136, + "step": 1211 + }, + { + "epoch": 2.14, + "grad_norm": 0.11233114451169968, + "learning_rate": 0.00018940175788291407, + "loss": 0.125, + "step": 1212 + }, + { + "epoch": 2.15, + "grad_norm": 0.16029077768325806, + "learning_rate": 0.00018867175680725002, + "loss": 0.1713, + "step": 1213 + }, + { + "epoch": 2.15, + "grad_norm": 0.1274372637271881, + "learning_rate": 0.0001879428379584995, + "loss": 0.0996, + "step": 1214 + }, + { + "epoch": 2.15, + "grad_norm": 0.1937287449836731, + "learning_rate": 0.0001872150038705015, + "loss": 0.1246, + "step": 1215 + }, + { + "epoch": 2.15, + "grad_norm": 0.1734904944896698, + "learning_rate": 0.00018648825707332433, + "loss": 0.1698, + "step": 1216 + }, + { + "epoch": 2.15, + "grad_norm": 0.31292834877967834, + "learning_rate": 0.0001857626000932562, + "loss": 0.1284, + "step": 1217 + }, + { + "epoch": 2.15, + "grad_norm": 0.11059171706438065, + "learning_rate": 0.0001850380354527972, + "loss": 0.1039, + "step": 1218 + }, + { + "epoch": 2.16, + "grad_norm": 0.12312529236078262, + "learning_rate": 0.00018431456567065003, + "loss": 0.1436, + "step": 1219 + }, + { + "epoch": 2.16, + "grad_norm": 0.2583165168762207, + "learning_rate": 0.0001835921932617119, + "loss": 0.113, + "step": 1220 + }, + { + "epoch": 2.16, + "grad_norm": 0.2463943064212799, + "learning_rate": 0.00018287092073706557, + "loss": 0.1373, + "step": 1221 + }, + { + "epoch": 2.16, + "grad_norm": 0.23069016635417938, + "learning_rate": 0.0001821507506039693, + "loss": 0.1473, + "step": 1222 + }, + { + "epoch": 2.16, + "grad_norm": 0.057799965143203735, + "learning_rate": 0.0001814316853658503, + "loss": 0.1089, + "step": 1223 + }, + { + "epoch": 2.17, + "grad_norm": 0.14288663864135742, + "learning_rate": 0.00018071372752229497, + "loss": 0.1168, + "step": 1224 + }, + { + "epoch": 2.17, + "grad_norm": 0.1829814910888672, + "learning_rate": 0.00017999687956903954, + "loss": 0.1232, + "step": 1225 + }, + { + "epoch": 2.17, + "grad_norm": 0.07779528200626373, + "learning_rate": 0.00017928114399796296, + "loss": 0.1319, + "step": 1226 + }, + { + "epoch": 2.17, + "grad_norm": 0.12610167264938354, + "learning_rate": 0.00017856652329707623, + "loss": 0.1499, + "step": 1227 + }, + { + "epoch": 2.17, + "grad_norm": 0.08324690163135529, + "learning_rate": 0.00017785301995051588, + "loss": 0.1164, + "step": 1228 + }, + { + "epoch": 2.17, + "grad_norm": 0.16664673388004303, + "learning_rate": 0.00017714063643853423, + "loss": 0.1102, + "step": 1229 + }, + { + "epoch": 2.18, + "grad_norm": 0.47252926230430603, + "learning_rate": 0.00017642937523749036, + "loss": 0.1267, + "step": 1230 + }, + { + "epoch": 2.18, + "grad_norm": 0.24932174384593964, + "learning_rate": 0.00017571923881984236, + "loss": 0.0999, + "step": 1231 + }, + { + "epoch": 2.18, + "grad_norm": 0.1122673973441124, + "learning_rate": 0.0001750102296541382, + "loss": 0.1149, + "step": 1232 + }, + { + "epoch": 2.18, + "grad_norm": 0.03139616549015045, + "learning_rate": 0.00017430235020500756, + "loss": 0.14, + "step": 1233 + }, + { + "epoch": 2.18, + "grad_norm": 0.1314571052789688, + "learning_rate": 0.0001735956029331532, + "loss": 0.1316, + "step": 1234 + }, + { + "epoch": 2.18, + "grad_norm": 0.12405366450548172, + "learning_rate": 0.00017288999029534176, + "loss": 0.1066, + "step": 1235 + }, + { + "epoch": 2.19, + "grad_norm": 0.12566903233528137, + "learning_rate": 0.00017218551474439586, + "loss": 0.1217, + "step": 1236 + }, + { + "epoch": 2.19, + "grad_norm": 0.136549711227417, + "learning_rate": 0.0001714821787291858, + "loss": 0.1242, + "step": 1237 + }, + { + "epoch": 2.19, + "grad_norm": 0.049642208963632584, + "learning_rate": 0.00017077998469462009, + "loss": 0.1343, + "step": 1238 + }, + { + "epoch": 2.19, + "grad_norm": 0.2866723835468292, + "learning_rate": 0.0001700789350816382, + "loss": 0.1365, + "step": 1239 + }, + { + "epoch": 2.19, + "grad_norm": 0.12442098557949066, + "learning_rate": 0.00016937903232720075, + "loss": 0.1512, + "step": 1240 + }, + { + "epoch": 2.2, + "grad_norm": 0.11928176134824753, + "learning_rate": 0.00016868027886428194, + "loss": 0.147, + "step": 1241 + }, + { + "epoch": 2.2, + "grad_norm": 0.09463697671890259, + "learning_rate": 0.00016798267712186122, + "loss": 0.117, + "step": 1242 + }, + { + "epoch": 2.2, + "grad_norm": 0.06501590460538864, + "learning_rate": 0.0001672862295249138, + "loss": 0.1508, + "step": 1243 + }, + { + "epoch": 2.2, + "grad_norm": 0.30164363980293274, + "learning_rate": 0.00016659093849440354, + "loss": 0.1501, + "step": 1244 + }, + { + "epoch": 2.2, + "grad_norm": 0.15526002645492554, + "learning_rate": 0.00016589680644727345, + "loss": 0.1242, + "step": 1245 + }, + { + "epoch": 2.2, + "grad_norm": 0.07917926460504532, + "learning_rate": 0.00016520383579643767, + "loss": 0.1314, + "step": 1246 + }, + { + "epoch": 2.21, + "grad_norm": 0.10616712272167206, + "learning_rate": 0.00016451202895077378, + "loss": 0.1462, + "step": 1247 + }, + { + "epoch": 2.21, + "grad_norm": 0.1220119372010231, + "learning_rate": 0.0001638213883151129, + "loss": 0.1191, + "step": 1248 + }, + { + "epoch": 2.21, + "grad_norm": 0.10847750306129456, + "learning_rate": 0.0001631319162902331, + "loss": 0.0983, + "step": 1249 + }, + { + "epoch": 2.21, + "grad_norm": 0.03508616238832474, + "learning_rate": 0.0001624436152728495, + "loss": 0.1238, + "step": 1250 + }, + { + "epoch": 2.21, + "grad_norm": 0.05642473325133324, + "learning_rate": 0.0001617564876556067, + "loss": 0.1381, + "step": 1251 + }, + { + "epoch": 2.21, + "grad_norm": 0.12747938930988312, + "learning_rate": 0.0001610705358270708, + "loss": 0.1165, + "step": 1252 + }, + { + "epoch": 2.22, + "grad_norm": 0.030638879165053368, + "learning_rate": 0.0001603857621717202, + "loss": 0.1415, + "step": 1253 + }, + { + "epoch": 2.22, + "grad_norm": 0.08363982290029526, + "learning_rate": 0.00015970216906993818, + "loss": 0.1193, + "step": 1254 + }, + { + "epoch": 2.22, + "grad_norm": 0.04466724768280983, + "learning_rate": 0.00015901975889800386, + "loss": 0.1214, + "step": 1255 + }, + { + "epoch": 2.22, + "grad_norm": 0.1267317682504654, + "learning_rate": 0.00015833853402808434, + "loss": 0.1391, + "step": 1256 + }, + { + "epoch": 2.22, + "grad_norm": 0.18954621255397797, + "learning_rate": 0.00015765849682822686, + "loss": 0.1665, + "step": 1257 + }, + { + "epoch": 2.23, + "grad_norm": 0.10698127746582031, + "learning_rate": 0.00015697964966234946, + "loss": 0.1019, + "step": 1258 + }, + { + "epoch": 2.23, + "grad_norm": 0.12424101680517197, + "learning_rate": 0.00015630199489023415, + "loss": 0.1591, + "step": 1259 + }, + { + "epoch": 2.23, + "grad_norm": 0.0653344914317131, + "learning_rate": 0.0001556255348675174, + "loss": 0.1536, + "step": 1260 + }, + { + "epoch": 2.23, + "grad_norm": 0.04556593671441078, + "learning_rate": 0.0001549502719456827, + "loss": 0.1502, + "step": 1261 + }, + { + "epoch": 2.23, + "grad_norm": 0.10966439545154572, + "learning_rate": 0.00015427620847205238, + "loss": 0.1322, + "step": 1262 + }, + { + "epoch": 2.23, + "grad_norm": 0.21205949783325195, + "learning_rate": 0.00015360334678977933, + "loss": 0.1374, + "step": 1263 + }, + { + "epoch": 2.24, + "grad_norm": 0.3078778088092804, + "learning_rate": 0.00015293168923783857, + "loss": 0.1303, + "step": 1264 + }, + { + "epoch": 2.24, + "grad_norm": 0.22372427582740784, + "learning_rate": 0.0001522612381510195, + "loss": 0.1334, + "step": 1265 + }, + { + "epoch": 2.24, + "grad_norm": 0.07522010058164597, + "learning_rate": 0.00015159199585991744, + "loss": 0.1315, + "step": 1266 + }, + { + "epoch": 2.24, + "grad_norm": 0.10713458061218262, + "learning_rate": 0.00015092396469092617, + "loss": 0.1181, + "step": 1267 + }, + { + "epoch": 2.24, + "grad_norm": 0.14708364009857178, + "learning_rate": 0.00015025714696622933, + "loss": 0.1246, + "step": 1268 + }, + { + "epoch": 2.25, + "grad_norm": 0.050081461668014526, + "learning_rate": 0.00014959154500379212, + "loss": 0.1329, + "step": 1269 + }, + { + "epoch": 2.25, + "grad_norm": 0.15830601751804352, + "learning_rate": 0.00014892716111735376, + "loss": 0.1372, + "step": 1270 + }, + { + "epoch": 2.25, + "grad_norm": 0.0707254484295845, + "learning_rate": 0.00014826399761641907, + "loss": 0.1235, + "step": 1271 + }, + { + "epoch": 2.25, + "grad_norm": 0.1088636964559555, + "learning_rate": 0.00014760205680625083, + "loss": 0.1439, + "step": 1272 + }, + { + "epoch": 2.25, + "grad_norm": 0.07059821486473083, + "learning_rate": 0.00014694134098786182, + "loss": 0.1359, + "step": 1273 + }, + { + "epoch": 2.25, + "grad_norm": 0.2451217770576477, + "learning_rate": 0.0001462818524580057, + "loss": 0.1592, + "step": 1274 + }, + { + "epoch": 2.26, + "grad_norm": 0.08877355605363846, + "learning_rate": 0.00014562359350917055, + "loss": 0.1267, + "step": 1275 + }, + { + "epoch": 2.26, + "grad_norm": 0.09591105580329895, + "learning_rate": 0.00014496656642957025, + "loss": 0.1461, + "step": 1276 + }, + { + "epoch": 2.26, + "grad_norm": 0.12581078708171844, + "learning_rate": 0.0001443107735031361, + "loss": 0.1357, + "step": 1277 + }, + { + "epoch": 2.26, + "grad_norm": 0.04613172635436058, + "learning_rate": 0.00014365621700950987, + "loss": 0.12, + "step": 1278 + }, + { + "epoch": 2.26, + "eval_loss": 0.13804392516613007, + "eval_runtime": 14.0836, + "eval_samples_per_second": 33.869, + "eval_steps_per_second": 8.521, + "step": 1278 + }, + { + "epoch": 2.26, + "grad_norm": 0.0844263806939125, + "learning_rate": 0.0001430028992240344, + "loss": 0.1243, + "step": 1279 + }, + { + "epoch": 2.26, + "grad_norm": 0.22574763000011444, + "learning_rate": 0.0001423508224177474, + "loss": 0.1461, + "step": 1280 + }, + { + "epoch": 2.27, + "grad_norm": 0.2293427437543869, + "learning_rate": 0.00014169998885737257, + "loss": 0.1612, + "step": 1281 + }, + { + "epoch": 2.27, + "grad_norm": 0.05698062479496002, + "learning_rate": 0.00014105040080531161, + "loss": 0.1343, + "step": 1282 + }, + { + "epoch": 2.27, + "grad_norm": 0.13387730717658997, + "learning_rate": 0.00014040206051963678, + "loss": 0.1132, + "step": 1283 + }, + { + "epoch": 2.27, + "grad_norm": 0.24633343517780304, + "learning_rate": 0.00013975497025408285, + "loss": 0.1122, + "step": 1284 + }, + { + "epoch": 2.27, + "grad_norm": 0.03243758901953697, + "learning_rate": 0.00013910913225803945, + "loss": 0.1229, + "step": 1285 + }, + { + "epoch": 2.28, + "grad_norm": 0.10440926253795624, + "learning_rate": 0.00013846454877654318, + "loss": 0.1347, + "step": 1286 + }, + { + "epoch": 2.28, + "grad_norm": 0.1835152506828308, + "learning_rate": 0.00013782122205026958, + "loss": 0.1554, + "step": 1287 + }, + { + "epoch": 2.28, + "grad_norm": 0.02530871145427227, + "learning_rate": 0.0001371791543155253, + "loss": 0.1306, + "step": 1288 + }, + { + "epoch": 2.28, + "grad_norm": 0.02593812346458435, + "learning_rate": 0.00013653834780424112, + "loss": 0.1425, + "step": 1289 + }, + { + "epoch": 2.28, + "grad_norm": 0.13932591676712036, + "learning_rate": 0.000135898804743963, + "loss": 0.1314, + "step": 1290 + }, + { + "epoch": 2.28, + "grad_norm": 0.06134510040283203, + "learning_rate": 0.00013526052735784538, + "loss": 0.1215, + "step": 1291 + }, + { + "epoch": 2.29, + "grad_norm": 0.12832637131214142, + "learning_rate": 0.00013462351786464283, + "loss": 0.1293, + "step": 1292 + }, + { + "epoch": 2.29, + "grad_norm": 0.1946830302476883, + "learning_rate": 0.00013398777847870236, + "loss": 0.1383, + "step": 1293 + }, + { + "epoch": 2.29, + "grad_norm": 0.07782842963933945, + "learning_rate": 0.00013335331140995626, + "loss": 0.1397, + "step": 1294 + }, + { + "epoch": 2.29, + "grad_norm": 0.12014558911323547, + "learning_rate": 0.00013272011886391366, + "loss": 0.1342, + "step": 1295 + }, + { + "epoch": 2.29, + "grad_norm": 0.3111260235309601, + "learning_rate": 0.00013208820304165365, + "loss": 0.1425, + "step": 1296 + }, + { + "epoch": 2.29, + "grad_norm": 0.10395296663045883, + "learning_rate": 0.0001314575661398168, + "loss": 0.1355, + "step": 1297 + }, + { + "epoch": 2.3, + "grad_norm": 0.14676041901111603, + "learning_rate": 0.00013082821035059812, + "loss": 0.1315, + "step": 1298 + }, + { + "epoch": 2.3, + "grad_norm": 0.1285468190908432, + "learning_rate": 0.00013020013786173946, + "loss": 0.1381, + "step": 1299 + }, + { + "epoch": 2.3, + "grad_norm": 0.1933162659406662, + "learning_rate": 0.0001295733508565213, + "loss": 0.1389, + "step": 1300 + }, + { + "epoch": 2.3, + "grad_norm": 0.10214361548423767, + "learning_rate": 0.0001289478515137561, + "loss": 0.1317, + "step": 1301 + }, + { + "epoch": 2.3, + "grad_norm": 0.07621518522500992, + "learning_rate": 0.0001283236420077798, + "loss": 0.1316, + "step": 1302 + }, + { + "epoch": 2.31, + "grad_norm": 0.19626323878765106, + "learning_rate": 0.0001277007245084446, + "loss": 0.1365, + "step": 1303 + }, + { + "epoch": 2.31, + "grad_norm": 0.14391857385635376, + "learning_rate": 0.00012707910118111193, + "loss": 0.1284, + "step": 1304 + }, + { + "epoch": 2.31, + "grad_norm": 0.346746563911438, + "learning_rate": 0.0001264587741866439, + "loss": 0.1354, + "step": 1305 + }, + { + "epoch": 2.31, + "grad_norm": 0.14903244376182556, + "learning_rate": 0.00012583974568139699, + "loss": 0.1351, + "step": 1306 + }, + { + "epoch": 2.31, + "grad_norm": 0.11248177289962769, + "learning_rate": 0.00012522201781721336, + "loss": 0.1264, + "step": 1307 + }, + { + "epoch": 2.31, + "grad_norm": 0.16379296779632568, + "learning_rate": 0.00012460559274141407, + "loss": 0.1302, + "step": 1308 + }, + { + "epoch": 2.32, + "grad_norm": 0.07437599450349808, + "learning_rate": 0.00012399047259679182, + "loss": 0.1368, + "step": 1309 + }, + { + "epoch": 2.32, + "grad_norm": 0.035767342895269394, + "learning_rate": 0.00012337665952160266, + "loss": 0.1329, + "step": 1310 + }, + { + "epoch": 2.32, + "grad_norm": 0.1937914788722992, + "learning_rate": 0.00012276415564955952, + "loss": 0.1353, + "step": 1311 + }, + { + "epoch": 2.32, + "grad_norm": 0.08204084634780884, + "learning_rate": 0.00012215296310982404, + "loss": 0.1349, + "step": 1312 + }, + { + "epoch": 2.32, + "grad_norm": 0.08185072988271713, + "learning_rate": 0.00012154308402699932, + "loss": 0.1279, + "step": 1313 + }, + { + "epoch": 2.32, + "grad_norm": 0.12454118579626083, + "learning_rate": 0.00012093452052112308, + "loss": 0.143, + "step": 1314 + }, + { + "epoch": 2.33, + "grad_norm": 0.047802336513996124, + "learning_rate": 0.0001203272747076598, + "loss": 0.1268, + "step": 1315 + }, + { + "epoch": 2.33, + "grad_norm": 0.1269330382347107, + "learning_rate": 0.00011972134869749324, + "loss": 0.1307, + "step": 1316 + }, + { + "epoch": 2.33, + "grad_norm": 0.04310709610581398, + "learning_rate": 0.0001191167445969193, + "loss": 0.1361, + "step": 1317 + }, + { + "epoch": 2.33, + "grad_norm": 0.11329284310340881, + "learning_rate": 0.00011851346450763878, + "loss": 0.1275, + "step": 1318 + }, + { + "epoch": 2.33, + "grad_norm": 0.19605499505996704, + "learning_rate": 0.0001179115105267502, + "loss": 0.1276, + "step": 1319 + }, + { + "epoch": 2.34, + "grad_norm": 0.07777590304613113, + "learning_rate": 0.00011731088474674234, + "loss": 0.1266, + "step": 1320 + }, + { + "epoch": 2.34, + "grad_norm": 0.36323878169059753, + "learning_rate": 0.00011671158925548624, + "loss": 0.1324, + "step": 1321 + }, + { + "epoch": 2.34, + "grad_norm": 0.10665354877710342, + "learning_rate": 0.0001161136261362296, + "loss": 0.1274, + "step": 1322 + }, + { + "epoch": 2.34, + "grad_norm": 0.10488320887088776, + "learning_rate": 0.00011551699746758787, + "loss": 0.1244, + "step": 1323 + }, + { + "epoch": 2.34, + "grad_norm": 0.0471271350979805, + "learning_rate": 0.00011492170532353813, + "loss": 0.1326, + "step": 1324 + }, + { + "epoch": 2.34, + "grad_norm": 0.09352532774209976, + "learning_rate": 0.00011432775177341165, + "loss": 0.1407, + "step": 1325 + }, + { + "epoch": 2.35, + "grad_norm": 0.03449404612183571, + "learning_rate": 0.00011373513888188564, + "loss": 0.1328, + "step": 1326 + }, + { + "epoch": 2.35, + "grad_norm": 0.09462067484855652, + "learning_rate": 0.00011314386870897792, + "loss": 0.141, + "step": 1327 + }, + { + "epoch": 2.35, + "grad_norm": 0.06903573870658875, + "learning_rate": 0.00011255394331003854, + "loss": 0.1169, + "step": 1328 + }, + { + "epoch": 2.35, + "grad_norm": 0.030416639521718025, + "learning_rate": 0.00011196536473574276, + "loss": 0.1235, + "step": 1329 + }, + { + "epoch": 2.35, + "grad_norm": 0.2395412176847458, + "learning_rate": 0.00011137813503208399, + "loss": 0.145, + "step": 1330 + }, + { + "epoch": 2.35, + "grad_norm": 0.2817445695400238, + "learning_rate": 0.00011079225624036688, + "loss": 0.1249, + "step": 1331 + }, + { + "epoch": 2.36, + "grad_norm": 0.17817607522010803, + "learning_rate": 0.00011020773039720017, + "loss": 0.1458, + "step": 1332 + }, + { + "epoch": 2.36, + "grad_norm": 0.03026886098086834, + "learning_rate": 0.00010962455953448952, + "loss": 0.1258, + "step": 1333 + }, + { + "epoch": 2.36, + "grad_norm": 0.07572410255670547, + "learning_rate": 0.0001090427456794304, + "loss": 0.1217, + "step": 1334 + }, + { + "epoch": 2.36, + "grad_norm": 0.06871062517166138, + "learning_rate": 0.00010846229085450099, + "loss": 0.1402, + "step": 1335 + }, + { + "epoch": 2.36, + "grad_norm": 0.2113131284713745, + "learning_rate": 0.00010788319707745525, + "loss": 0.1276, + "step": 1336 + }, + { + "epoch": 2.37, + "grad_norm": 0.20129992067813873, + "learning_rate": 0.00010730546636131621, + "loss": 0.1118, + "step": 1337 + }, + { + "epoch": 2.37, + "grad_norm": 0.05704864114522934, + "learning_rate": 0.00010672910071436865, + "loss": 0.1203, + "step": 1338 + }, + { + "epoch": 2.37, + "grad_norm": 0.058348219841718674, + "learning_rate": 0.00010615410214015186, + "loss": 0.1218, + "step": 1339 + }, + { + "epoch": 2.37, + "grad_norm": 0.15510503947734833, + "learning_rate": 0.00010558047263745297, + "loss": 0.1207, + "step": 1340 + }, + { + "epoch": 2.37, + "grad_norm": 0.2903478741645813, + "learning_rate": 0.00010500821420030049, + "loss": 0.1317, + "step": 1341 + }, + { + "epoch": 2.37, + "grad_norm": 0.18940383195877075, + "learning_rate": 0.00010443732881795614, + "loss": 0.1333, + "step": 1342 + }, + { + "epoch": 2.38, + "grad_norm": 0.08819776773452759, + "learning_rate": 0.0001038678184749095, + "loss": 0.1396, + "step": 1343 + }, + { + "epoch": 2.38, + "grad_norm": 0.09822792559862137, + "learning_rate": 0.00010329968515086969, + "loss": 0.1326, + "step": 1344 + }, + { + "epoch": 2.38, + "grad_norm": 0.06949262320995331, + "learning_rate": 0.00010273293082075913, + "loss": 0.1459, + "step": 1345 + }, + { + "epoch": 2.38, + "grad_norm": 0.1197618618607521, + "learning_rate": 0.00010216755745470701, + "loss": 0.1171, + "step": 1346 + }, + { + "epoch": 2.38, + "grad_norm": 0.12239718437194824, + "learning_rate": 0.00010160356701804169, + "loss": 0.1496, + "step": 1347 + }, + { + "epoch": 2.38, + "grad_norm": 0.04563026875257492, + "learning_rate": 0.00010104096147128455, + "loss": 0.1425, + "step": 1348 + }, + { + "epoch": 2.39, + "grad_norm": 0.14311982691287994, + "learning_rate": 0.00010047974277014266, + "loss": 0.1313, + "step": 1349 + }, + { + "epoch": 2.39, + "grad_norm": 0.05187974497675896, + "learning_rate": 9.991991286550207e-05, + "loss": 0.1278, + "step": 1350 + }, + { + "epoch": 2.39, + "grad_norm": 0.14110100269317627, + "learning_rate": 9.936147370342164e-05, + "loss": 0.1132, + "step": 1351 + }, + { + "epoch": 2.39, + "grad_norm": 0.05993328616023064, + "learning_rate": 9.880442722512518e-05, + "loss": 0.1409, + "step": 1352 + }, + { + "epoch": 2.39, + "grad_norm": 0.12421682476997375, + "learning_rate": 9.824877536699584e-05, + "loss": 0.1564, + "step": 1353 + }, + { + "epoch": 2.4, + "grad_norm": 0.11717087775468826, + "learning_rate": 9.769452006056856e-05, + "loss": 0.1205, + "step": 1354 + }, + { + "epoch": 2.4, + "grad_norm": 0.13294954597949982, + "learning_rate": 9.714166323252349e-05, + "loss": 0.1439, + "step": 1355 + }, + { + "epoch": 2.4, + "grad_norm": 0.03808373585343361, + "learning_rate": 9.659020680468e-05, + "loss": 0.1139, + "step": 1356 + }, + { + "epoch": 2.4, + "grad_norm": 0.07248345017433167, + "learning_rate": 9.604015269398874e-05, + "loss": 0.1314, + "step": 1357 + }, + { + "epoch": 2.4, + "grad_norm": 0.16012993454933167, + "learning_rate": 9.549150281252633e-05, + "loss": 0.1408, + "step": 1358 + }, + { + "epoch": 2.4, + "grad_norm": 0.07186062633991241, + "learning_rate": 9.49442590674876e-05, + "loss": 0.1515, + "step": 1359 + }, + { + "epoch": 2.41, + "grad_norm": 0.06986256688833237, + "learning_rate": 9.439842336117954e-05, + "loss": 0.1054, + "step": 1360 + }, + { + "epoch": 2.41, + "grad_norm": 0.06326832622289658, + "learning_rate": 9.385399759101481e-05, + "loss": 0.121, + "step": 1361 + }, + { + "epoch": 2.41, + "grad_norm": 0.2067011445760727, + "learning_rate": 9.331098364950453e-05, + "loss": 0.094, + "step": 1362 + }, + { + "epoch": 2.41, + "grad_norm": 0.03115193173289299, + "learning_rate": 9.276938342425245e-05, + "loss": 0.1194, + "step": 1363 + }, + { + "epoch": 2.41, + "grad_norm": 0.050981197506189346, + "learning_rate": 9.222919879794772e-05, + "loss": 0.1483, + "step": 1364 + }, + { + "epoch": 2.41, + "grad_norm": 0.15227286517620087, + "learning_rate": 9.169043164835867e-05, + "loss": 0.1301, + "step": 1365 + }, + { + "epoch": 2.42, + "grad_norm": 0.18392544984817505, + "learning_rate": 9.115308384832638e-05, + "loss": 0.1309, + "step": 1366 + }, + { + "epoch": 2.42, + "grad_norm": 0.09350251406431198, + "learning_rate": 9.061715726575825e-05, + "loss": 0.1098, + "step": 1367 + }, + { + "epoch": 2.42, + "grad_norm": 0.19781382381916046, + "learning_rate": 9.008265376362079e-05, + "loss": 0.0971, + "step": 1368 + }, + { + "epoch": 2.42, + "grad_norm": 0.06422741711139679, + "learning_rate": 8.954957519993401e-05, + "loss": 0.116, + "step": 1369 + }, + { + "epoch": 2.42, + "grad_norm": 0.04665152728557587, + "learning_rate": 8.901792342776438e-05, + "loss": 0.102, + "step": 1370 + }, + { + "epoch": 2.43, + "grad_norm": 0.23038771748542786, + "learning_rate": 8.848770029521874e-05, + "loss": 0.1434, + "step": 1371 + }, + { + "epoch": 2.43, + "grad_norm": 0.06798720359802246, + "learning_rate": 8.795890764543818e-05, + "loss": 0.1251, + "step": 1372 + }, + { + "epoch": 2.43, + "grad_norm": 0.11675478518009186, + "learning_rate": 8.74315473165902e-05, + "loss": 0.1236, + "step": 1373 + }, + { + "epoch": 2.43, + "grad_norm": 0.0576016791164875, + "learning_rate": 8.690562114186423e-05, + "loss": 0.1292, + "step": 1374 + }, + { + "epoch": 2.43, + "grad_norm": 0.060148756951093674, + "learning_rate": 8.638113094946381e-05, + "loss": 0.0988, + "step": 1375 + }, + { + "epoch": 2.43, + "grad_norm": 0.18434777855873108, + "learning_rate": 8.585807856260108e-05, + "loss": 0.1204, + "step": 1376 + }, + { + "epoch": 2.44, + "grad_norm": 0.20784330368041992, + "learning_rate": 8.533646579949034e-05, + "loss": 0.1306, + "step": 1377 + }, + { + "epoch": 2.44, + "grad_norm": 0.14157481491565704, + "learning_rate": 8.481629447334066e-05, + "loss": 0.1467, + "step": 1378 + }, + { + "epoch": 2.44, + "grad_norm": 0.06923433393239975, + "learning_rate": 8.429756639235136e-05, + "loss": 0.1189, + "step": 1379 + }, + { + "epoch": 2.44, + "grad_norm": 0.24702174961566925, + "learning_rate": 8.37802833597045e-05, + "loss": 0.1472, + "step": 1380 + }, + { + "epoch": 2.44, + "grad_norm": 0.0554979182779789, + "learning_rate": 8.326444717355874e-05, + "loss": 0.1087, + "step": 1381 + }, + { + "epoch": 2.44, + "grad_norm": 0.11092767864465714, + "learning_rate": 8.275005962704346e-05, + "loss": 0.1191, + "step": 1382 + }, + { + "epoch": 2.45, + "grad_norm": 0.09958731383085251, + "learning_rate": 8.223712250825216e-05, + "loss": 0.1345, + "step": 1383 + }, + { + "epoch": 2.45, + "grad_norm": 0.19082792103290558, + "learning_rate": 8.172563760023665e-05, + "loss": 0.1056, + "step": 1384 + }, + { + "epoch": 2.45, + "grad_norm": 0.06456390023231506, + "learning_rate": 8.121560668100064e-05, + "loss": 0.1055, + "step": 1385 + }, + { + "epoch": 2.45, + "grad_norm": 0.10609104484319687, + "learning_rate": 8.070703152349334e-05, + "loss": 0.1202, + "step": 1386 + }, + { + "epoch": 2.45, + "grad_norm": 0.13664673268795013, + "learning_rate": 8.019991389560349e-05, + "loss": 0.1486, + "step": 1387 + }, + { + "epoch": 2.46, + "grad_norm": 0.24724584817886353, + "learning_rate": 7.969425556015325e-05, + "loss": 0.1453, + "step": 1388 + }, + { + "epoch": 2.46, + "grad_norm": 0.07616293430328369, + "learning_rate": 7.919005827489228e-05, + "loss": 0.1203, + "step": 1389 + }, + { + "epoch": 2.46, + "grad_norm": 0.10011781007051468, + "learning_rate": 7.868732379249122e-05, + "loss": 0.0969, + "step": 1390 + }, + { + "epoch": 2.46, + "grad_norm": 0.19106784462928772, + "learning_rate": 7.818605386053573e-05, + "loss": 0.1426, + "step": 1391 + }, + { + "epoch": 2.46, + "grad_norm": 0.17057418823242188, + "learning_rate": 7.768625022152037e-05, + "loss": 0.1412, + "step": 1392 + }, + { + "epoch": 2.46, + "grad_norm": 0.10925137251615524, + "learning_rate": 7.718791461284303e-05, + "loss": 0.1177, + "step": 1393 + }, + { + "epoch": 2.47, + "grad_norm": 0.16444699466228485, + "learning_rate": 7.669104876679795e-05, + "loss": 0.1238, + "step": 1394 + }, + { + "epoch": 2.47, + "grad_norm": 0.09140460193157196, + "learning_rate": 7.619565441057075e-05, + "loss": 0.127, + "step": 1395 + }, + { + "epoch": 2.47, + "grad_norm": 0.2647579312324524, + "learning_rate": 7.570173326623153e-05, + "loss": 0.1404, + "step": 1396 + }, + { + "epoch": 2.47, + "grad_norm": 0.11721226572990417, + "learning_rate": 7.520928705072938e-05, + "loss": 0.1432, + "step": 1397 + }, + { + "epoch": 2.47, + "grad_norm": 0.08702779561281204, + "learning_rate": 7.471831747588653e-05, + "loss": 0.0986, + "step": 1398 + }, + { + "epoch": 2.48, + "grad_norm": 0.09286800026893616, + "learning_rate": 7.422882624839178e-05, + "loss": 0.1113, + "step": 1399 + }, + { + "epoch": 2.48, + "grad_norm": 0.10280603915452957, + "learning_rate": 7.37408150697953e-05, + "loss": 0.1048, + "step": 1400 + }, + { + "epoch": 2.48, + "grad_norm": 0.20054741203784943, + "learning_rate": 7.325428563650222e-05, + "loss": 0.1496, + "step": 1401 + }, + { + "epoch": 2.48, + "grad_norm": 0.21980531513690948, + "learning_rate": 7.276923963976667e-05, + "loss": 0.1246, + "step": 1402 + }, + { + "epoch": 2.48, + "grad_norm": 0.16484405100345612, + "learning_rate": 7.228567876568665e-05, + "loss": 0.1257, + "step": 1403 + }, + { + "epoch": 2.48, + "grad_norm": 0.219330832362175, + "learning_rate": 7.180360469519714e-05, + "loss": 0.1329, + "step": 1404 + }, + { + "epoch": 2.49, + "grad_norm": 0.17407891154289246, + "learning_rate": 7.132301910406502e-05, + "loss": 0.1585, + "step": 1405 + }, + { + "epoch": 2.49, + "grad_norm": 0.09703540056943893, + "learning_rate": 7.084392366288295e-05, + "loss": 0.1161, + "step": 1406 + }, + { + "epoch": 2.49, + "grad_norm": 0.21986158192157745, + "learning_rate": 7.036632003706328e-05, + "loss": 0.1408, + "step": 1407 + }, + { + "epoch": 2.49, + "grad_norm": 0.16698057949543, + "learning_rate": 6.989020988683314e-05, + "loss": 0.1183, + "step": 1408 + }, + { + "epoch": 2.49, + "grad_norm": 0.09089485555887222, + "learning_rate": 6.941559486722748e-05, + "loss": 0.1138, + "step": 1409 + }, + { + "epoch": 2.49, + "grad_norm": 0.3572080731391907, + "learning_rate": 6.894247662808457e-05, + "loss": 0.1148, + "step": 1410 + }, + { + "epoch": 2.5, + "grad_norm": 0.21670278906822205, + "learning_rate": 6.847085681403914e-05, + "loss": 0.1081, + "step": 1411 + }, + { + "epoch": 2.5, + "grad_norm": 0.1353166103363037, + "learning_rate": 6.800073706451721e-05, + "loss": 0.1335, + "step": 1412 + }, + { + "epoch": 2.5, + "grad_norm": 0.10845667868852615, + "learning_rate": 6.753211901373064e-05, + "loss": 0.1462, + "step": 1413 + }, + { + "epoch": 2.5, + "grad_norm": 0.13182678818702698, + "learning_rate": 6.706500429067075e-05, + "loss": 0.137, + "step": 1414 + }, + { + "epoch": 2.5, + "grad_norm": 0.11666212975978851, + "learning_rate": 6.659939451910341e-05, + "loss": 0.1245, + "step": 1415 + }, + { + "epoch": 2.51, + "grad_norm": 0.19230018556118011, + "learning_rate": 6.613529131756285e-05, + "loss": 0.1392, + "step": 1416 + }, + { + "epoch": 2.51, + "grad_norm": 0.12369673699140549, + "learning_rate": 6.567269629934613e-05, + "loss": 0.152, + "step": 1417 + }, + { + "epoch": 2.51, + "grad_norm": 0.4312630891799927, + "learning_rate": 6.521161107250778e-05, + "loss": 0.1539, + "step": 1418 + }, + { + "epoch": 2.51, + "grad_norm": 0.12075989693403244, + "learning_rate": 6.475203723985418e-05, + "loss": 0.1429, + "step": 1419 + }, + { + "epoch": 2.51, + "grad_norm": 0.19320321083068848, + "learning_rate": 6.429397639893758e-05, + "loss": 0.146, + "step": 1420 + }, + { + "epoch": 2.51, + "eval_loss": 0.13353538513183594, + "eval_runtime": 14.1007, + "eval_samples_per_second": 33.828, + "eval_steps_per_second": 8.51, + "step": 1420 + }, + { + "epoch": 2.51, + "grad_norm": 2.2152562141418457, + "learning_rate": 6.38374301420509e-05, + "loss": 0.1597, + "step": 1421 + }, + { + "epoch": 2.52, + "grad_norm": 0.04021691530942917, + "learning_rate": 6.338240005622209e-05, + "loss": 0.1246, + "step": 1422 + }, + { + "epoch": 2.52, + "grad_norm": 0.06637288630008698, + "learning_rate": 6.292888772320882e-05, + "loss": 0.1187, + "step": 1423 + }, + { + "epoch": 2.52, + "grad_norm": 0.16920767724514008, + "learning_rate": 6.247689471949291e-05, + "loss": 0.1268, + "step": 1424 + }, + { + "epoch": 2.52, + "grad_norm": 0.22192606329917908, + "learning_rate": 6.202642261627411e-05, + "loss": 0.1477, + "step": 1425 + }, + { + "epoch": 2.52, + "grad_norm": 0.08232539147138596, + "learning_rate": 6.157747297946608e-05, + "loss": 0.1264, + "step": 1426 + }, + { + "epoch": 2.52, + "grad_norm": 0.08191093057394028, + "learning_rate": 6.113004736968953e-05, + "loss": 0.1459, + "step": 1427 + }, + { + "epoch": 2.53, + "grad_norm": 0.12319450080394745, + "learning_rate": 6.068414734226774e-05, + "loss": 0.105, + "step": 1428 + }, + { + "epoch": 2.53, + "grad_norm": 0.2650308907032013, + "learning_rate": 6.023977444722095e-05, + "loss": 0.1232, + "step": 1429 + }, + { + "epoch": 2.53, + "grad_norm": 0.13269290328025818, + "learning_rate": 5.979693022926025e-05, + "loss": 0.1118, + "step": 1430 + }, + { + "epoch": 2.53, + "grad_norm": 0.145817831158638, + "learning_rate": 5.935561622778335e-05, + "loss": 0.1142, + "step": 1431 + }, + { + "epoch": 2.53, + "grad_norm": 0.09145599603652954, + "learning_rate": 5.891583397686862e-05, + "loss": 0.1285, + "step": 1432 + }, + { + "epoch": 2.54, + "grad_norm": 0.11915401369333267, + "learning_rate": 5.8477585005269564e-05, + "loss": 0.1172, + "step": 1433 + }, + { + "epoch": 2.54, + "grad_norm": 0.13061662018299103, + "learning_rate": 5.804087083641002e-05, + "loss": 0.1401, + "step": 1434 + }, + { + "epoch": 2.54, + "grad_norm": 0.08747705817222595, + "learning_rate": 5.760569298837826e-05, + "loss": 0.1315, + "step": 1435 + }, + { + "epoch": 2.54, + "grad_norm": 0.08562975376844406, + "learning_rate": 5.717205297392247e-05, + "loss": 0.122, + "step": 1436 + }, + { + "epoch": 2.54, + "grad_norm": 0.12241534143686295, + "learning_rate": 5.673995230044498e-05, + "loss": 0.1247, + "step": 1437 + }, + { + "epoch": 2.54, + "grad_norm": 0.044354431331157684, + "learning_rate": 5.6309392469996944e-05, + "loss": 0.123, + "step": 1438 + }, + { + "epoch": 2.55, + "grad_norm": 0.06913258135318756, + "learning_rate": 5.5880374979273395e-05, + "loss": 0.1275, + "step": 1439 + }, + { + "epoch": 2.55, + "grad_norm": 0.0650060847401619, + "learning_rate": 5.5452901319607894e-05, + "loss": 0.1066, + "step": 1440 + }, + { + "epoch": 2.55, + "grad_norm": 0.14583538472652435, + "learning_rate": 5.502697297696746e-05, + "loss": 0.157, + "step": 1441 + }, + { + "epoch": 2.55, + "grad_norm": 0.12845255434513092, + "learning_rate": 5.4602591431947514e-05, + "loss": 0.1498, + "step": 1442 + }, + { + "epoch": 2.55, + "grad_norm": 0.08573470264673233, + "learning_rate": 5.417975815976628e-05, + "loss": 0.1014, + "step": 1443 + }, + { + "epoch": 2.55, + "grad_norm": 0.21065488457679749, + "learning_rate": 5.37584746302599e-05, + "loss": 0.168, + "step": 1444 + }, + { + "epoch": 2.56, + "grad_norm": 0.06427565217018127, + "learning_rate": 5.333874230787772e-05, + "loss": 0.1166, + "step": 1445 + }, + { + "epoch": 2.56, + "grad_norm": 0.15018604695796967, + "learning_rate": 5.292056265167644e-05, + "loss": 0.1257, + "step": 1446 + }, + { + "epoch": 2.56, + "grad_norm": 0.17715144157409668, + "learning_rate": 5.2503937115315816e-05, + "loss": 0.1457, + "step": 1447 + }, + { + "epoch": 2.56, + "grad_norm": 0.19283372163772583, + "learning_rate": 5.208886714705291e-05, + "loss": 0.1431, + "step": 1448 + }, + { + "epoch": 2.56, + "grad_norm": 0.1359899789094925, + "learning_rate": 5.16753541897374e-05, + "loss": 0.1388, + "step": 1449 + }, + { + "epoch": 2.57, + "grad_norm": 0.09452392160892487, + "learning_rate": 5.126339968080695e-05, + "loss": 0.1245, + "step": 1450 + }, + { + "epoch": 2.57, + "grad_norm": 0.043228164315223694, + "learning_rate": 5.085300505228124e-05, + "loss": 0.1291, + "step": 1451 + }, + { + "epoch": 2.57, + "grad_norm": 0.04729638248682022, + "learning_rate": 5.0444171730758046e-05, + "loss": 0.1374, + "step": 1452 + }, + { + "epoch": 2.57, + "grad_norm": 0.1768418550491333, + "learning_rate": 5.00369011374075e-05, + "loss": 0.1232, + "step": 1453 + }, + { + "epoch": 2.57, + "grad_norm": 0.040266744792461395, + "learning_rate": 4.9631194687967394e-05, + "loss": 0.1337, + "step": 1454 + }, + { + "epoch": 2.57, + "grad_norm": 0.0584387332201004, + "learning_rate": 4.9227053792738616e-05, + "loss": 0.151, + "step": 1455 + }, + { + "epoch": 2.58, + "grad_norm": 0.06620542705059052, + "learning_rate": 4.882447985657956e-05, + "loss": 0.1306, + "step": 1456 + }, + { + "epoch": 2.58, + "grad_norm": 0.2833789885044098, + "learning_rate": 4.842347427890198e-05, + "loss": 0.149, + "step": 1457 + }, + { + "epoch": 2.58, + "grad_norm": 0.12224625051021576, + "learning_rate": 4.8024038453665544e-05, + "loss": 0.1403, + "step": 1458 + }, + { + "epoch": 2.58, + "grad_norm": 0.027846721932291985, + "learning_rate": 4.762617376937312e-05, + "loss": 0.1287, + "step": 1459 + }, + { + "epoch": 2.58, + "grad_norm": 0.26311802864074707, + "learning_rate": 4.722988160906638e-05, + "loss": 0.1335, + "step": 1460 + }, + { + "epoch": 2.58, + "grad_norm": 0.3691308796405792, + "learning_rate": 4.6835163350320176e-05, + "loss": 0.145, + "step": 1461 + }, + { + "epoch": 2.59, + "grad_norm": 0.028389999642968178, + "learning_rate": 4.644202036523881e-05, + "loss": 0.1248, + "step": 1462 + }, + { + "epoch": 2.59, + "grad_norm": 0.04470786452293396, + "learning_rate": 4.605045402045022e-05, + "loss": 0.1381, + "step": 1463 + }, + { + "epoch": 2.59, + "grad_norm": 0.21815143525600433, + "learning_rate": 4.566046567710169e-05, + "loss": 0.1258, + "step": 1464 + }, + { + "epoch": 2.59, + "grad_norm": 0.14745290577411652, + "learning_rate": 4.527205669085549e-05, + "loss": 0.1403, + "step": 1465 + }, + { + "epoch": 2.59, + "grad_norm": 0.2701740562915802, + "learning_rate": 4.488522841188336e-05, + "loss": 0.1228, + "step": 1466 + }, + { + "epoch": 2.6, + "grad_norm": 0.1660449504852295, + "learning_rate": 4.449998218486262e-05, + "loss": 0.1529, + "step": 1467 + }, + { + "epoch": 2.6, + "grad_norm": 0.060599759221076965, + "learning_rate": 4.411631934897092e-05, + "loss": 0.1211, + "step": 1468 + }, + { + "epoch": 2.6, + "grad_norm": 0.2801418900489807, + "learning_rate": 4.3734241237881666e-05, + "loss": 0.137, + "step": 1469 + }, + { + "epoch": 2.6, + "grad_norm": 0.06915499269962311, + "learning_rate": 4.335374917975982e-05, + "loss": 0.1358, + "step": 1470 + }, + { + "epoch": 2.6, + "grad_norm": 0.12740154564380646, + "learning_rate": 4.297484449725691e-05, + "loss": 0.1347, + "step": 1471 + }, + { + "epoch": 2.6, + "grad_norm": 0.1669289469718933, + "learning_rate": 4.259752850750609e-05, + "loss": 0.1301, + "step": 1472 + }, + { + "epoch": 2.61, + "grad_norm": 0.07437172532081604, + "learning_rate": 4.222180252211849e-05, + "loss": 0.119, + "step": 1473 + }, + { + "epoch": 2.61, + "grad_norm": 0.08567313104867935, + "learning_rate": 4.184766784717775e-05, + "loss": 0.1128, + "step": 1474 + }, + { + "epoch": 2.61, + "grad_norm": 0.11972495913505554, + "learning_rate": 4.147512578323615e-05, + "loss": 0.1355, + "step": 1475 + }, + { + "epoch": 2.61, + "grad_norm": 0.0828404352068901, + "learning_rate": 4.110417762530977e-05, + "loss": 0.1511, + "step": 1476 + }, + { + "epoch": 2.61, + "grad_norm": 0.09823042154312134, + "learning_rate": 4.073482466287359e-05, + "loss": 0.1237, + "step": 1477 + }, + { + "epoch": 2.61, + "grad_norm": 0.039382707327604294, + "learning_rate": 4.036706817985802e-05, + "loss": 0.1182, + "step": 1478 + }, + { + "epoch": 2.62, + "grad_norm": 0.09079232066869736, + "learning_rate": 4.0000909454643406e-05, + "loss": 0.1207, + "step": 1479 + }, + { + "epoch": 2.62, + "grad_norm": 0.15438657999038696, + "learning_rate": 3.9636349760056425e-05, + "loss": 0.1229, + "step": 1480 + }, + { + "epoch": 2.62, + "grad_norm": 0.09138775616884232, + "learning_rate": 3.927339036336486e-05, + "loss": 0.1291, + "step": 1481 + }, + { + "epoch": 2.62, + "grad_norm": 0.06647726148366928, + "learning_rate": 3.8912032526273846e-05, + "loss": 0.13, + "step": 1482 + }, + { + "epoch": 2.62, + "grad_norm": 0.13314370810985565, + "learning_rate": 3.855227750492118e-05, + "loss": 0.1367, + "step": 1483 + }, + { + "epoch": 2.63, + "grad_norm": 0.09128770977258682, + "learning_rate": 3.819412654987314e-05, + "loss": 0.1164, + "step": 1484 + }, + { + "epoch": 2.63, + "grad_norm": 0.03929639607667923, + "learning_rate": 3.783758090611983e-05, + "loss": 0.1463, + "step": 1485 + }, + { + "epoch": 2.63, + "grad_norm": 0.06576254218816757, + "learning_rate": 3.748264181307109e-05, + "loss": 0.1158, + "step": 1486 + }, + { + "epoch": 2.63, + "grad_norm": 0.1495221108198166, + "learning_rate": 3.712931050455204e-05, + "loss": 0.1234, + "step": 1487 + }, + { + "epoch": 2.63, + "grad_norm": 0.1673547774553299, + "learning_rate": 3.6777588208799116e-05, + "loss": 0.1176, + "step": 1488 + }, + { + "epoch": 2.63, + "grad_norm": 0.14675819873809814, + "learning_rate": 3.6427476148455484e-05, + "loss": 0.1317, + "step": 1489 + }, + { + "epoch": 2.64, + "grad_norm": 0.07739079743623734, + "learning_rate": 3.607897554056672e-05, + "loss": 0.1235, + "step": 1490 + }, + { + "epoch": 2.64, + "grad_norm": 0.048171430826187134, + "learning_rate": 3.5732087596576866e-05, + "loss": 0.1408, + "step": 1491 + }, + { + "epoch": 2.64, + "grad_norm": 0.04192957654595375, + "learning_rate": 3.538681352232403e-05, + "loss": 0.1361, + "step": 1492 + }, + { + "epoch": 2.64, + "grad_norm": 0.0708787590265274, + "learning_rate": 3.50431545180363e-05, + "loss": 0.1345, + "step": 1493 + }, + { + "epoch": 2.64, + "grad_norm": 0.047831833362579346, + "learning_rate": 3.470111177832758e-05, + "loss": 0.1179, + "step": 1494 + }, + { + "epoch": 2.64, + "grad_norm": 0.07011017948389053, + "learning_rate": 3.436068649219326e-05, + "loss": 0.1206, + "step": 1495 + }, + { + "epoch": 2.65, + "grad_norm": 0.20468172430992126, + "learning_rate": 3.402187984300614e-05, + "loss": 0.1155, + "step": 1496 + }, + { + "epoch": 2.65, + "grad_norm": 0.5008040070533752, + "learning_rate": 3.368469300851262e-05, + "loss": 0.1518, + "step": 1497 + }, + { + "epoch": 2.65, + "grad_norm": 0.12470618635416031, + "learning_rate": 3.334912716082811e-05, + "loss": 0.1476, + "step": 1498 + }, + { + "epoch": 2.65, + "grad_norm": 0.04671414569020271, + "learning_rate": 3.30151834664334e-05, + "loss": 0.1516, + "step": 1499 + }, + { + "epoch": 2.65, + "grad_norm": 0.06154852360486984, + "learning_rate": 3.2682863086170414e-05, + "loss": 0.1419, + "step": 1500 + }, + { + "epoch": 2.66, + "grad_norm": 0.13698704540729523, + "learning_rate": 3.235216717523787e-05, + "loss": 0.14, + "step": 1501 + }, + { + "epoch": 2.66, + "grad_norm": 0.25294530391693115, + "learning_rate": 3.2023096883188e-05, + "loss": 0.1392, + "step": 1502 + }, + { + "epoch": 2.66, + "grad_norm": 0.0433129258453846, + "learning_rate": 3.169565335392183e-05, + "loss": 0.1203, + "step": 1503 + }, + { + "epoch": 2.66, + "grad_norm": 0.10114753991365433, + "learning_rate": 3.136983772568569e-05, + "loss": 0.1179, + "step": 1504 + }, + { + "epoch": 2.66, + "grad_norm": 0.058237019926309586, + "learning_rate": 3.104565113106689e-05, + "loss": 0.1491, + "step": 1505 + }, + { + "epoch": 2.66, + "grad_norm": 0.07598286867141724, + "learning_rate": 3.0723094696990027e-05, + "loss": 0.1117, + "step": 1506 + }, + { + "epoch": 2.67, + "grad_norm": 0.13283759355545044, + "learning_rate": 3.040216954471309e-05, + "loss": 0.1409, + "step": 1507 + }, + { + "epoch": 2.67, + "grad_norm": 0.1821421980857849, + "learning_rate": 3.0082876789823245e-05, + "loss": 0.1257, + "step": 1508 + }, + { + "epoch": 2.67, + "grad_norm": 0.04798609018325806, + "learning_rate": 2.9765217542233438e-05, + "loss": 0.1253, + "step": 1509 + }, + { + "epoch": 2.67, + "grad_norm": 0.3384339213371277, + "learning_rate": 2.9449192906178203e-05, + "loss": 0.1432, + "step": 1510 + }, + { + "epoch": 2.67, + "grad_norm": 0.08849462866783142, + "learning_rate": 2.9134803980209734e-05, + "loss": 0.1336, + "step": 1511 + }, + { + "epoch": 2.67, + "grad_norm": 0.11560472846031189, + "learning_rate": 2.88220518571945e-05, + "loss": 0.11, + "step": 1512 + }, + { + "epoch": 2.68, + "grad_norm": 0.2896404266357422, + "learning_rate": 2.8510937624308954e-05, + "loss": 0.163, + "step": 1513 + }, + { + "epoch": 2.68, + "grad_norm": 0.15655553340911865, + "learning_rate": 2.8201462363036112e-05, + "loss": 0.1396, + "step": 1514 + }, + { + "epoch": 2.68, + "grad_norm": 0.05299900099635124, + "learning_rate": 2.7893627149161717e-05, + "loss": 0.115, + "step": 1515 + }, + { + "epoch": 2.68, + "grad_norm": 0.08437127619981766, + "learning_rate": 2.7587433052770115e-05, + "loss": 0.1149, + "step": 1516 + }, + { + "epoch": 2.68, + "grad_norm": 0.18013562262058258, + "learning_rate": 2.72828811382414e-05, + "loss": 0.1164, + "step": 1517 + }, + { + "epoch": 2.69, + "grad_norm": 0.09165755659341812, + "learning_rate": 2.6979972464246604e-05, + "loss": 0.1474, + "step": 1518 + }, + { + "epoch": 2.69, + "grad_norm": 0.16448521614074707, + "learning_rate": 2.667870808374506e-05, + "loss": 0.1372, + "step": 1519 + }, + { + "epoch": 2.69, + "grad_norm": 0.06156858801841736, + "learning_rate": 2.6379089043980064e-05, + "loss": 0.1421, + "step": 1520 + }, + { + "epoch": 2.69, + "grad_norm": 0.13896550238132477, + "learning_rate": 2.6081116386475313e-05, + "loss": 0.1283, + "step": 1521 + }, + { + "epoch": 2.69, + "grad_norm": 0.12410745024681091, + "learning_rate": 2.5784791147031638e-05, + "loss": 0.1273, + "step": 1522 + }, + { + "epoch": 2.69, + "grad_norm": 0.0802113488316536, + "learning_rate": 2.5490114355723294e-05, + "loss": 0.1134, + "step": 1523 + }, + { + "epoch": 2.7, + "grad_norm": 0.22037829458713531, + "learning_rate": 2.5197087036893774e-05, + "loss": 0.1457, + "step": 1524 + }, + { + "epoch": 2.7, + "grad_norm": 0.09046395123004913, + "learning_rate": 2.490571020915322e-05, + "loss": 0.1279, + "step": 1525 + }, + { + "epoch": 2.7, + "grad_norm": 0.08439705520868301, + "learning_rate": 2.4615984885374143e-05, + "loss": 0.0983, + "step": 1526 + }, + { + "epoch": 2.7, + "grad_norm": 0.311358243227005, + "learning_rate": 2.4327912072688306e-05, + "loss": 0.1276, + "step": 1527 + }, + { + "epoch": 2.7, + "grad_norm": 0.18210773169994354, + "learning_rate": 2.404149277248313e-05, + "loss": 0.1321, + "step": 1528 + }, + { + "epoch": 2.7, + "grad_norm": 0.17474836111068726, + "learning_rate": 2.3756727980397742e-05, + "loss": 0.1602, + "step": 1529 + }, + { + "epoch": 2.71, + "grad_norm": 0.040003515779972076, + "learning_rate": 2.3473618686320474e-05, + "loss": 0.1371, + "step": 1530 + }, + { + "epoch": 2.71, + "grad_norm": 0.25132429599761963, + "learning_rate": 2.319216587438455e-05, + "loss": 0.1293, + "step": 1531 + }, + { + "epoch": 2.71, + "grad_norm": 0.07414573431015015, + "learning_rate": 2.291237052296513e-05, + "loss": 0.1316, + "step": 1532 + }, + { + "epoch": 2.71, + "grad_norm": 0.050560012459754944, + "learning_rate": 2.2634233604675812e-05, + "loss": 0.1191, + "step": 1533 + }, + { + "epoch": 2.71, + "grad_norm": 0.048617489635944366, + "learning_rate": 2.2357756086364924e-05, + "loss": 0.1106, + "step": 1534 + }, + { + "epoch": 2.72, + "grad_norm": 0.09126359969377518, + "learning_rate": 2.2082938929112838e-05, + "loss": 0.1422, + "step": 1535 + }, + { + "epoch": 2.72, + "grad_norm": 0.09898441284894943, + "learning_rate": 2.180978308822812e-05, + "loss": 0.1145, + "step": 1536 + }, + { + "epoch": 2.72, + "grad_norm": 0.2153613418340683, + "learning_rate": 2.1538289513244212e-05, + "loss": 0.1339, + "step": 1537 + }, + { + "epoch": 2.72, + "grad_norm": 0.07101116329431534, + "learning_rate": 2.126845914791631e-05, + "loss": 0.1397, + "step": 1538 + }, + { + "epoch": 2.72, + "grad_norm": 0.23921048641204834, + "learning_rate": 2.100029293021799e-05, + "loss": 0.113, + "step": 1539 + }, + { + "epoch": 2.72, + "grad_norm": 0.16103985905647278, + "learning_rate": 2.0733791792338197e-05, + "loss": 0.1372, + "step": 1540 + }, + { + "epoch": 2.73, + "grad_norm": 0.05808824300765991, + "learning_rate": 2.046895666067755e-05, + "loss": 0.1549, + "step": 1541 + }, + { + "epoch": 2.73, + "grad_norm": 0.044097110629081726, + "learning_rate": 2.0205788455845474e-05, + "loss": 0.1337, + "step": 1542 + }, + { + "epoch": 2.73, + "grad_norm": 0.16746556758880615, + "learning_rate": 1.9944288092656858e-05, + "loss": 0.1096, + "step": 1543 + }, + { + "epoch": 2.73, + "grad_norm": 0.3115890920162201, + "learning_rate": 1.9684456480128843e-05, + "loss": 0.1437, + "step": 1544 + }, + { + "epoch": 2.73, + "grad_norm": 0.12141338735818863, + "learning_rate": 1.942629452147787e-05, + "loss": 0.1322, + "step": 1545 + }, + { + "epoch": 2.74, + "grad_norm": 0.13122229278087616, + "learning_rate": 1.9169803114116313e-05, + "loss": 0.1057, + "step": 1546 + }, + { + "epoch": 2.74, + "grad_norm": 0.12674184143543243, + "learning_rate": 1.8914983149649513e-05, + "loss": 0.1004, + "step": 1547 + }, + { + "epoch": 2.74, + "grad_norm": 0.18313969671726227, + "learning_rate": 1.866183551387235e-05, + "loss": 0.1122, + "step": 1548 + }, + { + "epoch": 2.74, + "grad_norm": 0.12356254458427429, + "learning_rate": 1.84103610867668e-05, + "loss": 0.1362, + "step": 1549 + }, + { + "epoch": 2.74, + "grad_norm": 0.09115366637706757, + "learning_rate": 1.8160560742498223e-05, + "loss": 0.131, + "step": 1550 + }, + { + "epoch": 2.74, + "grad_norm": 0.03777840733528137, + "learning_rate": 1.7912435349412726e-05, + "loss": 0.1339, + "step": 1551 + }, + { + "epoch": 2.75, + "grad_norm": 0.026918258517980576, + "learning_rate": 1.7665985770033975e-05, + "loss": 0.1414, + "step": 1552 + }, + { + "epoch": 2.75, + "grad_norm": 0.06117779389023781, + "learning_rate": 1.7421212861060133e-05, + "loss": 0.1121, + "step": 1553 + }, + { + "epoch": 2.75, + "grad_norm": 0.05697258189320564, + "learning_rate": 1.7178117473361287e-05, + "loss": 0.1523, + "step": 1554 + }, + { + "epoch": 2.75, + "grad_norm": 0.14216284453868866, + "learning_rate": 1.693670045197582e-05, + "loss": 0.1354, + "step": 1555 + }, + { + "epoch": 2.75, + "grad_norm": 0.1293654590845108, + "learning_rate": 1.669696263610815e-05, + "loss": 0.1466, + "step": 1556 + }, + { + "epoch": 2.75, + "grad_norm": 0.09816037863492966, + "learning_rate": 1.6458904859125322e-05, + "loss": 0.1344, + "step": 1557 + }, + { + "epoch": 2.76, + "grad_norm": 0.1260695606470108, + "learning_rate": 1.622252794855433e-05, + "loss": 0.1464, + "step": 1558 + }, + { + "epoch": 2.76, + "grad_norm": 0.04835040122270584, + "learning_rate": 1.5987832726079343e-05, + "loss": 0.1292, + "step": 1559 + }, + { + "epoch": 2.76, + "grad_norm": 0.1723223179578781, + "learning_rate": 1.5754820007538473e-05, + "loss": 0.1474, + "step": 1560 + }, + { + "epoch": 2.76, + "grad_norm": 0.04331492260098457, + "learning_rate": 1.5523490602921353e-05, + "loss": 0.1513, + "step": 1561 + }, + { + "epoch": 2.76, + "grad_norm": 0.18255582451820374, + "learning_rate": 1.5293845316366185e-05, + "loss": 0.1502, + "step": 1562 + }, + { + "epoch": 2.76, + "eval_loss": 0.13326887786388397, + "eval_runtime": 14.1351, + "eval_samples_per_second": 33.746, + "eval_steps_per_second": 8.489, + "step": 1562 + }, + { + "epoch": 2.77, + "grad_norm": 0.3445633351802826, + "learning_rate": 1.5065884946156682e-05, + "loss": 0.1229, + "step": 1563 + }, + { + "epoch": 2.77, + "grad_norm": 0.0768701359629631, + "learning_rate": 1.483961028471975e-05, + "loss": 0.1318, + "step": 1564 + }, + { + "epoch": 2.77, + "grad_norm": 0.09043741226196289, + "learning_rate": 1.4615022118622368e-05, + "loss": 0.1038, + "step": 1565 + }, + { + "epoch": 2.77, + "grad_norm": 0.12818877398967743, + "learning_rate": 1.4392121228569088e-05, + "loss": 0.121, + "step": 1566 + }, + { + "epoch": 2.77, + "grad_norm": 0.08737614750862122, + "learning_rate": 1.4170908389399107e-05, + "loss": 0.1169, + "step": 1567 + }, + { + "epoch": 2.77, + "grad_norm": 0.041610416024923325, + "learning_rate": 1.3951384370083697e-05, + "loss": 0.139, + "step": 1568 + }, + { + "epoch": 2.78, + "grad_norm": 0.13382531702518463, + "learning_rate": 1.3733549933723666e-05, + "loss": 0.1242, + "step": 1569 + }, + { + "epoch": 2.78, + "grad_norm": 0.14836427569389343, + "learning_rate": 1.3517405837546404e-05, + "loss": 0.1292, + "step": 1570 + }, + { + "epoch": 2.78, + "grad_norm": 0.16950459778308868, + "learning_rate": 1.3302952832903392e-05, + "loss": 0.1338, + "step": 1571 + }, + { + "epoch": 2.78, + "grad_norm": 0.09254854172468185, + "learning_rate": 1.3090191665267814e-05, + "loss": 0.1016, + "step": 1572 + }, + { + "epoch": 2.78, + "grad_norm": 0.04616238549351692, + "learning_rate": 1.2879123074231502e-05, + "loss": 0.1467, + "step": 1573 + }, + { + "epoch": 2.78, + "grad_norm": 0.04659713804721832, + "learning_rate": 1.2669747793502828e-05, + "loss": 0.1246, + "step": 1574 + }, + { + "epoch": 2.79, + "grad_norm": 0.0829511284828186, + "learning_rate": 1.2462066550903816e-05, + "loss": 0.153, + "step": 1575 + }, + { + "epoch": 2.79, + "grad_norm": 0.12231741100549698, + "learning_rate": 1.225608006836776e-05, + "loss": 0.1125, + "step": 1576 + }, + { + "epoch": 2.79, + "grad_norm": 0.0418907031416893, + "learning_rate": 1.2051789061936713e-05, + "loss": 0.1282, + "step": 1577 + }, + { + "epoch": 2.79, + "grad_norm": 0.03682105615735054, + "learning_rate": 1.1849194241759009e-05, + "loss": 0.1337, + "step": 1578 + }, + { + "epoch": 2.79, + "grad_norm": 0.21309414505958557, + "learning_rate": 1.1648296312086747e-05, + "loss": 0.11, + "step": 1579 + }, + { + "epoch": 2.8, + "grad_norm": 0.06915529817342758, + "learning_rate": 1.1449095971273304e-05, + "loss": 0.1595, + "step": 1580 + }, + { + "epoch": 2.8, + "grad_norm": 0.07684849202632904, + "learning_rate": 1.1251593911771052e-05, + "loss": 0.1307, + "step": 1581 + }, + { + "epoch": 2.8, + "grad_norm": 0.07943115383386612, + "learning_rate": 1.1055790820128919e-05, + "loss": 0.1172, + "step": 1582 + }, + { + "epoch": 2.8, + "grad_norm": 0.21117864549160004, + "learning_rate": 1.0861687376989671e-05, + "loss": 0.1199, + "step": 1583 + }, + { + "epoch": 2.8, + "grad_norm": 0.18909168243408203, + "learning_rate": 1.0669284257088185e-05, + "loss": 0.1482, + "step": 1584 + }, + { + "epoch": 2.8, + "grad_norm": 0.027331866323947906, + "learning_rate": 1.0478582129248516e-05, + "loss": 0.1238, + "step": 1585 + }, + { + "epoch": 2.81, + "grad_norm": 0.12267359346151352, + "learning_rate": 1.0289581656381774e-05, + "loss": 0.1284, + "step": 1586 + }, + { + "epoch": 2.81, + "grad_norm": 0.07290299981832504, + "learning_rate": 1.0102283495483977e-05, + "loss": 0.1385, + "step": 1587 + }, + { + "epoch": 2.81, + "grad_norm": 0.10076455026865005, + "learning_rate": 9.916688297633647e-06, + "loss": 0.1455, + "step": 1588 + }, + { + "epoch": 2.81, + "grad_norm": 0.2671211361885071, + "learning_rate": 9.732796707989377e-06, + "loss": 0.16, + "step": 1589 + }, + { + "epoch": 2.81, + "grad_norm": 0.11416134238243103, + "learning_rate": 9.550609365787888e-06, + "loss": 0.1575, + "step": 1590 + }, + { + "epoch": 2.81, + "grad_norm": 0.08602559566497803, + "learning_rate": 9.37012690434147e-06, + "loss": 0.1292, + "step": 1591 + }, + { + "epoch": 2.82, + "grad_norm": 0.046941716223955154, + "learning_rate": 9.191349951036266e-06, + "loss": 0.1017, + "step": 1592 + }, + { + "epoch": 2.82, + "grad_norm": 0.061204541474580765, + "learning_rate": 9.014279127329605e-06, + "loss": 0.1385, + "step": 1593 + }, + { + "epoch": 2.82, + "grad_norm": 0.07410020381212234, + "learning_rate": 8.838915048748064e-06, + "loss": 0.1474, + "step": 1594 + }, + { + "epoch": 2.82, + "grad_norm": 0.35706961154937744, + "learning_rate": 8.66525832488535e-06, + "loss": 0.1596, + "step": 1595 + }, + { + "epoch": 2.82, + "grad_norm": 0.06470204889774323, + "learning_rate": 8.493309559399976e-06, + "loss": 0.1628, + "step": 1596 + }, + { + "epoch": 2.83, + "grad_norm": 0.18308192491531372, + "learning_rate": 8.323069350013479e-06, + "loss": 0.1176, + "step": 1597 + }, + { + "epoch": 2.83, + "grad_norm": 0.06461644172668457, + "learning_rate": 8.154538288508206e-06, + "loss": 0.126, + "step": 1598 + }, + { + "epoch": 2.83, + "grad_norm": 0.2490461766719818, + "learning_rate": 7.987716960725144e-06, + "loss": 0.1468, + "step": 1599 + }, + { + "epoch": 2.83, + "grad_norm": 0.11734851449728012, + "learning_rate": 7.822605946561923e-06, + "loss": 0.145, + "step": 1600 + }, + { + "epoch": 2.83, + "grad_norm": 0.039642583578825, + "learning_rate": 7.659205819970927e-06, + "loss": 0.1268, + "step": 1601 + }, + { + "epoch": 2.83, + "grad_norm": 0.04151405021548271, + "learning_rate": 7.497517148957244e-06, + "loss": 0.1301, + "step": 1602 + }, + { + "epoch": 2.84, + "grad_norm": 0.1388237625360489, + "learning_rate": 7.3375404955766665e-06, + "loss": 0.1382, + "step": 1603 + }, + { + "epoch": 2.84, + "grad_norm": 0.26035836338996887, + "learning_rate": 7.179276415933633e-06, + "loss": 0.1538, + "step": 1604 + }, + { + "epoch": 2.84, + "grad_norm": 0.22738611698150635, + "learning_rate": 7.022725460179458e-06, + "loss": 0.1083, + "step": 1605 + }, + { + "epoch": 2.84, + "grad_norm": 0.2859216630458832, + "learning_rate": 6.867888172510439e-06, + "loss": 0.1362, + "step": 1606 + }, + { + "epoch": 2.84, + "grad_norm": 0.22834646701812744, + "learning_rate": 6.7147650911658086e-06, + "loss": 0.1289, + "step": 1607 + }, + { + "epoch": 2.84, + "grad_norm": 0.05140649899840355, + "learning_rate": 6.5633567484259525e-06, + "loss": 0.1191, + "step": 1608 + }, + { + "epoch": 2.85, + "grad_norm": 0.06531380861997604, + "learning_rate": 6.413663670610526e-06, + "loss": 0.1122, + "step": 1609 + }, + { + "epoch": 2.85, + "grad_norm": 0.031297944486141205, + "learning_rate": 6.26568637807673e-06, + "loss": 0.1274, + "step": 1610 + }, + { + "epoch": 2.85, + "grad_norm": 0.23136821389198303, + "learning_rate": 6.119425385217314e-06, + "loss": 0.1178, + "step": 1611 + }, + { + "epoch": 2.85, + "grad_norm": 0.11101561784744263, + "learning_rate": 5.9748812004590255e-06, + "loss": 0.1435, + "step": 1612 + }, + { + "epoch": 2.85, + "grad_norm": 0.12405877560377121, + "learning_rate": 5.832054326260605e-06, + "loss": 0.1342, + "step": 1613 + }, + { + "epoch": 2.86, + "grad_norm": 0.02779628150165081, + "learning_rate": 5.69094525911118e-06, + "loss": 0.1283, + "step": 1614 + }, + { + "epoch": 2.86, + "grad_norm": 0.1838769018650055, + "learning_rate": 5.551554489528432e-06, + "loss": 0.1569, + "step": 1615 + }, + { + "epoch": 2.86, + "grad_norm": 0.06827875226736069, + "learning_rate": 5.413882502057155e-06, + "loss": 0.1119, + "step": 1616 + }, + { + "epoch": 2.86, + "grad_norm": 0.07234180718660355, + "learning_rate": 5.277929775267143e-06, + "loss": 0.1302, + "step": 1617 + }, + { + "epoch": 2.86, + "grad_norm": 0.23422658443450928, + "learning_rate": 5.143696781751972e-06, + "loss": 0.1459, + "step": 1618 + }, + { + "epoch": 2.86, + "grad_norm": 0.10591956228017807, + "learning_rate": 5.011183988127055e-06, + "loss": 0.1529, + "step": 1619 + }, + { + "epoch": 2.87, + "grad_norm": 0.04643354192376137, + "learning_rate": 4.880391855028088e-06, + "loss": 0.1496, + "step": 1620 + }, + { + "epoch": 2.87, + "grad_norm": 0.2558877766132355, + "learning_rate": 4.751320837109552e-06, + "loss": 0.1484, + "step": 1621 + }, + { + "epoch": 2.87, + "grad_norm": 0.14346656203269958, + "learning_rate": 4.6239713830429354e-06, + "loss": 0.1295, + "step": 1622 + }, + { + "epoch": 2.87, + "grad_norm": 0.06926041841506958, + "learning_rate": 4.498343935515348e-06, + "loss": 0.1275, + "step": 1623 + }, + { + "epoch": 2.87, + "grad_norm": 0.10302523523569107, + "learning_rate": 4.374438931228075e-06, + "loss": 0.1432, + "step": 1624 + }, + { + "epoch": 2.87, + "grad_norm": 0.12526661157608032, + "learning_rate": 4.252256800894694e-06, + "loss": 0.1271, + "step": 1625 + }, + { + "epoch": 2.88, + "grad_norm": 0.08141893148422241, + "learning_rate": 4.131797969239903e-06, + "loss": 0.1352, + "step": 1626 + }, + { + "epoch": 2.88, + "grad_norm": 0.03432145342230797, + "learning_rate": 4.013062854998028e-06, + "loss": 0.1307, + "step": 1627 + }, + { + "epoch": 2.88, + "grad_norm": 0.04721994698047638, + "learning_rate": 3.896051870911188e-06, + "loss": 0.1333, + "step": 1628 + }, + { + "epoch": 2.88, + "grad_norm": 0.07389276474714279, + "learning_rate": 3.7807654237284605e-06, + "loss": 0.1245, + "step": 1629 + }, + { + "epoch": 2.88, + "grad_norm": 0.027808329090476036, + "learning_rate": 3.6672039142039425e-06, + "loss": 0.1354, + "step": 1630 + }, + { + "epoch": 2.89, + "grad_norm": 0.18238960206508636, + "learning_rate": 3.5553677370957495e-06, + "loss": 0.1252, + "step": 1631 + }, + { + "epoch": 2.89, + "grad_norm": 0.042762529104948044, + "learning_rate": 3.445257281164349e-06, + "loss": 0.1445, + "step": 1632 + }, + { + "epoch": 2.89, + "grad_norm": 0.16484946012496948, + "learning_rate": 3.3368729291712863e-06, + "loss": 0.1459, + "step": 1633 + }, + { + "epoch": 2.89, + "grad_norm": 0.04074009507894516, + "learning_rate": 3.2302150578780165e-06, + "loss": 0.1229, + "step": 1634 + }, + { + "epoch": 2.89, + "grad_norm": 0.03902529180049896, + "learning_rate": 3.125284038044407e-06, + "loss": 0.1303, + "step": 1635 + }, + { + "epoch": 2.89, + "grad_norm": 0.0906100794672966, + "learning_rate": 3.0220802344275157e-06, + "loss": 0.1367, + "step": 1636 + }, + { + "epoch": 2.9, + "grad_norm": 0.07743262499570847, + "learning_rate": 2.9206040057802584e-06, + "loss": 0.1423, + "step": 1637 + }, + { + "epoch": 2.9, + "grad_norm": 0.0563175305724144, + "learning_rate": 2.8208557048503556e-06, + "loss": 0.122, + "step": 1638 + }, + { + "epoch": 2.9, + "grad_norm": 0.08874353021383286, + "learning_rate": 2.7228356783788876e-06, + "loss": 0.1037, + "step": 1639 + }, + { + "epoch": 2.9, + "grad_norm": 0.05931072309613228, + "learning_rate": 2.626544267099129e-06, + "loss": 0.1155, + "step": 1640 + }, + { + "epoch": 2.9, + "grad_norm": 0.03920688480138779, + "learning_rate": 2.531981805735606e-06, + "loss": 0.1462, + "step": 1641 + }, + { + "epoch": 2.9, + "grad_norm": 0.09828366339206696, + "learning_rate": 2.4391486230024297e-06, + "loss": 0.1603, + "step": 1642 + }, + { + "epoch": 2.91, + "grad_norm": 0.16284847259521484, + "learning_rate": 2.3480450416027423e-06, + "loss": 0.1222, + "step": 1643 + }, + { + "epoch": 2.91, + "grad_norm": 0.0588088184595108, + "learning_rate": 2.2586713782272172e-06, + "loss": 0.1423, + "step": 1644 + }, + { + "epoch": 2.91, + "grad_norm": 0.21331720054149628, + "learning_rate": 2.1710279435530057e-06, + "loss": 0.1567, + "step": 1645 + }, + { + "epoch": 2.91, + "grad_norm": 0.1025921106338501, + "learning_rate": 2.0851150422427913e-06, + "loss": 0.1297, + "step": 1646 + }, + { + "epoch": 2.91, + "grad_norm": 0.12087155133485794, + "learning_rate": 2.0009329729435146e-06, + "loss": 0.1207, + "step": 1647 + }, + { + "epoch": 2.92, + "grad_norm": 0.12067259848117828, + "learning_rate": 1.9184820282855953e-06, + "loss": 0.1335, + "step": 1648 + }, + { + "epoch": 2.92, + "grad_norm": 0.12102551758289337, + "learning_rate": 1.8377624948817673e-06, + "loss": 0.1483, + "step": 1649 + }, + { + "epoch": 2.92, + "grad_norm": 0.07149103283882141, + "learning_rate": 1.7587746533260785e-06, + "loss": 0.1423, + "step": 1650 + }, + { + "epoch": 2.92, + "grad_norm": 0.19505378603935242, + "learning_rate": 1.6815187781928921e-06, + "loss": 0.1045, + "step": 1651 + }, + { + "epoch": 2.92, + "grad_norm": 0.14010155200958252, + "learning_rate": 1.6059951380359984e-06, + "loss": 0.1282, + "step": 1652 + }, + { + "epoch": 2.92, + "grad_norm": 0.061695683747529984, + "learning_rate": 1.5322039953878374e-06, + "loss": 0.1342, + "step": 1653 + }, + { + "epoch": 2.93, + "grad_norm": 0.24404413998126984, + "learning_rate": 1.4601456067580564e-06, + "loss": 0.1379, + "step": 1654 + }, + { + "epoch": 2.93, + "grad_norm": 0.07468795031309128, + "learning_rate": 1.3898202226333423e-06, + "loss": 0.1219, + "step": 1655 + }, + { + "epoch": 2.93, + "grad_norm": 0.061813950538635254, + "learning_rate": 1.3212280874759231e-06, + "loss": 0.1151, + "step": 1656 + }, + { + "epoch": 2.93, + "grad_norm": 0.1875925213098526, + "learning_rate": 1.2543694397230686e-06, + "loss": 0.1474, + "step": 1657 + }, + { + "epoch": 2.93, + "grad_norm": 0.06623123586177826, + "learning_rate": 1.1892445117862028e-06, + "loss": 0.0881, + "step": 1658 + }, + { + "epoch": 2.93, + "grad_norm": 0.057737018913030624, + "learning_rate": 1.1258535300499583e-06, + "loss": 0.1452, + "step": 1659 + }, + { + "epoch": 2.94, + "grad_norm": 0.03276927024126053, + "learning_rate": 1.0641967148716236e-06, + "loss": 0.1435, + "step": 1660 + }, + { + "epoch": 2.94, + "grad_norm": 0.07240304350852966, + "learning_rate": 1.004274280580142e-06, + "loss": 0.1266, + "step": 1661 + }, + { + "epoch": 2.94, + "grad_norm": 0.10652832686901093, + "learning_rate": 9.460864354755017e-07, + "loss": 0.1628, + "step": 1662 + }, + { + "epoch": 2.94, + "grad_norm": 0.37316522002220154, + "learning_rate": 8.896333818280145e-07, + "loss": 0.1186, + "step": 1663 + }, + { + "epoch": 2.94, + "grad_norm": 0.20198196172714233, + "learning_rate": 8.349153158774825e-07, + "loss": 0.1195, + "step": 1664 + }, + { + "epoch": 2.95, + "grad_norm": 0.18203914165496826, + "learning_rate": 7.8193242783281e-07, + "loss": 0.1618, + "step": 1665 + }, + { + "epoch": 2.95, + "grad_norm": 0.1698591411113739, + "learning_rate": 7.306849018708927e-07, + "loss": 0.1423, + "step": 1666 + }, + { + "epoch": 2.95, + "grad_norm": 0.13877595961093903, + "learning_rate": 6.811729161363966e-07, + "loss": 0.1068, + "step": 1667 + }, + { + "epoch": 2.95, + "grad_norm": 0.2812526524066925, + "learning_rate": 6.333966427409243e-07, + "loss": 0.1271, + "step": 1668 + }, + { + "epoch": 2.95, + "grad_norm": 0.2276443988084793, + "learning_rate": 5.873562477624605e-07, + "loss": 0.1471, + "step": 1669 + }, + { + "epoch": 2.95, + "grad_norm": 0.09518153220415115, + "learning_rate": 5.430518912448168e-07, + "loss": 0.1251, + "step": 1670 + }, + { + "epoch": 2.96, + "grad_norm": 0.0484699048101902, + "learning_rate": 5.004837271970764e-07, + "loss": 0.115, + "step": 1671 + }, + { + "epoch": 2.96, + "grad_norm": 0.09272460639476776, + "learning_rate": 4.596519035929281e-07, + "loss": 0.1178, + "step": 1672 + }, + { + "epoch": 2.96, + "grad_norm": 0.1327453851699829, + "learning_rate": 4.2055656237038884e-07, + "loss": 0.1252, + "step": 1673 + }, + { + "epoch": 2.96, + "grad_norm": 0.10774451494216919, + "learning_rate": 3.83197839431082e-07, + "loss": 0.1243, + "step": 1674 + }, + { + "epoch": 2.96, + "grad_norm": 0.05414144694805145, + "learning_rate": 3.475758646400151e-07, + "loss": 0.1254, + "step": 1675 + }, + { + "epoch": 2.97, + "grad_norm": 0.05690954253077507, + "learning_rate": 3.1369076182480305e-07, + "loss": 0.1227, + "step": 1676 + }, + { + "epoch": 2.97, + "grad_norm": 0.02989388071000576, + "learning_rate": 2.815426487755568e-07, + "loss": 0.1144, + "step": 1677 + }, + { + "epoch": 2.97, + "grad_norm": 0.07636480778455734, + "learning_rate": 2.5113163724427293e-07, + "loss": 0.1264, + "step": 1678 + }, + { + "epoch": 2.97, + "grad_norm": 0.14926138520240784, + "learning_rate": 2.2245783294444488e-07, + "loss": 0.1386, + "step": 1679 + }, + { + "epoch": 2.97, + "grad_norm": 0.20612908899784088, + "learning_rate": 1.9552133555084116e-07, + "loss": 0.1107, + "step": 1680 + }, + { + "epoch": 2.97, + "grad_norm": 0.16899043321609497, + "learning_rate": 1.7032223869911656e-07, + "loss": 0.1392, + "step": 1681 + }, + { + "epoch": 2.98, + "grad_norm": 0.03907778859138489, + "learning_rate": 1.4686062998525706e-07, + "loss": 0.1191, + "step": 1682 + }, + { + "epoch": 2.98, + "grad_norm": 0.05882280319929123, + "learning_rate": 1.2513659096569097e-07, + "loss": 0.1223, + "step": 1683 + }, + { + "epoch": 2.98, + "grad_norm": 0.09647829085588455, + "learning_rate": 1.0515019715656716e-07, + "loss": 0.1278, + "step": 1684 + }, + { + "epoch": 2.98, + "grad_norm": 0.15963368117809296, + "learning_rate": 8.690151803386614e-08, + "loss": 0.1391, + "step": 1685 + }, + { + "epoch": 2.98, + "grad_norm": 0.13670912384986877, + "learning_rate": 7.0390617032845e-08, + "loss": 0.1424, + "step": 1686 + }, + { + "epoch": 2.98, + "grad_norm": 0.1605810821056366, + "learning_rate": 5.5617551548148294e-08, + "loss": 0.1438, + "step": 1687 + }, + { + "epoch": 2.99, + "grad_norm": 0.15910910069942474, + "learning_rate": 4.258237293325307e-08, + "loss": 0.1518, + "step": 1688 + }, + { + "epoch": 2.99, + "grad_norm": 0.09237740933895111, + "learning_rate": 3.1285126500579795e-08, + "loss": 0.1212, + "step": 1689 + }, + { + "epoch": 2.99, + "grad_norm": 0.05058419331908226, + "learning_rate": 2.1725851521103845e-08, + "loss": 0.1282, + "step": 1690 + }, + { + "epoch": 2.99, + "grad_norm": 0.03350934758782387, + "learning_rate": 1.3904581224410962e-08, + "loss": 0.1285, + "step": 1691 + }, + { + "epoch": 2.99, + "grad_norm": 0.11487533152103424, + "learning_rate": 7.821342798530751e-09, + "loss": 0.1352, + "step": 1692 + }, + { + "epoch": 3.0, + "grad_norm": 0.03711562231183052, + "learning_rate": 3.4761573897701404e-09, + "loss": 0.1145, + "step": 1693 + }, + { + "epoch": 3.0, + "grad_norm": 0.0804651752114296, + "learning_rate": 8.690401026578698e-10, + "loss": 0.1144, + "step": 1694 + }, + { + "epoch": 3.0, + "grad_norm": 0.13091543316841125, + "learning_rate": 0.0, + "loss": 0.127, + "step": 1695 + } + ], + "logging_steps": 1, + "max_steps": 1695, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 565, + "total_flos": 1.549439947809751e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}