diff --git "a/checkpoint-1130/trainer_state.json" "b/checkpoint-1130/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1130/trainer_state.json" @@ -0,0 +1,7995 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9991154356479433, + "eval_steps": 142, + "global_step": 1130, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.705155372619629, + "learning_rate": 0.0001, + "loss": 3.3182, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 3.3362529277801514, + "eval_runtime": 14.4366, + "eval_samples_per_second": 33.041, + "eval_steps_per_second": 8.312, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.644563674926758, + "learning_rate": 0.0002, + "loss": 3.2788, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 4.3825764656066895, + "learning_rate": 0.0003, + "loss": 2.9231, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 2.904296636581421, + "learning_rate": 0.0004, + "loss": 1.1824, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 1.6988284587860107, + "learning_rate": 0.0005, + "loss": 0.3472, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 0.5742101073265076, + "learning_rate": 0.0006, + "loss": 0.1478, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 0.6511944532394409, + "learning_rate": 0.0007, + "loss": 0.1532, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 2.305083751678467, + "learning_rate": 0.0008, + "loss": 0.2397, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 4.7435078620910645, + "learning_rate": 0.0009000000000000001, + "loss": 0.434, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 0.6199779510498047, + "learning_rate": 0.001, + "loss": 0.1743, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 0.14406554400920868, + "learning_rate": 0.0009999991309598973, + "loss": 0.1404, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.1965201050043106, + "learning_rate": 0.0009999965238426103, + "loss": 0.1418, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 26.520109176635742, + "learning_rate": 0.0009999921786572016, + "loss": 0.2689, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 0.0870603695511818, + "learning_rate": 0.0009999860954187755, + "loss": 0.1338, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 0.05200817808508873, + "learning_rate": 0.0009999782741484788, + "loss": 0.1308, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 0.2145700752735138, + "learning_rate": 0.0009999687148734995, + "loss": 0.1375, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 0.19921083748340607, + "learning_rate": 0.0009999574176270667, + "loss": 0.1388, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 1.193419337272644, + "learning_rate": 0.0009999443824484518, + "loss": 0.1978, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.4399484395980835, + "learning_rate": 0.0009999296093829671, + "loss": 0.1518, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 44.88853073120117, + "learning_rate": 0.0009999130984819661, + "loss": 0.9033, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 0.3220385015010834, + "learning_rate": 0.0009998948498028434, + "loss": 0.1234, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 0.5420748591423035, + "learning_rate": 0.0009998748634090344, + "loss": 0.1602, + "step": 22 + }, + { + "epoch": 0.04, + "grad_norm": 0.5249865651130676, + "learning_rate": 0.0009998531393700149, + "loss": 0.1538, + "step": 23 + }, + { + "epoch": 0.04, + "grad_norm": 0.056158341467380524, + "learning_rate": 0.000999829677761301, + "loss": 0.1374, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.19818872213363647, + "learning_rate": 0.0009998044786644492, + "loss": 0.1413, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 0.27901849150657654, + "learning_rate": 0.0009997775421670557, + "loss": 0.1395, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 0.22768354415893555, + "learning_rate": 0.0009997488683627558, + "loss": 0.1241, + "step": 27 + }, + { + "epoch": 0.05, + "grad_norm": 0.14878959953784943, + "learning_rate": 0.0009997184573512245, + "loss": 0.1243, + "step": 28 + }, + { + "epoch": 0.05, + "grad_norm": 1.0589066743850708, + "learning_rate": 0.000999686309238175, + "loss": 0.2499, + "step": 29 + }, + { + "epoch": 0.05, + "grad_norm": 0.11455405503511429, + "learning_rate": 0.00099965242413536, + "loss": 0.1254, + "step": 30 + }, + { + "epoch": 0.05, + "grad_norm": 0.16566088795661926, + "learning_rate": 0.000999616802160569, + "loss": 0.1416, + "step": 31 + }, + { + "epoch": 0.06, + "grad_norm": 1.3691716194152832, + "learning_rate": 0.0009995794434376297, + "loss": 0.1465, + "step": 32 + }, + { + "epoch": 0.06, + "grad_norm": 0.09674070030450821, + "learning_rate": 0.000999540348096407, + "loss": 0.1373, + "step": 33 + }, + { + "epoch": 0.06, + "grad_norm": 0.5034632086753845, + "learning_rate": 0.000999499516272803, + "loss": 0.1471, + "step": 34 + }, + { + "epoch": 0.06, + "grad_norm": 0.26572930812835693, + "learning_rate": 0.0009994569481087553, + "loss": 0.1424, + "step": 35 + }, + { + "epoch": 0.06, + "grad_norm": 0.20631802082061768, + "learning_rate": 0.0009994126437522376, + "loss": 0.1449, + "step": 36 + }, + { + "epoch": 0.07, + "grad_norm": 0.11268749833106995, + "learning_rate": 0.0009993666033572591, + "loss": 0.1403, + "step": 37 + }, + { + "epoch": 0.07, + "grad_norm": 0.6610996723175049, + "learning_rate": 0.0009993188270838635, + "loss": 0.1424, + "step": 38 + }, + { + "epoch": 0.07, + "grad_norm": 98.93838500976562, + "learning_rate": 0.0009992693150981291, + "loss": 2.775, + "step": 39 + }, + { + "epoch": 0.07, + "grad_norm": 32.53168869018555, + "learning_rate": 0.0009992180675721671, + "loss": 0.6932, + "step": 40 + }, + { + "epoch": 0.07, + "grad_norm": 54.8778076171875, + "learning_rate": 0.0009991650846841226, + "loss": 5.7008, + "step": 41 + }, + { + "epoch": 0.07, + "grad_norm": 2.0524775981903076, + "learning_rate": 0.000999110366618172, + "loss": 0.1623, + "step": 42 + }, + { + "epoch": 0.08, + "grad_norm": 0.404278427362442, + "learning_rate": 0.0009990539135645246, + "loss": 0.1427, + "step": 43 + }, + { + "epoch": 0.08, + "grad_norm": 1.7963409423828125, + "learning_rate": 0.0009989957257194198, + "loss": 0.174, + "step": 44 + }, + { + "epoch": 0.08, + "grad_norm": 0.11620022356510162, + "learning_rate": 0.0009989358032851284, + "loss": 0.1339, + "step": 45 + }, + { + "epoch": 0.08, + "grad_norm": 0.5025681853294373, + "learning_rate": 0.00099887414646995, + "loss": 0.1558, + "step": 46 + }, + { + "epoch": 0.08, + "grad_norm": 78.1165771484375, + "learning_rate": 0.0009988107554882138, + "loss": 2.2938, + "step": 47 + }, + { + "epoch": 0.08, + "grad_norm": 0.08389786630868912, + "learning_rate": 0.0009987456305602768, + "loss": 0.1409, + "step": 48 + }, + { + "epoch": 0.09, + "grad_norm": 7.123101711273193, + "learning_rate": 0.0009986787719125242, + "loss": 0.1524, + "step": 49 + }, + { + "epoch": 0.09, + "grad_norm": 0.5341290235519409, + "learning_rate": 0.0009986101797773666, + "loss": 0.1598, + "step": 50 + }, + { + "epoch": 0.09, + "grad_norm": 0.05239284038543701, + "learning_rate": 0.000998539854393242, + "loss": 0.1386, + "step": 51 + }, + { + "epoch": 0.09, + "grad_norm": 0.0722254291176796, + "learning_rate": 0.0009984677960046123, + "loss": 0.1385, + "step": 52 + }, + { + "epoch": 0.09, + "grad_norm": 0.11535236239433289, + "learning_rate": 0.000998394004861964, + "loss": 0.1369, + "step": 53 + }, + { + "epoch": 0.1, + "grad_norm": 0.7584894299507141, + "learning_rate": 0.0009983184812218072, + "loss": 0.108, + "step": 54 + }, + { + "epoch": 0.1, + "grad_norm": 0.8361538052558899, + "learning_rate": 0.000998241225346674, + "loss": 0.1703, + "step": 55 + }, + { + "epoch": 0.1, + "grad_norm": 0.37683162093162537, + "learning_rate": 0.0009981622375051184, + "loss": 0.1368, + "step": 56 + }, + { + "epoch": 0.1, + "grad_norm": 0.5335961580276489, + "learning_rate": 0.0009980815179717144, + "loss": 0.1559, + "step": 57 + }, + { + "epoch": 0.1, + "grad_norm": 0.2806299328804016, + "learning_rate": 0.0009979990670270565, + "loss": 0.1397, + "step": 58 + }, + { + "epoch": 0.1, + "grad_norm": 0.4967437982559204, + "learning_rate": 0.0009979148849577574, + "loss": 0.1543, + "step": 59 + }, + { + "epoch": 0.11, + "grad_norm": 0.10350017994642258, + "learning_rate": 0.0009978289720564471, + "loss": 0.1367, + "step": 60 + }, + { + "epoch": 0.11, + "grad_norm": 78.35698699951172, + "learning_rate": 0.0009977413286217727, + "loss": 2.2474, + "step": 61 + }, + { + "epoch": 0.11, + "grad_norm": 0.15264186263084412, + "learning_rate": 0.0009976519549583973, + "loss": 0.1311, + "step": 62 + }, + { + "epoch": 0.11, + "grad_norm": 0.31865784525871277, + "learning_rate": 0.0009975608513769975, + "loss": 0.1407, + "step": 63 + }, + { + "epoch": 0.11, + "grad_norm": 0.32891547679901123, + "learning_rate": 0.0009974680181942645, + "loss": 0.1423, + "step": 64 + }, + { + "epoch": 0.11, + "grad_norm": 0.15653717517852783, + "learning_rate": 0.0009973734557329008, + "loss": 0.1365, + "step": 65 + }, + { + "epoch": 0.12, + "grad_norm": 0.3237778842449188, + "learning_rate": 0.0009972771643216212, + "loss": 0.1407, + "step": 66 + }, + { + "epoch": 0.12, + "grad_norm": 0.13634416460990906, + "learning_rate": 0.0009971791442951496, + "loss": 0.1378, + "step": 67 + }, + { + "epoch": 0.12, + "grad_norm": 0.3488883376121521, + "learning_rate": 0.0009970793959942197, + "loss": 0.1429, + "step": 68 + }, + { + "epoch": 0.12, + "grad_norm": 0.5150622129440308, + "learning_rate": 0.0009969779197655725, + "loss": 0.1492, + "step": 69 + }, + { + "epoch": 0.12, + "grad_norm": 0.3482552468776703, + "learning_rate": 0.0009968747159619555, + "loss": 0.1415, + "step": 70 + }, + { + "epoch": 0.13, + "grad_norm": 0.22551549971103668, + "learning_rate": 0.000996769784942122, + "loss": 0.1418, + "step": 71 + }, + { + "epoch": 0.13, + "grad_norm": 0.20759086310863495, + "learning_rate": 0.0009966631270708287, + "loss": 0.1366, + "step": 72 + }, + { + "epoch": 0.13, + "grad_norm": 13.050313949584961, + "learning_rate": 0.0009965547427188356, + "loss": 0.1375, + "step": 73 + }, + { + "epoch": 0.13, + "grad_norm": 0.18372055888175964, + "learning_rate": 0.0009964446322629043, + "loss": 0.1285, + "step": 74 + }, + { + "epoch": 0.13, + "grad_norm": 0.4404817819595337, + "learning_rate": 0.000996332796085796, + "loss": 0.1501, + "step": 75 + }, + { + "epoch": 0.13, + "grad_norm": 1.269240379333496, + "learning_rate": 0.0009962192345762716, + "loss": 0.1346, + "step": 76 + }, + { + "epoch": 0.14, + "grad_norm": 32.20164108276367, + "learning_rate": 0.0009961039481290888, + "loss": 0.3348, + "step": 77 + }, + { + "epoch": 0.14, + "grad_norm": 82.92976379394531, + "learning_rate": 0.0009959869371450021, + "loss": 5.8309, + "step": 78 + }, + { + "epoch": 0.14, + "grad_norm": 0.3416314721107483, + "learning_rate": 0.0009958682020307602, + "loss": 0.1418, + "step": 79 + }, + { + "epoch": 0.14, + "grad_norm": 31.961870193481445, + "learning_rate": 0.0009957477431991053, + "loss": 0.1899, + "step": 80 + }, + { + "epoch": 0.14, + "grad_norm": 38.58375930786133, + "learning_rate": 0.000995625561068772, + "loss": 0.5641, + "step": 81 + }, + { + "epoch": 0.15, + "grad_norm": 0.32622194290161133, + "learning_rate": 0.0009955016560644846, + "loss": 0.1144, + "step": 82 + }, + { + "epoch": 0.15, + "grad_norm": 6.264970779418945, + "learning_rate": 0.0009953760286169572, + "loss": 0.4788, + "step": 83 + }, + { + "epoch": 0.15, + "grad_norm": 0.07168668508529663, + "learning_rate": 0.0009952486791628904, + "loss": 0.1326, + "step": 84 + }, + { + "epoch": 0.15, + "grad_norm": 35.18340301513672, + "learning_rate": 0.000995119608144972, + "loss": 0.3884, + "step": 85 + }, + { + "epoch": 0.15, + "grad_norm": 0.03896519914269447, + "learning_rate": 0.000994988816011873, + "loss": 0.1249, + "step": 86 + }, + { + "epoch": 0.15, + "grad_norm": 14.499520301818848, + "learning_rate": 0.000994856303218248, + "loss": 0.3756, + "step": 87 + }, + { + "epoch": 0.16, + "grad_norm": 0.3134947419166565, + "learning_rate": 0.000994722070224733, + "loss": 0.1539, + "step": 88 + }, + { + "epoch": 0.16, + "grad_norm": 117.39696502685547, + "learning_rate": 0.000994586117497943, + "loss": 0.5885, + "step": 89 + }, + { + "epoch": 0.16, + "grad_norm": 37.93465805053711, + "learning_rate": 0.0009944484455104716, + "loss": 0.7709, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 236.63330078125, + "learning_rate": 0.0009943090547408888, + "loss": 6.0182, + "step": 91 + }, + { + "epoch": 0.16, + "grad_norm": 1.1088515520095825, + "learning_rate": 0.0009941679456737394, + "loss": 0.1931, + "step": 92 + }, + { + "epoch": 0.16, + "grad_norm": 0.11310256272554398, + "learning_rate": 0.0009940251187995411, + "loss": 0.1293, + "step": 93 + }, + { + "epoch": 0.17, + "grad_norm": 0.6143047213554382, + "learning_rate": 0.0009938805746147828, + "loss": 0.2364, + "step": 94 + }, + { + "epoch": 0.17, + "grad_norm": 0.2461577206850052, + "learning_rate": 0.0009937343136219232, + "loss": 0.1504, + "step": 95 + }, + { + "epoch": 0.17, + "grad_norm": 97.17162322998047, + "learning_rate": 0.0009935863363293895, + "loss": 5.764, + "step": 96 + }, + { + "epoch": 0.17, + "grad_norm": 0.5417380928993225, + "learning_rate": 0.000993436643251574, + "loss": 0.1576, + "step": 97 + }, + { + "epoch": 0.17, + "grad_norm": 0.2737255096435547, + "learning_rate": 0.0009932852349088341, + "loss": 0.1437, + "step": 98 + }, + { + "epoch": 0.18, + "grad_norm": 138.00778198242188, + "learning_rate": 0.0009931321118274896, + "loss": 4.1331, + "step": 99 + }, + { + "epoch": 0.18, + "grad_norm": 46.4688606262207, + "learning_rate": 0.0009929772745398205, + "loss": 0.6178, + "step": 100 + }, + { + "epoch": 0.18, + "grad_norm": 0.49907386302948, + "learning_rate": 0.0009928207235840663, + "loss": 0.1445, + "step": 101 + }, + { + "epoch": 0.18, + "grad_norm": 0.33814460039138794, + "learning_rate": 0.0009926624595044233, + "loss": 0.139, + "step": 102 + }, + { + "epoch": 0.18, + "grad_norm": 0.3241071403026581, + "learning_rate": 0.0009925024828510427, + "loss": 0.1404, + "step": 103 + }, + { + "epoch": 0.18, + "grad_norm": 78.4036865234375, + "learning_rate": 0.000992340794180029, + "loss": 1.2663, + "step": 104 + }, + { + "epoch": 0.19, + "grad_norm": 1.113776445388794, + "learning_rate": 0.000992177394053438, + "loss": 0.162, + "step": 105 + }, + { + "epoch": 0.19, + "grad_norm": 16.512048721313477, + "learning_rate": 0.0009920122830392748, + "loss": 3.3373, + "step": 106 + }, + { + "epoch": 0.19, + "grad_norm": 111.53176879882812, + "learning_rate": 0.0009918454617114918, + "loss": 2.3969, + "step": 107 + }, + { + "epoch": 0.19, + "grad_norm": 14.91741943359375, + "learning_rate": 0.0009916769306499865, + "loss": 1.8837, + "step": 108 + }, + { + "epoch": 0.19, + "grad_norm": 61.30055618286133, + "learning_rate": 0.0009915066904406, + "loss": 10.4922, + "step": 109 + }, + { + "epoch": 0.19, + "grad_norm": 0.6948704123497009, + "learning_rate": 0.0009913347416751147, + "loss": 0.1536, + "step": 110 + }, + { + "epoch": 0.2, + "grad_norm": 0.7721084356307983, + "learning_rate": 0.000991161084951252, + "loss": 0.1356, + "step": 111 + }, + { + "epoch": 0.2, + "grad_norm": 0.24614596366882324, + "learning_rate": 0.0009909857208726704, + "loss": 0.1339, + "step": 112 + }, + { + "epoch": 0.2, + "grad_norm": 7.189969062805176, + "learning_rate": 0.0009908086500489638, + "loss": 0.2551, + "step": 113 + }, + { + "epoch": 0.2, + "grad_norm": 0.8675662279129028, + "learning_rate": 0.0009906298730956585, + "loss": 0.1668, + "step": 114 + }, + { + "epoch": 0.2, + "grad_norm": 0.605249285697937, + "learning_rate": 0.0009904493906342123, + "loss": 0.1478, + "step": 115 + }, + { + "epoch": 0.21, + "grad_norm": 0.8765722513198853, + "learning_rate": 0.0009902672032920106, + "loss": 0.1598, + "step": 116 + }, + { + "epoch": 0.21, + "grad_norm": 0.6021157503128052, + "learning_rate": 0.0009900833117023665, + "loss": 0.1506, + "step": 117 + }, + { + "epoch": 0.21, + "grad_norm": 0.28180792927742004, + "learning_rate": 0.000989897716504516, + "loss": 0.1389, + "step": 118 + }, + { + "epoch": 0.21, + "grad_norm": 0.21730898320674896, + "learning_rate": 0.0009897104183436184, + "loss": 0.1377, + "step": 119 + }, + { + "epoch": 0.21, + "grad_norm": 0.977118730545044, + "learning_rate": 0.0009895214178707516, + "loss": 0.1698, + "step": 120 + }, + { + "epoch": 0.21, + "grad_norm": 2.674729585647583, + "learning_rate": 0.0009893307157429118, + "loss": 0.1559, + "step": 121 + }, + { + "epoch": 0.22, + "grad_norm": 0.9852035045623779, + "learning_rate": 0.0009891383126230102, + "loss": 0.2027, + "step": 122 + }, + { + "epoch": 0.22, + "grad_norm": 0.36689773201942444, + "learning_rate": 0.0009889442091798712, + "loss": 0.1498, + "step": 123 + }, + { + "epoch": 0.22, + "grad_norm": 0.104621522128582, + "learning_rate": 0.000988748406088229, + "loss": 0.1379, + "step": 124 + }, + { + "epoch": 0.22, + "grad_norm": 74.17496490478516, + "learning_rate": 0.0009885509040287268, + "loss": 0.7724, + "step": 125 + }, + { + "epoch": 0.22, + "grad_norm": 1.2943025827407837, + "learning_rate": 0.0009883517036879132, + "loss": 0.2643, + "step": 126 + }, + { + "epoch": 0.22, + "grad_norm": 0.828774094581604, + "learning_rate": 0.000988150805758241, + "loss": 0.1852, + "step": 127 + }, + { + "epoch": 0.23, + "grad_norm": 0.13165877759456635, + "learning_rate": 0.0009879482109380632, + "loss": 0.1429, + "step": 128 + }, + { + "epoch": 0.23, + "grad_norm": 0.662426769733429, + "learning_rate": 0.0009877439199316323, + "loss": 0.1643, + "step": 129 + }, + { + "epoch": 0.23, + "grad_norm": 0.6256189942359924, + "learning_rate": 0.0009875379334490962, + "loss": 0.157, + "step": 130 + }, + { + "epoch": 0.23, + "grad_norm": 0.5049256086349487, + "learning_rate": 0.0009873302522064972, + "loss": 0.1484, + "step": 131 + }, + { + "epoch": 0.23, + "grad_norm": 1.4133671522140503, + "learning_rate": 0.0009871208769257685, + "loss": 0.1736, + "step": 132 + }, + { + "epoch": 0.24, + "grad_norm": 0.7930824756622314, + "learning_rate": 0.0009869098083347323, + "loss": 0.1543, + "step": 133 + }, + { + "epoch": 0.24, + "grad_norm": 0.5717449188232422, + "learning_rate": 0.0009866970471670965, + "loss": 0.1338, + "step": 134 + }, + { + "epoch": 0.24, + "grad_norm": 0.582081139087677, + "learning_rate": 0.0009864825941624537, + "loss": 0.1692, + "step": 135 + }, + { + "epoch": 0.24, + "grad_norm": 10.226588249206543, + "learning_rate": 0.0009862664500662763, + "loss": 0.2425, + "step": 136 + }, + { + "epoch": 0.24, + "grad_norm": 1.1186953783035278, + "learning_rate": 0.0009860486156299164, + "loss": 0.2052, + "step": 137 + }, + { + "epoch": 0.24, + "grad_norm": 0.2953661382198334, + "learning_rate": 0.000985829091610601, + "loss": 0.1408, + "step": 138 + }, + { + "epoch": 0.25, + "grad_norm": 0.8647088408470154, + "learning_rate": 0.000985607878771431, + "loss": 0.1571, + "step": 139 + }, + { + "epoch": 0.25, + "grad_norm": 0.41964420676231384, + "learning_rate": 0.0009853849778813776, + "loss": 0.1477, + "step": 140 + }, + { + "epoch": 0.25, + "grad_norm": 0.25675931572914124, + "learning_rate": 0.0009851603897152803, + "loss": 0.1398, + "step": 141 + }, + { + "epoch": 0.25, + "grad_norm": 0.2311631143093109, + "learning_rate": 0.0009849341150538434, + "loss": 0.1432, + "step": 142 + }, + { + "epoch": 0.25, + "eval_loss": 1.5366541147232056, + "eval_runtime": 14.6962, + "eval_samples_per_second": 32.457, + "eval_steps_per_second": 8.165, + "step": 142 + }, + { + "epoch": 0.25, + "grad_norm": 41.83562469482422, + "learning_rate": 0.0009847061546836339, + "loss": 1.1525, + "step": 143 + }, + { + "epoch": 0.25, + "grad_norm": 0.27440375089645386, + "learning_rate": 0.0009844765093970787, + "loss": 0.1452, + "step": 144 + }, + { + "epoch": 0.26, + "grad_norm": 0.27643319964408875, + "learning_rate": 0.0009842451799924616, + "loss": 0.1069, + "step": 145 + }, + { + "epoch": 0.26, + "grad_norm": 0.21519601345062256, + "learning_rate": 0.0009840121672739207, + "loss": 0.1358, + "step": 146 + }, + { + "epoch": 0.26, + "grad_norm": 0.4073689877986908, + "learning_rate": 0.0009837774720514456, + "loss": 0.1545, + "step": 147 + }, + { + "epoch": 0.26, + "grad_norm": 0.13685636222362518, + "learning_rate": 0.0009835410951408747, + "loss": 0.1259, + "step": 148 + }, + { + "epoch": 0.26, + "grad_norm": 0.07474564015865326, + "learning_rate": 0.000983303037363892, + "loss": 0.1356, + "step": 149 + }, + { + "epoch": 0.27, + "grad_norm": 0.45116662979125977, + "learning_rate": 0.0009830632995480241, + "loss": 0.1379, + "step": 150 + }, + { + "epoch": 0.27, + "grad_norm": 0.1297813504934311, + "learning_rate": 0.0009828218825266388, + "loss": 0.1343, + "step": 151 + }, + { + "epoch": 0.27, + "grad_norm": 0.5846492052078247, + "learning_rate": 0.00098257878713894, + "loss": 0.1563, + "step": 152 + }, + { + "epoch": 0.27, + "grad_norm": 0.38457778096199036, + "learning_rate": 0.0009823340142299662, + "loss": 0.1477, + "step": 153 + }, + { + "epoch": 0.27, + "grad_norm": 0.09184035658836365, + "learning_rate": 0.0009820875646505873, + "loss": 0.1376, + "step": 154 + }, + { + "epoch": 0.27, + "grad_norm": 0.5166211128234863, + "learning_rate": 0.0009818394392575019, + "loss": 0.1498, + "step": 155 + }, + { + "epoch": 0.28, + "grad_norm": 0.2788640260696411, + "learning_rate": 0.0009815896389132332, + "loss": 0.1434, + "step": 156 + }, + { + "epoch": 0.28, + "grad_norm": 0.3762676417827606, + "learning_rate": 0.0009813381644861276, + "loss": 0.1482, + "step": 157 + }, + { + "epoch": 0.28, + "grad_norm": 0.3615610897541046, + "learning_rate": 0.0009810850168503506, + "loss": 0.1312, + "step": 158 + }, + { + "epoch": 0.28, + "grad_norm": 0.03483320027589798, + "learning_rate": 0.0009808301968858837, + "loss": 0.1239, + "step": 159 + }, + { + "epoch": 0.28, + "grad_norm": 0.5616227984428406, + "learning_rate": 0.0009805737054785222, + "loss": 0.1881, + "step": 160 + }, + { + "epoch": 0.28, + "grad_norm": 0.029542161151766777, + "learning_rate": 0.000980315543519871, + "loss": 0.1254, + "step": 161 + }, + { + "epoch": 0.29, + "grad_norm": 0.142581045627594, + "learning_rate": 0.0009800557119073433, + "loss": 0.1258, + "step": 162 + }, + { + "epoch": 0.29, + "grad_norm": 0.7289375066757202, + "learning_rate": 0.0009797942115441546, + "loss": 0.1526, + "step": 163 + }, + { + "epoch": 0.29, + "grad_norm": 0.6975064873695374, + "learning_rate": 0.0009795310433393224, + "loss": 0.1487, + "step": 164 + }, + { + "epoch": 0.29, + "grad_norm": 1.3072260618209839, + "learning_rate": 0.0009792662082076617, + "loss": 0.1712, + "step": 165 + }, + { + "epoch": 0.29, + "grad_norm": 0.2993917465209961, + "learning_rate": 0.000978999707069782, + "loss": 0.1424, + "step": 166 + }, + { + "epoch": 0.3, + "grad_norm": 0.3258236050605774, + "learning_rate": 0.0009787315408520839, + "loss": 0.135, + "step": 167 + }, + { + "epoch": 0.3, + "grad_norm": 0.26566603779792786, + "learning_rate": 0.000978461710486756, + "loss": 0.1441, + "step": 168 + }, + { + "epoch": 0.3, + "grad_norm": 1.1709599494934082, + "learning_rate": 0.0009781902169117718, + "loss": 0.2084, + "step": 169 + }, + { + "epoch": 0.3, + "grad_norm": 0.6554279923439026, + "learning_rate": 0.000977917061070887, + "loss": 0.1634, + "step": 170 + }, + { + "epoch": 0.3, + "grad_norm": 0.1635073721408844, + "learning_rate": 0.000977642243913635, + "loss": 0.1371, + "step": 171 + }, + { + "epoch": 0.3, + "grad_norm": 0.4419834017753601, + "learning_rate": 0.0009773657663953242, + "loss": 0.1523, + "step": 172 + }, + { + "epoch": 0.31, + "grad_norm": 0.839259147644043, + "learning_rate": 0.000977087629477035, + "loss": 0.1628, + "step": 173 + }, + { + "epoch": 0.31, + "grad_norm": 0.1979222148656845, + "learning_rate": 0.0009768078341256155, + "loss": 0.1367, + "step": 174 + }, + { + "epoch": 0.31, + "grad_norm": 0.2939910888671875, + "learning_rate": 0.0009765263813136795, + "loss": 0.1349, + "step": 175 + }, + { + "epoch": 0.31, + "grad_norm": 0.19882674515247345, + "learning_rate": 0.0009762432720196024, + "loss": 0.1424, + "step": 176 + }, + { + "epoch": 0.31, + "grad_norm": 0.07146954536437988, + "learning_rate": 0.000975958507227517, + "loss": 0.1237, + "step": 177 + }, + { + "epoch": 0.31, + "grad_norm": 0.5031868815422058, + "learning_rate": 0.0009756720879273117, + "loss": 0.1592, + "step": 178 + }, + { + "epoch": 0.32, + "grad_norm": 0.14860151708126068, + "learning_rate": 0.0009753840151146258, + "loss": 0.1396, + "step": 179 + }, + { + "epoch": 0.32, + "grad_norm": 0.10280521959066391, + "learning_rate": 0.0009750942897908468, + "loss": 0.1333, + "step": 180 + }, + { + "epoch": 0.32, + "grad_norm": 0.4652903974056244, + "learning_rate": 0.0009748029129631061, + "loss": 0.1421, + "step": 181 + }, + { + "epoch": 0.32, + "grad_norm": 0.3985591530799866, + "learning_rate": 0.0009745098856442768, + "loss": 0.1459, + "step": 182 + }, + { + "epoch": 0.32, + "grad_norm": 0.20321591198444366, + "learning_rate": 0.0009742152088529683, + "loss": 0.1381, + "step": 183 + }, + { + "epoch": 0.33, + "grad_norm": 0.7694361805915833, + "learning_rate": 0.0009739188836135246, + "loss": 0.1676, + "step": 184 + }, + { + "epoch": 0.33, + "grad_norm": 0.04469340294599533, + "learning_rate": 0.0009736209109560201, + "loss": 0.136, + "step": 185 + }, + { + "epoch": 0.33, + "grad_norm": 0.08576061576604843, + "learning_rate": 0.0009733212919162549, + "loss": 0.1408, + "step": 186 + }, + { + "epoch": 0.33, + "grad_norm": 0.042906519025564194, + "learning_rate": 0.0009730200275357535, + "loss": 0.1364, + "step": 187 + }, + { + "epoch": 0.33, + "grad_norm": 0.30054494738578796, + "learning_rate": 0.0009727171188617588, + "loss": 0.1539, + "step": 188 + }, + { + "epoch": 0.33, + "grad_norm": 0.05149005725979805, + "learning_rate": 0.0009724125669472299, + "loss": 0.1352, + "step": 189 + }, + { + "epoch": 0.34, + "grad_norm": 0.1381620466709137, + "learning_rate": 0.0009721063728508383, + "loss": 0.1409, + "step": 190 + }, + { + "epoch": 0.34, + "grad_norm": 0.37344205379486084, + "learning_rate": 0.0009717985376369639, + "loss": 0.1299, + "step": 191 + }, + { + "epoch": 0.34, + "grad_norm": 0.1037706583738327, + "learning_rate": 0.0009714890623756912, + "loss": 0.1341, + "step": 192 + }, + { + "epoch": 0.34, + "grad_norm": 0.14189712703227997, + "learning_rate": 0.0009711779481428056, + "loss": 0.1418, + "step": 193 + }, + { + "epoch": 0.34, + "grad_norm": 0.15108801424503326, + "learning_rate": 0.0009708651960197903, + "loss": 0.142, + "step": 194 + }, + { + "epoch": 0.34, + "grad_norm": 0.037045519798994064, + "learning_rate": 0.0009705508070938218, + "loss": 0.1315, + "step": 195 + }, + { + "epoch": 0.35, + "grad_norm": 0.23301652073860168, + "learning_rate": 0.0009702347824577666, + "loss": 0.1396, + "step": 196 + }, + { + "epoch": 0.35, + "grad_norm": 0.08476269990205765, + "learning_rate": 0.0009699171232101768, + "loss": 0.1392, + "step": 197 + }, + { + "epoch": 0.35, + "grad_norm": 0.4222690463066101, + "learning_rate": 0.000969597830455287, + "loss": 0.1463, + "step": 198 + }, + { + "epoch": 0.35, + "grad_norm": 0.3234136402606964, + "learning_rate": 0.0009692769053030099, + "loss": 0.1257, + "step": 199 + }, + { + "epoch": 0.35, + "grad_norm": 0.04025443643331528, + "learning_rate": 0.0009689543488689332, + "loss": 0.1303, + "step": 200 + }, + { + "epoch": 0.36, + "grad_norm": 0.07074520736932755, + "learning_rate": 0.0009686301622743144, + "loss": 0.1289, + "step": 201 + }, + { + "epoch": 0.36, + "grad_norm": 0.0788850486278534, + "learning_rate": 0.0009683043466460782, + "loss": 0.1236, + "step": 202 + }, + { + "epoch": 0.36, + "grad_norm": 0.525541365146637, + "learning_rate": 0.000967976903116812, + "loss": 0.1564, + "step": 203 + }, + { + "epoch": 0.36, + "grad_norm": 0.6145509481430054, + "learning_rate": 0.0009676478328247623, + "loss": 0.156, + "step": 204 + }, + { + "epoch": 0.36, + "grad_norm": 0.230132058262825, + "learning_rate": 0.0009673171369138296, + "loss": 0.1425, + "step": 205 + }, + { + "epoch": 0.36, + "grad_norm": 0.03262978792190552, + "learning_rate": 0.0009669848165335666, + "loss": 0.1297, + "step": 206 + }, + { + "epoch": 0.37, + "grad_norm": 0.0462469644844532, + "learning_rate": 0.0009666508728391718, + "loss": 0.1177, + "step": 207 + }, + { + "epoch": 0.37, + "grad_norm": 0.06880385428667068, + "learning_rate": 0.0009663153069914874, + "loss": 0.1207, + "step": 208 + }, + { + "epoch": 0.37, + "grad_norm": 0.4248260259628296, + "learning_rate": 0.000965978120156994, + "loss": 0.1571, + "step": 209 + }, + { + "epoch": 0.37, + "grad_norm": 0.060492075979709625, + "learning_rate": 0.0009656393135078068, + "loss": 0.1219, + "step": 210 + }, + { + "epoch": 0.37, + "grad_norm": 0.12135621905326843, + "learning_rate": 0.0009652988882216725, + "loss": 0.1323, + "step": 211 + }, + { + "epoch": 0.38, + "grad_norm": 0.252119243144989, + "learning_rate": 0.0009649568454819637, + "loss": 0.1366, + "step": 212 + }, + { + "epoch": 0.38, + "grad_norm": 0.5283567905426025, + "learning_rate": 0.0009646131864776761, + "loss": 0.1246, + "step": 213 + }, + { + "epoch": 0.38, + "grad_norm": 2.224665880203247, + "learning_rate": 0.0009642679124034233, + "loss": 0.2582, + "step": 214 + }, + { + "epoch": 0.38, + "grad_norm": 1.9277523756027222, + "learning_rate": 0.0009639210244594335, + "loss": 0.2131, + "step": 215 + }, + { + "epoch": 0.38, + "grad_norm": 0.5668452978134155, + "learning_rate": 0.0009635725238515446, + "loss": 0.141, + "step": 216 + }, + { + "epoch": 0.38, + "grad_norm": 0.13912492990493774, + "learning_rate": 0.000963222411791201, + "loss": 0.1418, + "step": 217 + }, + { + "epoch": 0.39, + "grad_norm": 0.39307814836502075, + "learning_rate": 0.0009628706894954479, + "loss": 0.1477, + "step": 218 + }, + { + "epoch": 0.39, + "grad_norm": 0.26248928904533386, + "learning_rate": 0.000962517358186929, + "loss": 0.1315, + "step": 219 + }, + { + "epoch": 0.39, + "grad_norm": 0.2875257730484009, + "learning_rate": 0.0009621624190938803, + "loss": 0.1321, + "step": 220 + }, + { + "epoch": 0.39, + "grad_norm": 0.6386964917182922, + "learning_rate": 0.0009618058734501269, + "loss": 0.1668, + "step": 221 + }, + { + "epoch": 0.39, + "grad_norm": 0.16165001690387726, + "learning_rate": 0.0009614477224950789, + "loss": 0.1272, + "step": 222 + }, + { + "epoch": 0.39, + "grad_norm": 0.6959558129310608, + "learning_rate": 0.0009610879674737262, + "loss": 0.1381, + "step": 223 + }, + { + "epoch": 0.4, + "grad_norm": 0.1701437532901764, + "learning_rate": 0.0009607266096366352, + "loss": 0.1366, + "step": 224 + }, + { + "epoch": 0.4, + "grad_norm": 0.2511409819126129, + "learning_rate": 0.0009603636502399437, + "loss": 0.126, + "step": 225 + }, + { + "epoch": 0.4, + "grad_norm": 0.04554220288991928, + "learning_rate": 0.0009599990905453566, + "loss": 0.1321, + "step": 226 + }, + { + "epoch": 0.4, + "grad_norm": 0.3964705765247345, + "learning_rate": 0.000959632931820142, + "loss": 0.1383, + "step": 227 + }, + { + "epoch": 0.4, + "grad_norm": 0.10925984382629395, + "learning_rate": 0.0009592651753371264, + "loss": 0.1226, + "step": 228 + }, + { + "epoch": 0.41, + "grad_norm": 0.19012318551540375, + "learning_rate": 0.0009588958223746903, + "loss": 0.1255, + "step": 229 + }, + { + "epoch": 0.41, + "grad_norm": 0.23432157933712006, + "learning_rate": 0.0009585248742167639, + "loss": 0.1152, + "step": 230 + }, + { + "epoch": 0.41, + "grad_norm": 0.1737753301858902, + "learning_rate": 0.0009581523321528223, + "loss": 0.1468, + "step": 231 + }, + { + "epoch": 0.41, + "grad_norm": 0.2625434100627899, + "learning_rate": 0.0009577781974778817, + "loss": 0.1296, + "step": 232 + }, + { + "epoch": 0.41, + "grad_norm": 0.3056884706020355, + "learning_rate": 0.000957402471492494, + "loss": 0.1574, + "step": 233 + }, + { + "epoch": 0.41, + "grad_norm": 0.4111999273300171, + "learning_rate": 0.0009570251555027432, + "loss": 0.1434, + "step": 234 + }, + { + "epoch": 0.42, + "grad_norm": 0.056673482060432434, + "learning_rate": 0.0009566462508202401, + "loss": 0.1337, + "step": 235 + }, + { + "epoch": 0.42, + "grad_norm": 0.3861597180366516, + "learning_rate": 0.0009562657587621184, + "loss": 0.1609, + "step": 236 + }, + { + "epoch": 0.42, + "grad_norm": 0.35893362760543823, + "learning_rate": 0.0009558836806510292, + "loss": 0.1189, + "step": 237 + }, + { + "epoch": 0.42, + "grad_norm": 0.40538331866264343, + "learning_rate": 0.0009555000178151374, + "loss": 0.1504, + "step": 238 + }, + { + "epoch": 0.42, + "grad_norm": 81.36141967773438, + "learning_rate": 0.0009551147715881167, + "loss": 4.7235, + "step": 239 + }, + { + "epoch": 0.42, + "grad_norm": 0.21178042888641357, + "learning_rate": 0.0009547279433091446, + "loss": 0.1139, + "step": 240 + }, + { + "epoch": 0.43, + "grad_norm": 0.27380529046058655, + "learning_rate": 0.0009543395343228983, + "loss": 0.1504, + "step": 241 + }, + { + "epoch": 0.43, + "grad_norm": 41.42683410644531, + "learning_rate": 0.0009539495459795498, + "loss": 1.2477, + "step": 242 + }, + { + "epoch": 0.43, + "grad_norm": 0.14853385090827942, + "learning_rate": 0.0009535579796347612, + "loss": 0.1343, + "step": 243 + }, + { + "epoch": 0.43, + "grad_norm": 0.3484509289264679, + "learning_rate": 0.0009531648366496798, + "loss": 0.15, + "step": 244 + }, + { + "epoch": 0.43, + "grad_norm": 0.20152732729911804, + "learning_rate": 0.0009527701183909336, + "loss": 0.1399, + "step": 245 + }, + { + "epoch": 0.44, + "grad_norm": 80.84031677246094, + "learning_rate": 0.000952373826230627, + "loss": 3.1939, + "step": 246 + }, + { + "epoch": 0.44, + "grad_norm": 15.475607872009277, + "learning_rate": 0.0009519759615463346, + "loss": 3.3935, + "step": 247 + }, + { + "epoch": 0.44, + "grad_norm": 77.19477081298828, + "learning_rate": 0.0009515765257210979, + "loss": 6.5034, + "step": 248 + }, + { + "epoch": 0.44, + "grad_norm": 0.1174071803689003, + "learning_rate": 0.0009511755201434205, + "loss": 0.1212, + "step": 249 + }, + { + "epoch": 0.44, + "grad_norm": 16.503982543945312, + "learning_rate": 0.0009507729462072614, + "loss": 0.3753, + "step": 250 + }, + { + "epoch": 0.44, + "grad_norm": 76.65412902832031, + "learning_rate": 0.0009503688053120326, + "loss": 0.9386, + "step": 251 + }, + { + "epoch": 0.45, + "grad_norm": 94.82160186767578, + "learning_rate": 0.0009499630988625925, + "loss": 4.7449, + "step": 252 + }, + { + "epoch": 0.45, + "grad_norm": 0.2721010148525238, + "learning_rate": 0.0009495558282692421, + "loss": 0.1358, + "step": 253 + }, + { + "epoch": 0.45, + "grad_norm": 0.5150814056396484, + "learning_rate": 0.0009491469949477187, + "loss": 0.1622, + "step": 254 + }, + { + "epoch": 0.45, + "grad_norm": 51.050167083740234, + "learning_rate": 0.0009487366003191931, + "loss": 0.7818, + "step": 255 + }, + { + "epoch": 0.45, + "grad_norm": 11.698090553283691, + "learning_rate": 0.0009483246458102625, + "loss": 0.3862, + "step": 256 + }, + { + "epoch": 0.45, + "grad_norm": 0.648543655872345, + "learning_rate": 0.0009479111328529472, + "loss": 0.1884, + "step": 257 + }, + { + "epoch": 0.46, + "grad_norm": 0.745293140411377, + "learning_rate": 0.0009474960628846843, + "loss": 0.1562, + "step": 258 + }, + { + "epoch": 0.46, + "grad_norm": 0.17890043556690216, + "learning_rate": 0.0009470794373483235, + "loss": 0.1425, + "step": 259 + }, + { + "epoch": 0.46, + "grad_norm": 0.5058090686798096, + "learning_rate": 0.0009466612576921223, + "loss": 0.17, + "step": 260 + }, + { + "epoch": 0.46, + "grad_norm": 1.3177820444107056, + "learning_rate": 0.00094624152536974, + "loss": 0.15, + "step": 261 + }, + { + "epoch": 0.46, + "grad_norm": 0.49652573466300964, + "learning_rate": 0.0009458202418402337, + "loss": 0.145, + "step": 262 + }, + { + "epoch": 0.47, + "grad_norm": 11.423394203186035, + "learning_rate": 0.0009453974085680526, + "loss": 0.349, + "step": 263 + }, + { + "epoch": 0.47, + "grad_norm": 1.5422337055206299, + "learning_rate": 0.0009449730270230326, + "loss": 0.211, + "step": 264 + }, + { + "epoch": 0.47, + "grad_norm": 103.68435668945312, + "learning_rate": 0.0009445470986803921, + "loss": 17.4069, + "step": 265 + }, + { + "epoch": 0.47, + "grad_norm": 54.51758575439453, + "learning_rate": 0.0009441196250207267, + "loss": 15.685, + "step": 266 + }, + { + "epoch": 0.47, + "grad_norm": 14.596623420715332, + "learning_rate": 0.0009436906075300032, + "loss": 0.791, + "step": 267 + }, + { + "epoch": 0.47, + "grad_norm": 3.3164780139923096, + "learning_rate": 0.000943260047699555, + "loss": 0.3611, + "step": 268 + }, + { + "epoch": 0.48, + "grad_norm": 0.3087855577468872, + "learning_rate": 0.0009428279470260776, + "loss": 0.1332, + "step": 269 + }, + { + "epoch": 0.48, + "grad_norm": 1.1544523239135742, + "learning_rate": 0.0009423943070116219, + "loss": 0.2405, + "step": 270 + }, + { + "epoch": 0.48, + "grad_norm": 0.27010253071784973, + "learning_rate": 0.00094195912916359, + "loss": 0.1241, + "step": 271 + }, + { + "epoch": 0.48, + "grad_norm": 0.2287709265947342, + "learning_rate": 0.0009415224149947306, + "loss": 0.1366, + "step": 272 + }, + { + "epoch": 0.48, + "grad_norm": 0.5216432809829712, + "learning_rate": 0.0009410841660231316, + "loss": 0.1641, + "step": 273 + }, + { + "epoch": 0.48, + "grad_norm": 1.3091949224472046, + "learning_rate": 0.0009406443837722167, + "loss": 0.2524, + "step": 274 + }, + { + "epoch": 0.49, + "grad_norm": 0.11813609302043915, + "learning_rate": 0.0009402030697707398, + "loss": 0.1353, + "step": 275 + }, + { + "epoch": 0.49, + "grad_norm": 1.3709551095962524, + "learning_rate": 0.000939760225552779, + "loss": 0.2714, + "step": 276 + }, + { + "epoch": 0.49, + "grad_norm": 8.527563095092773, + "learning_rate": 0.0009393158526577322, + "loss": 0.1955, + "step": 277 + }, + { + "epoch": 0.49, + "grad_norm": 21.874027252197266, + "learning_rate": 0.0009388699526303105, + "loss": 0.2398, + "step": 278 + }, + { + "epoch": 0.49, + "grad_norm": 51.793731689453125, + "learning_rate": 0.0009384225270205339, + "loss": 1.3069, + "step": 279 + }, + { + "epoch": 0.5, + "grad_norm": 0.6711062788963318, + "learning_rate": 0.0009379735773837259, + "loss": 0.1664, + "step": 280 + }, + { + "epoch": 0.5, + "grad_norm": 5.93789005279541, + "learning_rate": 0.0009375231052805072, + "loss": 0.2455, + "step": 281 + }, + { + "epoch": 0.5, + "grad_norm": 62.527198791503906, + "learning_rate": 0.0009370711122767912, + "loss": 6.6447, + "step": 282 + }, + { + "epoch": 0.5, + "grad_norm": 22.35348129272461, + "learning_rate": 0.000936617599943778, + "loss": 2.5015, + "step": 283 + }, + { + "epoch": 0.5, + "grad_norm": 0.7277780175209045, + "learning_rate": 0.0009361625698579493, + "loss": 0.1667, + "step": 284 + }, + { + "epoch": 0.5, + "eval_loss": 0.14179374277591705, + "eval_runtime": 14.7139, + "eval_samples_per_second": 32.418, + "eval_steps_per_second": 8.156, + "step": 284 + }, + { + "epoch": 0.5, + "grad_norm": 0.26271358132362366, + "learning_rate": 0.0009357060236010625, + "loss": 0.1429, + "step": 285 + }, + { + "epoch": 0.51, + "grad_norm": 21.24464988708496, + "learning_rate": 0.0009352479627601457, + "loss": 2.0706, + "step": 286 + }, + { + "epoch": 0.51, + "grad_norm": 6.5764265060424805, + "learning_rate": 0.0009347883889274922, + "loss": 0.3337, + "step": 287 + }, + { + "epoch": 0.51, + "grad_norm": 0.6868380904197693, + "learning_rate": 0.0009343273037006539, + "loss": 0.1994, + "step": 288 + }, + { + "epoch": 0.51, + "grad_norm": 0.9018234610557556, + "learning_rate": 0.0009338647086824372, + "loss": 0.1908, + "step": 289 + }, + { + "epoch": 0.51, + "grad_norm": 1.7751502990722656, + "learning_rate": 0.0009334006054808966, + "loss": 0.2028, + "step": 290 + }, + { + "epoch": 0.51, + "grad_norm": 0.5386408567428589, + "learning_rate": 0.0009329349957093293, + "loss": 0.1853, + "step": 291 + }, + { + "epoch": 0.52, + "grad_norm": 1.4171103239059448, + "learning_rate": 0.0009324678809862695, + "loss": 0.3597, + "step": 292 + }, + { + "epoch": 0.52, + "grad_norm": 0.4105970561504364, + "learning_rate": 0.0009319992629354827, + "loss": 0.1344, + "step": 293 + }, + { + "epoch": 0.52, + "grad_norm": 0.26628127694129944, + "learning_rate": 0.000931529143185961, + "loss": 0.1453, + "step": 294 + }, + { + "epoch": 0.52, + "grad_norm": 14.981964111328125, + "learning_rate": 0.0009310575233719154, + "loss": 0.2563, + "step": 295 + }, + { + "epoch": 0.52, + "grad_norm": 0.6945788264274597, + "learning_rate": 0.0009305844051327725, + "loss": 0.1229, + "step": 296 + }, + { + "epoch": 0.53, + "grad_norm": 31.034496307373047, + "learning_rate": 0.000930109790113167, + "loss": 1.2974, + "step": 297 + }, + { + "epoch": 0.53, + "grad_norm": 1.5794603824615479, + "learning_rate": 0.0009296336799629368, + "loss": 0.22, + "step": 298 + }, + { + "epoch": 0.53, + "grad_norm": 0.33219394087791443, + "learning_rate": 0.0009291560763371172, + "loss": 0.1262, + "step": 299 + }, + { + "epoch": 0.53, + "grad_norm": 2.597118377685547, + "learning_rate": 0.000928676980895935, + "loss": 0.4026, + "step": 300 + }, + { + "epoch": 0.53, + "grad_norm": 13.547090530395508, + "learning_rate": 0.0009281963953048029, + "loss": 1.3086, + "step": 301 + }, + { + "epoch": 0.53, + "grad_norm": 1.289302945137024, + "learning_rate": 0.0009277143212343134, + "loss": 0.2215, + "step": 302 + }, + { + "epoch": 0.54, + "grad_norm": 1.2176313400268555, + "learning_rate": 0.0009272307603602334, + "loss": 0.15, + "step": 303 + }, + { + "epoch": 0.54, + "grad_norm": 4.436944007873535, + "learning_rate": 0.0009267457143634979, + "loss": 0.514, + "step": 304 + }, + { + "epoch": 0.54, + "grad_norm": 29.960241317749023, + "learning_rate": 0.0009262591849302047, + "loss": 3.5389, + "step": 305 + }, + { + "epoch": 0.54, + "grad_norm": 5.514049530029297, + "learning_rate": 0.0009257711737516082, + "loss": 0.2902, + "step": 306 + }, + { + "epoch": 0.54, + "grad_norm": 2.331019401550293, + "learning_rate": 0.0009252816825241135, + "loss": 0.2775, + "step": 307 + }, + { + "epoch": 0.54, + "grad_norm": 0.5708584189414978, + "learning_rate": 0.0009247907129492707, + "loss": 0.1438, + "step": 308 + }, + { + "epoch": 0.55, + "grad_norm": 2.16607928276062, + "learning_rate": 0.0009242982667337685, + "loss": 0.2383, + "step": 309 + }, + { + "epoch": 0.55, + "grad_norm": 1.5346423387527466, + "learning_rate": 0.0009238043455894293, + "loss": 0.1793, + "step": 310 + }, + { + "epoch": 0.55, + "grad_norm": 0.6200052499771118, + "learning_rate": 0.000923308951233202, + "loss": 0.1473, + "step": 311 + }, + { + "epoch": 0.55, + "grad_norm": 64.87612915039062, + "learning_rate": 0.0009228120853871572, + "loss": 0.8875, + "step": 312 + }, + { + "epoch": 0.55, + "grad_norm": 1.077471137046814, + "learning_rate": 0.0009223137497784797, + "loss": 0.2114, + "step": 313 + }, + { + "epoch": 0.56, + "grad_norm": 3.5934722423553467, + "learning_rate": 0.0009218139461394644, + "loss": 0.2852, + "step": 314 + }, + { + "epoch": 0.56, + "grad_norm": 0.10276800394058228, + "learning_rate": 0.0009213126762075088, + "loss": 0.1365, + "step": 315 + }, + { + "epoch": 0.56, + "grad_norm": 3.9422831535339355, + "learning_rate": 0.0009208099417251077, + "loss": 0.2949, + "step": 316 + }, + { + "epoch": 0.56, + "grad_norm": 1.7574914693832397, + "learning_rate": 0.0009203057444398468, + "loss": 0.2621, + "step": 317 + }, + { + "epoch": 0.56, + "grad_norm": 0.29479530453681946, + "learning_rate": 0.0009198000861043967, + "loss": 0.1341, + "step": 318 + }, + { + "epoch": 0.56, + "grad_norm": 0.5362827181816101, + "learning_rate": 0.0009192929684765068, + "loss": 0.1398, + "step": 319 + }, + { + "epoch": 0.57, + "grad_norm": 0.8159481287002563, + "learning_rate": 0.0009187843933189994, + "loss": 0.1863, + "step": 320 + }, + { + "epoch": 0.57, + "grad_norm": 0.9413295388221741, + "learning_rate": 0.0009182743623997634, + "loss": 0.2104, + "step": 321 + }, + { + "epoch": 0.57, + "grad_norm": 0.5306220650672913, + "learning_rate": 0.0009177628774917479, + "loss": 0.1537, + "step": 322 + }, + { + "epoch": 0.57, + "grad_norm": 0.8887706398963928, + "learning_rate": 0.0009172499403729567, + "loss": 0.1963, + "step": 323 + }, + { + "epoch": 0.57, + "grad_norm": 0.8467744588851929, + "learning_rate": 0.0009167355528264414, + "loss": 0.204, + "step": 324 + }, + { + "epoch": 0.57, + "grad_norm": 0.19867151975631714, + "learning_rate": 0.0009162197166402956, + "loss": 0.1407, + "step": 325 + }, + { + "epoch": 0.58, + "grad_norm": 0.13638383150100708, + "learning_rate": 0.0009157024336076487, + "loss": 0.1408, + "step": 326 + }, + { + "epoch": 0.58, + "grad_norm": 0.2027496099472046, + "learning_rate": 0.0009151837055266594, + "loss": 0.1444, + "step": 327 + }, + { + "epoch": 0.58, + "grad_norm": 0.370151549577713, + "learning_rate": 0.0009146635342005098, + "loss": 0.158, + "step": 328 + }, + { + "epoch": 0.58, + "grad_norm": 0.3114052414894104, + "learning_rate": 0.000914141921437399, + "loss": 0.1464, + "step": 329 + }, + { + "epoch": 0.58, + "grad_norm": 0.15394961833953857, + "learning_rate": 0.0009136188690505362, + "loss": 0.1341, + "step": 330 + }, + { + "epoch": 0.59, + "grad_norm": 0.46498528122901917, + "learning_rate": 0.0009130943788581359, + "loss": 0.1426, + "step": 331 + }, + { + "epoch": 0.59, + "grad_norm": 0.28067877888679504, + "learning_rate": 0.00091256845268341, + "loss": 0.1409, + "step": 332 + }, + { + "epoch": 0.59, + "grad_norm": 0.061186857521533966, + "learning_rate": 0.0009120410923545619, + "loss": 0.1401, + "step": 333 + }, + { + "epoch": 0.59, + "grad_norm": 0.26736098527908325, + "learning_rate": 0.0009115122997047811, + "loss": 0.1467, + "step": 334 + }, + { + "epoch": 0.59, + "grad_norm": 0.5139696598052979, + "learning_rate": 0.0009109820765722356, + "loss": 0.1585, + "step": 335 + }, + { + "epoch": 0.59, + "grad_norm": 0.40007275342941284, + "learning_rate": 0.000910450424800066, + "loss": 0.1473, + "step": 336 + }, + { + "epoch": 0.6, + "grad_norm": 0.66825270652771, + "learning_rate": 0.0009099173462363792, + "loss": 0.1572, + "step": 337 + }, + { + "epoch": 0.6, + "grad_norm": 0.5313024520874023, + "learning_rate": 0.0009093828427342418, + "loss": 0.1555, + "step": 338 + }, + { + "epoch": 0.6, + "grad_norm": 0.4224655330181122, + "learning_rate": 0.0009088469161516735, + "loss": 0.1429, + "step": 339 + }, + { + "epoch": 0.6, + "grad_norm": 0.03462248668074608, + "learning_rate": 0.0009083095683516414, + "loss": 0.1325, + "step": 340 + }, + { + "epoch": 0.6, + "grad_norm": 0.542322039604187, + "learning_rate": 0.0009077708012020524, + "loss": 0.1755, + "step": 341 + }, + { + "epoch": 0.61, + "grad_norm": 0.2164747267961502, + "learning_rate": 0.0009072306165757476, + "loss": 0.1458, + "step": 342 + }, + { + "epoch": 0.61, + "grad_norm": 0.27414461970329285, + "learning_rate": 0.0009066890163504955, + "loss": 0.1512, + "step": 343 + }, + { + "epoch": 0.61, + "grad_norm": 0.1911482959985733, + "learning_rate": 0.0009061460024089853, + "loss": 0.1185, + "step": 344 + }, + { + "epoch": 0.61, + "grad_norm": 0.1287711262702942, + "learning_rate": 0.0009056015766388205, + "loss": 0.1372, + "step": 345 + }, + { + "epoch": 0.61, + "grad_norm": 0.18598809838294983, + "learning_rate": 0.0009050557409325125, + "loss": 0.1341, + "step": 346 + }, + { + "epoch": 0.61, + "grad_norm": 0.18694853782653809, + "learning_rate": 0.0009045084971874737, + "loss": 0.141, + "step": 347 + }, + { + "epoch": 0.62, + "grad_norm": 0.06479912996292114, + "learning_rate": 0.0009039598473060113, + "loss": 0.1368, + "step": 348 + }, + { + "epoch": 0.62, + "grad_norm": 0.17768733203411102, + "learning_rate": 0.0009034097931953201, + "loss": 0.1381, + "step": 349 + }, + { + "epoch": 0.62, + "grad_norm": 0.28938984870910645, + "learning_rate": 0.0009028583367674765, + "loss": 0.1365, + "step": 350 + }, + { + "epoch": 0.62, + "grad_norm": 0.2924034893512726, + "learning_rate": 0.0009023054799394316, + "loss": 0.1282, + "step": 351 + }, + { + "epoch": 0.62, + "grad_norm": 0.28439652919769287, + "learning_rate": 0.0009017512246330042, + "loss": 0.151, + "step": 352 + }, + { + "epoch": 0.62, + "grad_norm": 0.14329224824905396, + "learning_rate": 0.0009011955727748749, + "loss": 0.1419, + "step": 353 + }, + { + "epoch": 0.63, + "grad_norm": 0.15245947241783142, + "learning_rate": 0.0009006385262965785, + "loss": 0.1163, + "step": 354 + }, + { + "epoch": 0.63, + "grad_norm": 0.052399642765522, + "learning_rate": 0.000900080087134498, + "loss": 0.1241, + "step": 355 + }, + { + "epoch": 0.63, + "grad_norm": 0.030301153659820557, + "learning_rate": 0.0008995202572298575, + "loss": 0.1232, + "step": 356 + }, + { + "epoch": 0.63, + "grad_norm": 0.41738417744636536, + "learning_rate": 0.0008989590385287155, + "loss": 0.1675, + "step": 357 + }, + { + "epoch": 0.63, + "grad_norm": 0.19307875633239746, + "learning_rate": 0.0008983964329819583, + "loss": 0.1328, + "step": 358 + }, + { + "epoch": 0.64, + "grad_norm": 0.05682377517223358, + "learning_rate": 0.000897832442545293, + "loss": 0.1322, + "step": 359 + }, + { + "epoch": 0.64, + "grad_norm": 0.15418089926242828, + "learning_rate": 0.0008972670691792409, + "loss": 0.1414, + "step": 360 + }, + { + "epoch": 0.64, + "grad_norm": 0.07167459279298782, + "learning_rate": 0.0008967003148491304, + "loss": 0.1414, + "step": 361 + }, + { + "epoch": 0.64, + "grad_norm": 0.2866109609603882, + "learning_rate": 0.0008961321815250904, + "loss": 0.1381, + "step": 362 + }, + { + "epoch": 0.64, + "grad_norm": 0.281264990568161, + "learning_rate": 0.0008955626711820438, + "loss": 0.1365, + "step": 363 + }, + { + "epoch": 0.64, + "grad_norm": 0.19263768196105957, + "learning_rate": 0.0008949917857996997, + "loss": 0.1394, + "step": 364 + }, + { + "epoch": 0.65, + "grad_norm": 0.30531641840934753, + "learning_rate": 0.0008944195273625471, + "loss": 0.1478, + "step": 365 + }, + { + "epoch": 0.65, + "grad_norm": 0.16229306161403656, + "learning_rate": 0.0008938458978598483, + "loss": 0.1412, + "step": 366 + }, + { + "epoch": 0.65, + "grad_norm": 0.09315463900566101, + "learning_rate": 0.0008932708992856315, + "loss": 0.1397, + "step": 367 + }, + { + "epoch": 0.65, + "grad_norm": 0.04228806868195534, + "learning_rate": 0.0008926945336386838, + "loss": 0.1383, + "step": 368 + }, + { + "epoch": 0.65, + "grad_norm": 0.2209407389163971, + "learning_rate": 0.0008921168029225448, + "loss": 0.1434, + "step": 369 + }, + { + "epoch": 0.65, + "grad_norm": 0.04254443198442459, + "learning_rate": 0.0008915377091454992, + "loss": 0.1326, + "step": 370 + }, + { + "epoch": 0.66, + "grad_norm": 0.09651175886392593, + "learning_rate": 0.0008909572543205698, + "loss": 0.134, + "step": 371 + }, + { + "epoch": 0.66, + "grad_norm": 0.2821654975414276, + "learning_rate": 0.0008903754404655105, + "loss": 0.1498, + "step": 372 + }, + { + "epoch": 0.66, + "grad_norm": 0.43042680621147156, + "learning_rate": 0.0008897922696027998, + "loss": 0.1571, + "step": 373 + }, + { + "epoch": 0.66, + "grad_norm": 0.06591568142175674, + "learning_rate": 0.0008892077437596332, + "loss": 0.1391, + "step": 374 + }, + { + "epoch": 0.66, + "grad_norm": 0.08771979063749313, + "learning_rate": 0.0008886218649679161, + "loss": 0.1375, + "step": 375 + }, + { + "epoch": 0.67, + "grad_norm": 0.03339942544698715, + "learning_rate": 0.0008880346352642574, + "loss": 0.1368, + "step": 376 + }, + { + "epoch": 0.67, + "grad_norm": 0.15352453291416168, + "learning_rate": 0.0008874460566899616, + "loss": 0.1447, + "step": 377 + }, + { + "epoch": 0.67, + "grad_norm": 0.1778584122657776, + "learning_rate": 0.0008868561312910222, + "loss": 0.1189, + "step": 378 + }, + { + "epoch": 0.67, + "grad_norm": 0.11893154680728912, + "learning_rate": 0.0008862648611181144, + "loss": 0.1167, + "step": 379 + }, + { + "epoch": 0.67, + "grad_norm": 0.4323861598968506, + "learning_rate": 0.0008856722482265886, + "loss": 0.1691, + "step": 380 + }, + { + "epoch": 0.67, + "grad_norm": 0.28813356161117554, + "learning_rate": 0.0008850782946764618, + "loss": 0.1505, + "step": 381 + }, + { + "epoch": 0.68, + "grad_norm": 0.5008757710456848, + "learning_rate": 0.0008844830025324122, + "loss": 0.1671, + "step": 382 + }, + { + "epoch": 0.68, + "grad_norm": 0.12061876803636551, + "learning_rate": 0.0008838863738637705, + "loss": 0.1375, + "step": 383 + }, + { + "epoch": 0.68, + "grad_norm": 0.6747052073478699, + "learning_rate": 0.0008832884107445138, + "loss": 0.1663, + "step": 384 + }, + { + "epoch": 0.68, + "grad_norm": 0.18846777081489563, + "learning_rate": 0.0008826891152532579, + "loss": 0.1148, + "step": 385 + }, + { + "epoch": 0.68, + "grad_norm": 0.0950111448764801, + "learning_rate": 0.0008820884894732497, + "loss": 0.1138, + "step": 386 + }, + { + "epoch": 0.68, + "grad_norm": 0.42371127009391785, + "learning_rate": 0.0008814865354923613, + "loss": 0.142, + "step": 387 + }, + { + "epoch": 0.69, + "grad_norm": 0.17662374675273895, + "learning_rate": 0.0008808832554030808, + "loss": 0.1255, + "step": 388 + }, + { + "epoch": 0.69, + "grad_norm": 0.7766286134719849, + "learning_rate": 0.0008802786513025068, + "loss": 0.1613, + "step": 389 + }, + { + "epoch": 0.69, + "grad_norm": 0.49581214785575867, + "learning_rate": 0.0008796727252923403, + "loss": 0.1346, + "step": 390 + }, + { + "epoch": 0.69, + "grad_norm": 0.6148929595947266, + "learning_rate": 0.0008790654794788768, + "loss": 0.1426, + "step": 391 + }, + { + "epoch": 0.69, + "grad_norm": 0.15860037505626678, + "learning_rate": 0.0008784569159730007, + "loss": 0.1382, + "step": 392 + }, + { + "epoch": 0.7, + "grad_norm": 0.6793199777603149, + "learning_rate": 0.0008778470368901761, + "loss": 0.1398, + "step": 393 + }, + { + "epoch": 0.7, + "grad_norm": 0.40314817428588867, + "learning_rate": 0.0008772358443504404, + "loss": 0.1428, + "step": 394 + }, + { + "epoch": 0.7, + "grad_norm": 0.6403933167457581, + "learning_rate": 0.0008766233404783974, + "loss": 0.1556, + "step": 395 + }, + { + "epoch": 0.7, + "grad_norm": 0.33554157614707947, + "learning_rate": 0.0008760095274032083, + "loss": 0.1439, + "step": 396 + }, + { + "epoch": 0.7, + "grad_norm": 0.45690324902534485, + "learning_rate": 0.000875394407258586, + "loss": 0.1374, + "step": 397 + }, + { + "epoch": 0.7, + "grad_norm": 0.0541120283305645, + "learning_rate": 0.0008747779821827868, + "loss": 0.1314, + "step": 398 + }, + { + "epoch": 0.71, + "grad_norm": 0.6533159613609314, + "learning_rate": 0.0008741602543186031, + "loss": 0.169, + "step": 399 + }, + { + "epoch": 0.71, + "grad_norm": 0.4919282793998718, + "learning_rate": 0.0008735412258133561, + "loss": 0.1569, + "step": 400 + }, + { + "epoch": 0.71, + "grad_norm": 0.30325594544410706, + "learning_rate": 0.0008729208988188881, + "loss": 0.1471, + "step": 401 + }, + { + "epoch": 0.71, + "grad_norm": 0.3497300148010254, + "learning_rate": 0.0008722992754915554, + "loss": 0.1457, + "step": 402 + }, + { + "epoch": 0.71, + "grad_norm": 0.22892774641513824, + "learning_rate": 0.0008716763579922203, + "loss": 0.1334, + "step": 403 + }, + { + "epoch": 0.71, + "grad_norm": 0.20050272345542908, + "learning_rate": 0.0008710521484862439, + "loss": 0.1446, + "step": 404 + }, + { + "epoch": 0.72, + "grad_norm": 0.5029633641242981, + "learning_rate": 0.0008704266491434787, + "loss": 0.171, + "step": 405 + }, + { + "epoch": 0.72, + "grad_norm": 0.2720576226711273, + "learning_rate": 0.0008697998621382607, + "loss": 0.144, + "step": 406 + }, + { + "epoch": 0.72, + "grad_norm": 0.10961242765188217, + "learning_rate": 0.000869171789649402, + "loss": 0.1349, + "step": 407 + }, + { + "epoch": 0.72, + "grad_norm": 0.13584192097187042, + "learning_rate": 0.0008685424338601833, + "loss": 0.1385, + "step": 408 + }, + { + "epoch": 0.72, + "grad_norm": 0.6586437821388245, + "learning_rate": 0.0008679117969583464, + "loss": 0.1459, + "step": 409 + }, + { + "epoch": 0.73, + "grad_norm": 0.24006032943725586, + "learning_rate": 0.0008672798811360864, + "loss": 0.1344, + "step": 410 + }, + { + "epoch": 0.73, + "grad_norm": 0.1859387755393982, + "learning_rate": 0.0008666466885900438, + "loss": 0.1358, + "step": 411 + }, + { + "epoch": 0.73, + "grad_norm": 0.5095134973526001, + "learning_rate": 0.0008660122215212977, + "loss": 0.1387, + "step": 412 + }, + { + "epoch": 0.73, + "grad_norm": 0.1827729493379593, + "learning_rate": 0.0008653764821353573, + "loss": 0.1377, + "step": 413 + }, + { + "epoch": 0.73, + "grad_norm": 0.14332665503025055, + "learning_rate": 0.0008647394726421547, + "loss": 0.131, + "step": 414 + }, + { + "epoch": 0.73, + "grad_norm": 0.383101224899292, + "learning_rate": 0.0008641011952560371, + "loss": 0.146, + "step": 415 + }, + { + "epoch": 0.74, + "grad_norm": 0.19079791009426117, + "learning_rate": 0.000863461652195759, + "loss": 0.1255, + "step": 416 + }, + { + "epoch": 0.74, + "grad_norm": 0.49537310004234314, + "learning_rate": 0.0008628208456844747, + "loss": 0.1602, + "step": 417 + }, + { + "epoch": 0.74, + "grad_norm": 0.5658069849014282, + "learning_rate": 0.0008621787779497306, + "loss": 0.1518, + "step": 418 + }, + { + "epoch": 0.74, + "grad_norm": 0.2572256326675415, + "learning_rate": 0.0008615354512234569, + "loss": 0.1369, + "step": 419 + }, + { + "epoch": 0.74, + "grad_norm": 1.1088945865631104, + "learning_rate": 0.0008608908677419605, + "loss": 0.1773, + "step": 420 + }, + { + "epoch": 0.74, + "grad_norm": 0.35405099391937256, + "learning_rate": 0.0008602450297459173, + "loss": 0.1441, + "step": 421 + }, + { + "epoch": 0.75, + "grad_norm": 0.39150556921958923, + "learning_rate": 0.0008595979394803633, + "loss": 0.147, + "step": 422 + }, + { + "epoch": 0.75, + "grad_norm": 0.07459918409585953, + "learning_rate": 0.0008589495991946885, + "loss": 0.1338, + "step": 423 + }, + { + "epoch": 0.75, + "grad_norm": 0.2999761402606964, + "learning_rate": 0.0008583000111426276, + "loss": 0.1357, + "step": 424 + }, + { + "epoch": 0.75, + "grad_norm": 0.28417065739631653, + "learning_rate": 0.0008576491775822525, + "loss": 0.1411, + "step": 425 + }, + { + "epoch": 0.75, + "grad_norm": 0.32605019211769104, + "learning_rate": 0.0008569971007759657, + "loss": 0.1329, + "step": 426 + }, + { + "epoch": 0.75, + "eval_loss": 0.13750587403774261, + "eval_runtime": 15.1749, + "eval_samples_per_second": 31.433, + "eval_steps_per_second": 7.908, + "step": 426 + }, + { + "epoch": 0.76, + "grad_norm": 0.047430120408535004, + "learning_rate": 0.0008563437829904903, + "loss": 0.1373, + "step": 427 + }, + { + "epoch": 0.76, + "grad_norm": 0.4616542160511017, + "learning_rate": 0.0008556892264968639, + "loss": 0.1534, + "step": 428 + }, + { + "epoch": 0.76, + "grad_norm": 0.12317585945129395, + "learning_rate": 0.0008550334335704297, + "loss": 0.1338, + "step": 429 + }, + { + "epoch": 0.76, + "grad_norm": 0.39604276418685913, + "learning_rate": 0.0008543764064908295, + "loss": 0.1434, + "step": 430 + }, + { + "epoch": 0.76, + "grad_norm": 0.3490678369998932, + "learning_rate": 0.0008537181475419944, + "loss": 0.1365, + "step": 431 + }, + { + "epoch": 0.76, + "grad_norm": 0.15001270174980164, + "learning_rate": 0.0008530586590121383, + "loss": 0.1358, + "step": 432 + }, + { + "epoch": 0.77, + "grad_norm": 0.33340635895729065, + "learning_rate": 0.0008523979431937492, + "loss": 0.1367, + "step": 433 + }, + { + "epoch": 0.77, + "grad_norm": 0.06029750779271126, + "learning_rate": 0.0008517360023835809, + "loss": 0.1366, + "step": 434 + }, + { + "epoch": 0.77, + "grad_norm": 0.07978738099336624, + "learning_rate": 0.0008510728388826463, + "loss": 0.1345, + "step": 435 + }, + { + "epoch": 0.77, + "grad_norm": 0.27599036693573, + "learning_rate": 0.0008504084549962079, + "loss": 0.1447, + "step": 436 + }, + { + "epoch": 0.77, + "grad_norm": 0.13302059471607208, + "learning_rate": 0.0008497428530337706, + "loss": 0.1407, + "step": 437 + }, + { + "epoch": 0.77, + "grad_norm": 0.20869582891464233, + "learning_rate": 0.0008490760353090737, + "loss": 0.1374, + "step": 438 + }, + { + "epoch": 0.78, + "grad_norm": 0.10881117731332779, + "learning_rate": 0.0008484080041400825, + "loss": 0.1429, + "step": 439 + }, + { + "epoch": 0.78, + "grad_norm": 0.20344361662864685, + "learning_rate": 0.0008477387618489807, + "loss": 0.139, + "step": 440 + }, + { + "epoch": 0.78, + "grad_norm": 0.07153432071208954, + "learning_rate": 0.0008470683107621615, + "loss": 0.1315, + "step": 441 + }, + { + "epoch": 0.78, + "grad_norm": 0.08688751608133316, + "learning_rate": 0.0008463966532102207, + "loss": 0.1346, + "step": 442 + }, + { + "epoch": 0.78, + "grad_norm": 0.06495650112628937, + "learning_rate": 0.0008457237915279476, + "loss": 0.1307, + "step": 443 + }, + { + "epoch": 0.79, + "grad_norm": 0.1892390102148056, + "learning_rate": 0.0008450497280543173, + "loss": 0.12, + "step": 444 + }, + { + "epoch": 0.79, + "grad_norm": 0.2579623758792877, + "learning_rate": 0.0008443744651324827, + "loss": 0.1531, + "step": 445 + }, + { + "epoch": 0.79, + "grad_norm": 0.149379700422287, + "learning_rate": 0.000843698005109766, + "loss": 0.1385, + "step": 446 + }, + { + "epoch": 0.79, + "grad_norm": 0.19281132519245148, + "learning_rate": 0.0008430203503376506, + "loss": 0.1033, + "step": 447 + }, + { + "epoch": 0.79, + "grad_norm": 0.33208444714546204, + "learning_rate": 0.0008423415031717733, + "loss": 0.1525, + "step": 448 + }, + { + "epoch": 0.79, + "grad_norm": 0.15149784088134766, + "learning_rate": 0.0008416614659719157, + "loss": 0.1282, + "step": 449 + }, + { + "epoch": 0.8, + "grad_norm": 0.24646438658237457, + "learning_rate": 0.0008409802411019962, + "loss": 0.1393, + "step": 450 + }, + { + "epoch": 0.8, + "grad_norm": 0.2505553662776947, + "learning_rate": 0.000840297830930062, + "loss": 0.1453, + "step": 451 + }, + { + "epoch": 0.8, + "grad_norm": 0.1632508784532547, + "learning_rate": 0.0008396142378282799, + "loss": 0.1274, + "step": 452 + }, + { + "epoch": 0.8, + "grad_norm": 0.12370573729276657, + "learning_rate": 0.0008389294641729292, + "loss": 0.1201, + "step": 453 + }, + { + "epoch": 0.8, + "grad_norm": 0.08046772330999374, + "learning_rate": 0.0008382435123443934, + "loss": 0.1263, + "step": 454 + }, + { + "epoch": 0.8, + "grad_norm": 0.19015488028526306, + "learning_rate": 0.0008375563847271506, + "loss": 0.1318, + "step": 455 + }, + { + "epoch": 0.81, + "grad_norm": 0.3562954366207123, + "learning_rate": 0.0008368680837097669, + "loss": 0.132, + "step": 456 + }, + { + "epoch": 0.81, + "grad_norm": 0.06315189599990845, + "learning_rate": 0.000836178611684887, + "loss": 0.1113, + "step": 457 + }, + { + "epoch": 0.81, + "grad_norm": 0.43667125701904297, + "learning_rate": 0.0008354879710492264, + "loss": 0.1908, + "step": 458 + }, + { + "epoch": 0.81, + "grad_norm": 0.0708879753947258, + "learning_rate": 0.0008347961642035624, + "loss": 0.1399, + "step": 459 + }, + { + "epoch": 0.81, + "grad_norm": 0.04855835437774658, + "learning_rate": 0.0008341031935527267, + "loss": 0.1258, + "step": 460 + }, + { + "epoch": 0.82, + "grad_norm": 0.1364990919828415, + "learning_rate": 0.0008334090615055965, + "loss": 0.1344, + "step": 461 + }, + { + "epoch": 0.82, + "grad_norm": 0.08166524022817612, + "learning_rate": 0.0008327137704750862, + "loss": 0.134, + "step": 462 + }, + { + "epoch": 0.82, + "grad_norm": 0.09308458864688873, + "learning_rate": 0.0008320173228781389, + "loss": 0.1507, + "step": 463 + }, + { + "epoch": 0.82, + "grad_norm": 0.07796576619148254, + "learning_rate": 0.000831319721135718, + "loss": 0.1284, + "step": 464 + }, + { + "epoch": 0.82, + "grad_norm": 0.12168626487255096, + "learning_rate": 0.0008306209676727993, + "loss": 0.148, + "step": 465 + }, + { + "epoch": 0.82, + "grad_norm": 0.18862847983837128, + "learning_rate": 0.000829921064918362, + "loss": 0.1229, + "step": 466 + }, + { + "epoch": 0.83, + "grad_norm": 0.23615515232086182, + "learning_rate": 0.00082922001530538, + "loss": 0.1322, + "step": 467 + }, + { + "epoch": 0.83, + "grad_norm": 0.34108766913414, + "learning_rate": 0.0008285178212708142, + "loss": 0.1338, + "step": 468 + }, + { + "epoch": 0.83, + "grad_norm": 0.39579400420188904, + "learning_rate": 0.0008278144852556042, + "loss": 0.1341, + "step": 469 + }, + { + "epoch": 0.83, + "grad_norm": 0.2620592713356018, + "learning_rate": 0.0008271100097046585, + "loss": 0.1395, + "step": 470 + }, + { + "epoch": 0.83, + "grad_norm": 0.08778171986341476, + "learning_rate": 0.0008264043970668469, + "loss": 0.1328, + "step": 471 + }, + { + "epoch": 0.84, + "grad_norm": 0.6086364388465881, + "learning_rate": 0.0008256976497949924, + "loss": 0.1271, + "step": 472 + }, + { + "epoch": 0.84, + "grad_norm": 0.08982394635677338, + "learning_rate": 0.0008249897703458619, + "loss": 0.1346, + "step": 473 + }, + { + "epoch": 0.84, + "grad_norm": 0.054080091416835785, + "learning_rate": 0.0008242807611801578, + "loss": 0.1218, + "step": 474 + }, + { + "epoch": 0.84, + "grad_norm": 0.5981457829475403, + "learning_rate": 0.0008235706247625098, + "loss": 0.1715, + "step": 475 + }, + { + "epoch": 0.84, + "grad_norm": 0.9139420986175537, + "learning_rate": 0.0008228593635614659, + "loss": 0.1983, + "step": 476 + }, + { + "epoch": 0.84, + "grad_norm": 0.05938498303294182, + "learning_rate": 0.0008221469800494841, + "loss": 0.1308, + "step": 477 + }, + { + "epoch": 0.85, + "grad_norm": 0.11526026576757431, + "learning_rate": 0.0008214334767029239, + "loss": 0.1422, + "step": 478 + }, + { + "epoch": 0.85, + "grad_norm": 0.3049907386302948, + "learning_rate": 0.0008207188560020373, + "loss": 0.1419, + "step": 479 + }, + { + "epoch": 0.85, + "grad_norm": 0.04782035946846008, + "learning_rate": 0.0008200031204309604, + "loss": 0.138, + "step": 480 + }, + { + "epoch": 0.85, + "grad_norm": 0.12950918078422546, + "learning_rate": 0.000819286272477705, + "loss": 0.1315, + "step": 481 + }, + { + "epoch": 0.85, + "grad_norm": 0.0429329015314579, + "learning_rate": 0.0008185683146341496, + "loss": 0.1354, + "step": 482 + }, + { + "epoch": 0.85, + "grad_norm": 0.4792588949203491, + "learning_rate": 0.0008178492493960308, + "loss": 0.1476, + "step": 483 + }, + { + "epoch": 0.86, + "grad_norm": 0.19784927368164062, + "learning_rate": 0.0008171290792629346, + "loss": 0.1394, + "step": 484 + }, + { + "epoch": 0.86, + "grad_norm": 0.1172945499420166, + "learning_rate": 0.000816407806738288, + "loss": 0.1302, + "step": 485 + }, + { + "epoch": 0.86, + "grad_norm": 0.3732689917087555, + "learning_rate": 0.0008156854343293501, + "loss": 0.1416, + "step": 486 + }, + { + "epoch": 0.86, + "grad_norm": 0.5152392983436584, + "learning_rate": 0.0008149619645472031, + "loss": 0.1403, + "step": 487 + }, + { + "epoch": 0.86, + "grad_norm": 0.15429601073265076, + "learning_rate": 0.000814237399906744, + "loss": 0.1322, + "step": 488 + }, + { + "epoch": 0.87, + "grad_norm": 1.0002127885818481, + "learning_rate": 0.0008135117429266756, + "loss": 0.1303, + "step": 489 + }, + { + "epoch": 0.87, + "grad_norm": 0.7232715487480164, + "learning_rate": 0.0008127849961294984, + "loss": 0.143, + "step": 490 + }, + { + "epoch": 0.87, + "grad_norm": 0.13510456681251526, + "learning_rate": 0.0008120571620415006, + "loss": 0.1536, + "step": 491 + }, + { + "epoch": 0.87, + "grad_norm": 0.5168789625167847, + "learning_rate": 0.0008113282431927503, + "loss": 0.1312, + "step": 492 + }, + { + "epoch": 0.87, + "grad_norm": 0.7039850950241089, + "learning_rate": 0.000810598242117086, + "loss": 0.118, + "step": 493 + }, + { + "epoch": 0.87, + "grad_norm": 1.5126641988754272, + "learning_rate": 0.0008098671613521089, + "loss": 0.2343, + "step": 494 + }, + { + "epoch": 0.88, + "grad_norm": 0.6958308815956116, + "learning_rate": 0.0008091350034391731, + "loss": 0.1648, + "step": 495 + }, + { + "epoch": 0.88, + "grad_norm": 6.979303359985352, + "learning_rate": 0.0008084017709233766, + "loss": 0.2261, + "step": 496 + }, + { + "epoch": 0.88, + "grad_norm": 0.3389752507209778, + "learning_rate": 0.0008076674663535537, + "loss": 0.146, + "step": 497 + }, + { + "epoch": 0.88, + "grad_norm": 0.19990071654319763, + "learning_rate": 0.0008069320922822643, + "loss": 0.1429, + "step": 498 + }, + { + "epoch": 0.88, + "grad_norm": 0.33689868450164795, + "learning_rate": 0.0008061956512657871, + "loss": 0.147, + "step": 499 + }, + { + "epoch": 0.88, + "grad_norm": 0.09925112873315811, + "learning_rate": 0.000805458145864109, + "loss": 0.1342, + "step": 500 + }, + { + "epoch": 0.89, + "grad_norm": 1.961702585220337, + "learning_rate": 0.0008047195786409172, + "loss": 0.1361, + "step": 501 + }, + { + "epoch": 0.89, + "grad_norm": 0.4342229962348938, + "learning_rate": 0.0008039799521635895, + "loss": 0.1485, + "step": 502 + }, + { + "epoch": 0.89, + "grad_norm": 0.1798858642578125, + "learning_rate": 0.0008032392690031867, + "loss": 0.1314, + "step": 503 + }, + { + "epoch": 0.89, + "grad_norm": 1.3653756380081177, + "learning_rate": 0.0008024975317344421, + "loss": 0.1388, + "step": 504 + }, + { + "epoch": 0.89, + "grad_norm": 9.677605628967285, + "learning_rate": 0.0008017547429357531, + "loss": 0.4186, + "step": 505 + }, + { + "epoch": 0.9, + "grad_norm": 8.348475456237793, + "learning_rate": 0.0008010109051891731, + "loss": 0.3806, + "step": 506 + }, + { + "epoch": 0.9, + "grad_norm": 35.19770050048828, + "learning_rate": 0.0008002660210804011, + "loss": 3.6145, + "step": 507 + }, + { + "epoch": 0.9, + "grad_norm": 9.18663501739502, + "learning_rate": 0.0007995200931987743, + "loss": 0.6162, + "step": 508 + }, + { + "epoch": 0.9, + "grad_norm": 0.05997322499752045, + "learning_rate": 0.0007987731241372571, + "loss": 0.1129, + "step": 509 + }, + { + "epoch": 0.9, + "grad_norm": 0.41408172249794006, + "learning_rate": 0.000798025116492434, + "loss": 0.1512, + "step": 510 + }, + { + "epoch": 0.9, + "grad_norm": 0.4445393979549408, + "learning_rate": 0.0007972760728644996, + "loss": 0.1463, + "step": 511 + }, + { + "epoch": 0.91, + "grad_norm": 0.19678063690662384, + "learning_rate": 0.0007965259958572495, + "loss": 0.1386, + "step": 512 + }, + { + "epoch": 0.91, + "grad_norm": 0.45497119426727295, + "learning_rate": 0.0007957748880780721, + "loss": 0.1373, + "step": 513 + }, + { + "epoch": 0.91, + "grad_norm": 0.6455509066581726, + "learning_rate": 0.0007950227521379381, + "loss": 0.1584, + "step": 514 + }, + { + "epoch": 0.91, + "grad_norm": 0.3793765604496002, + "learning_rate": 0.0007942695906513929, + "loss": 0.1236, + "step": 515 + }, + { + "epoch": 0.91, + "grad_norm": 0.20562775433063507, + "learning_rate": 0.0007935154062365467, + "loss": 0.1364, + "step": 516 + }, + { + "epoch": 0.91, + "grad_norm": 1.3131325244903564, + "learning_rate": 0.0007927602015150655, + "loss": 0.1556, + "step": 517 + }, + { + "epoch": 0.92, + "grad_norm": 0.1705670803785324, + "learning_rate": 0.0007920039791121617, + "loss": 0.1372, + "step": 518 + }, + { + "epoch": 0.92, + "grad_norm": 6.6207499504089355, + "learning_rate": 0.0007912467416565861, + "loss": 0.22, + "step": 519 + }, + { + "epoch": 0.92, + "grad_norm": 0.34343230724334717, + "learning_rate": 0.0007904884917806173, + "loss": 0.1453, + "step": 520 + }, + { + "epoch": 0.92, + "grad_norm": 0.4290754497051239, + "learning_rate": 0.0007897292321200537, + "loss": 0.1177, + "step": 521 + }, + { + "epoch": 0.92, + "grad_norm": 0.24469922482967377, + "learning_rate": 0.0007889689653142036, + "loss": 0.1369, + "step": 522 + }, + { + "epoch": 0.93, + "grad_norm": 0.5307168960571289, + "learning_rate": 0.0007882076940058763, + "loss": 0.1542, + "step": 523 + }, + { + "epoch": 0.93, + "grad_norm": 0.13802866637706757, + "learning_rate": 0.000787445420841373, + "loss": 0.1372, + "step": 524 + }, + { + "epoch": 0.93, + "grad_norm": 0.36055922508239746, + "learning_rate": 0.0007866821484704776, + "loss": 0.1413, + "step": 525 + }, + { + "epoch": 0.93, + "grad_norm": 0.36655113101005554, + "learning_rate": 0.0007859178795464472, + "loss": 0.1438, + "step": 526 + }, + { + "epoch": 0.93, + "grad_norm": 0.6237390637397766, + "learning_rate": 0.0007851526167260034, + "loss": 0.1382, + "step": 527 + }, + { + "epoch": 0.93, + "grad_norm": 0.42217007279396057, + "learning_rate": 0.0007843863626693221, + "loss": 0.1408, + "step": 528 + }, + { + "epoch": 0.94, + "grad_norm": 24.023250579833984, + "learning_rate": 0.0007836191200400256, + "loss": 0.1517, + "step": 529 + }, + { + "epoch": 0.94, + "grad_norm": 0.31599146127700806, + "learning_rate": 0.0007828508915051723, + "loss": 0.1353, + "step": 530 + }, + { + "epoch": 0.94, + "grad_norm": 0.6795622706413269, + "learning_rate": 0.0007820816797352479, + "loss": 0.1515, + "step": 531 + }, + { + "epoch": 0.94, + "grad_norm": 0.37493640184402466, + "learning_rate": 0.0007813114874041557, + "loss": 0.141, + "step": 532 + }, + { + "epoch": 0.94, + "grad_norm": 1.7365546226501465, + "learning_rate": 0.0007805403171892079, + "loss": 0.1347, + "step": 533 + }, + { + "epoch": 0.94, + "grad_norm": 18.393390655517578, + "learning_rate": 0.000779768171771116, + "loss": 0.1753, + "step": 534 + }, + { + "epoch": 0.95, + "grad_norm": 2.2978413105010986, + "learning_rate": 0.0007789950538339812, + "loss": 0.1418, + "step": 535 + }, + { + "epoch": 0.95, + "grad_norm": 0.495151162147522, + "learning_rate": 0.0007782209660652854, + "loss": 0.146, + "step": 536 + }, + { + "epoch": 0.95, + "grad_norm": 7.705572605133057, + "learning_rate": 0.0007774459111558821, + "loss": 0.2042, + "step": 537 + }, + { + "epoch": 0.95, + "grad_norm": 0.6036086678504944, + "learning_rate": 0.0007766698917999862, + "loss": 0.1695, + "step": 538 + }, + { + "epoch": 0.95, + "grad_norm": 127.21215057373047, + "learning_rate": 0.0007758929106951656, + "loss": 18.5136, + "step": 539 + }, + { + "epoch": 0.96, + "grad_norm": 40.58448791503906, + "learning_rate": 0.0007751149705423312, + "loss": 0.5973, + "step": 540 + }, + { + "epoch": 0.96, + "grad_norm": 0.6296218633651733, + "learning_rate": 0.0007743360740457278, + "loss": 0.1849, + "step": 541 + }, + { + "epoch": 0.96, + "grad_norm": 0.4533160924911499, + "learning_rate": 0.0007735562239129247, + "loss": 0.1464, + "step": 542 + }, + { + "epoch": 0.96, + "grad_norm": 0.2379036247730255, + "learning_rate": 0.0007727754228548058, + "loss": 0.1267, + "step": 543 + }, + { + "epoch": 0.96, + "grad_norm": 0.8904889225959778, + "learning_rate": 0.000771993673585561, + "loss": 0.2181, + "step": 544 + }, + { + "epoch": 0.96, + "grad_norm": 0.8934443593025208, + "learning_rate": 0.0007712109788226762, + "loss": 0.2158, + "step": 545 + }, + { + "epoch": 0.97, + "grad_norm": 0.3368353545665741, + "learning_rate": 0.0007704273412869238, + "loss": 0.1489, + "step": 546 + }, + { + "epoch": 0.97, + "grad_norm": 0.2570180594921112, + "learning_rate": 0.0007696427637023537, + "loss": 0.144, + "step": 547 + }, + { + "epoch": 0.97, + "grad_norm": 2.865034580230713, + "learning_rate": 0.0007688572487962834, + "loss": 0.1664, + "step": 548 + }, + { + "epoch": 0.97, + "grad_norm": 0.4369525611400604, + "learning_rate": 0.0007680707992992888, + "loss": 0.1777, + "step": 549 + }, + { + "epoch": 0.97, + "grad_norm": 0.2545509934425354, + "learning_rate": 0.0007672834179451942, + "loss": 0.1536, + "step": 550 + }, + { + "epoch": 0.97, + "grad_norm": 0.14455465972423553, + "learning_rate": 0.0007664951074710638, + "loss": 0.1256, + "step": 551 + }, + { + "epoch": 0.98, + "grad_norm": 0.16001886129379272, + "learning_rate": 0.0007657058706171911, + "loss": 0.1356, + "step": 552 + }, + { + "epoch": 0.98, + "grad_norm": 0.2537885308265686, + "learning_rate": 0.0007649157101270903, + "loss": 0.1393, + "step": 553 + }, + { + "epoch": 0.98, + "grad_norm": 0.33060047030448914, + "learning_rate": 0.0007641246287474854, + "loss": 0.148, + "step": 554 + }, + { + "epoch": 0.98, + "grad_norm": 1.691941499710083, + "learning_rate": 0.0007633326292283028, + "loss": 0.1764, + "step": 555 + }, + { + "epoch": 0.98, + "grad_norm": 0.20472805202007294, + "learning_rate": 0.0007625397143226595, + "loss": 0.1424, + "step": 556 + }, + { + "epoch": 0.99, + "grad_norm": 0.7124485969543457, + "learning_rate": 0.0007617458867868553, + "loss": 0.1482, + "step": 557 + }, + { + "epoch": 0.99, + "grad_norm": 0.09631184488534927, + "learning_rate": 0.0007609511493803615, + "loss": 0.1392, + "step": 558 + }, + { + "epoch": 0.99, + "grad_norm": 0.20814809203147888, + "learning_rate": 0.0007601555048658133, + "loss": 0.1384, + "step": 559 + }, + { + "epoch": 0.99, + "grad_norm": 0.19566737115383148, + "learning_rate": 0.0007593589560089984, + "loss": 0.1394, + "step": 560 + }, + { + "epoch": 0.99, + "grad_norm": 0.13406091928482056, + "learning_rate": 0.0007585615055788484, + "loss": 0.1389, + "step": 561 + }, + { + "epoch": 0.99, + "grad_norm": 0.07635807991027832, + "learning_rate": 0.0007577631563474291, + "loss": 0.1376, + "step": 562 + }, + { + "epoch": 1.0, + "grad_norm": 0.11265091598033905, + "learning_rate": 0.0007569639110899302, + "loss": 0.1395, + "step": 563 + }, + { + "epoch": 1.0, + "grad_norm": 0.31152746081352234, + "learning_rate": 0.0007561637725846567, + "loss": 0.1407, + "step": 564 + }, + { + "epoch": 1.0, + "grad_norm": 0.13474373519420624, + "learning_rate": 0.0007553627436130183, + "loss": 0.1386, + "step": 565 + }, + { + "epoch": 1.0, + "grad_norm": 0.23706336319446564, + "learning_rate": 0.0007545608269595201, + "loss": 0.1417, + "step": 566 + }, + { + "epoch": 1.0, + "grad_norm": 0.30558836460113525, + "learning_rate": 0.0007537580254117531, + "loss": 0.1452, + "step": 567 + }, + { + "epoch": 1.0, + "grad_norm": 0.106146439909935, + "learning_rate": 0.0007529543417603843, + "loss": 0.1372, + "step": 568 + }, + { + "epoch": 1.0, + "eval_loss": 0.13968442380428314, + "eval_runtime": 15.1558, + "eval_samples_per_second": 31.473, + "eval_steps_per_second": 7.918, + "step": 568 + }, + { + "epoch": 1.01, + "grad_norm": 0.3243511915206909, + "learning_rate": 0.0007521497787991472, + "loss": 0.1424, + "step": 569 + }, + { + "epoch": 1.01, + "grad_norm": 0.19688986241817474, + "learning_rate": 0.0007513443393248312, + "loss": 0.1403, + "step": 570 + }, + { + "epoch": 1.01, + "grad_norm": 0.1128445565700531, + "learning_rate": 0.0007505380261372734, + "loss": 0.1397, + "step": 571 + }, + { + "epoch": 1.01, + "grad_norm": 0.11025507003068924, + "learning_rate": 0.0007497308420393477, + "loss": 0.1391, + "step": 572 + }, + { + "epoch": 1.01, + "grad_norm": 0.19862700998783112, + "learning_rate": 0.0007489227898369558, + "loss": 0.1345, + "step": 573 + }, + { + "epoch": 1.02, + "grad_norm": 0.11129032075405121, + "learning_rate": 0.0007481138723390164, + "loss": 0.1342, + "step": 574 + }, + { + "epoch": 1.02, + "grad_norm": 0.21451863646507263, + "learning_rate": 0.0007473040923574567, + "loss": 0.132, + "step": 575 + }, + { + "epoch": 1.02, + "grad_norm": 0.6781334280967712, + "learning_rate": 0.0007464934527072016, + "loss": 0.1688, + "step": 576 + }, + { + "epoch": 1.02, + "grad_norm": 0.3881673812866211, + "learning_rate": 0.0007456819562061648, + "loss": 0.15, + "step": 577 + }, + { + "epoch": 1.02, + "grad_norm": 0.0530267171561718, + "learning_rate": 0.0007448696056752383, + "loss": 0.139, + "step": 578 + }, + { + "epoch": 1.02, + "grad_norm": 0.2782767415046692, + "learning_rate": 0.0007440564039382827, + "loss": 0.1334, + "step": 579 + }, + { + "epoch": 1.03, + "grad_norm": 0.693821370601654, + "learning_rate": 0.0007432423538221178, + "loss": 0.1673, + "step": 580 + }, + { + "epoch": 1.03, + "grad_norm": 0.27020275592803955, + "learning_rate": 0.0007424274581565122, + "loss": 0.1464, + "step": 581 + }, + { + "epoch": 1.03, + "grad_norm": 0.42129820585250854, + "learning_rate": 0.0007416117197741742, + "loss": 0.1507, + "step": 582 + }, + { + "epoch": 1.03, + "grad_norm": 0.21161474287509918, + "learning_rate": 0.0007407951415107412, + "loss": 0.1398, + "step": 583 + }, + { + "epoch": 1.03, + "grad_norm": 0.13954728841781616, + "learning_rate": 0.00073997772620477, + "loss": 0.1387, + "step": 584 + }, + { + "epoch": 1.03, + "grad_norm": 0.08771730959415436, + "learning_rate": 0.0007391594766977276, + "loss": 0.1419, + "step": 585 + }, + { + "epoch": 1.04, + "grad_norm": 0.2630119025707245, + "learning_rate": 0.0007383403958339806, + "loss": 0.1483, + "step": 586 + }, + { + "epoch": 1.04, + "grad_norm": 0.07496945559978485, + "learning_rate": 0.0007375204864607851, + "loss": 0.1339, + "step": 587 + }, + { + "epoch": 1.04, + "grad_norm": 0.25115033984184265, + "learning_rate": 0.0007366997514282782, + "loss": 0.129, + "step": 588 + }, + { + "epoch": 1.04, + "grad_norm": 0.24599210917949677, + "learning_rate": 0.0007358781935894659, + "loss": 0.1491, + "step": 589 + }, + { + "epoch": 1.04, + "grad_norm": 0.14762777090072632, + "learning_rate": 0.0007350558158002153, + "loss": 0.1287, + "step": 590 + }, + { + "epoch": 1.05, + "grad_norm": 0.03114377148449421, + "learning_rate": 0.0007342326209192435, + "loss": 0.1379, + "step": 591 + }, + { + "epoch": 1.05, + "grad_norm": 0.5076407194137573, + "learning_rate": 0.000733408611808108, + "loss": 0.1122, + "step": 592 + }, + { + "epoch": 1.05, + "grad_norm": 0.10492309182882309, + "learning_rate": 0.0007325837913311966, + "loss": 0.1284, + "step": 593 + }, + { + "epoch": 1.05, + "grad_norm": 0.1740669459104538, + "learning_rate": 0.0007317581623557177, + "loss": 0.1458, + "step": 594 + }, + { + "epoch": 1.05, + "grad_norm": 0.20419681072235107, + "learning_rate": 0.00073093172775169, + "loss": 0.1511, + "step": 595 + }, + { + "epoch": 1.05, + "grad_norm": 0.1906755119562149, + "learning_rate": 0.0007301044903919325, + "loss": 0.1052, + "step": 596 + }, + { + "epoch": 1.06, + "grad_norm": 0.10201478004455566, + "learning_rate": 0.0007292764531520552, + "loss": 0.1147, + "step": 597 + }, + { + "epoch": 1.06, + "grad_norm": 0.4594266712665558, + "learning_rate": 0.0007284476189104485, + "loss": 0.1739, + "step": 598 + }, + { + "epoch": 1.06, + "grad_norm": 0.1697234809398651, + "learning_rate": 0.0007276179905482729, + "loss": 0.1049, + "step": 599 + }, + { + "epoch": 1.06, + "grad_norm": 0.09107261896133423, + "learning_rate": 0.0007267875709494499, + "loss": 0.1319, + "step": 600 + }, + { + "epoch": 1.06, + "grad_norm": 0.07888934016227722, + "learning_rate": 0.0007259563630006512, + "loss": 0.1323, + "step": 601 + }, + { + "epoch": 1.07, + "grad_norm": 0.06149132549762726, + "learning_rate": 0.0007251243695912886, + "loss": 0.1239, + "step": 602 + }, + { + "epoch": 1.07, + "grad_norm": 0.3240460157394409, + "learning_rate": 0.0007242915936135052, + "loss": 0.1614, + "step": 603 + }, + { + "epoch": 1.07, + "grad_norm": 0.04239710047841072, + "learning_rate": 0.0007234580379621635, + "loss": 0.1336, + "step": 604 + }, + { + "epoch": 1.07, + "grad_norm": 0.04415787383913994, + "learning_rate": 0.000722623705534837, + "loss": 0.1336, + "step": 605 + }, + { + "epoch": 1.07, + "grad_norm": 0.13500471413135529, + "learning_rate": 0.0007217885992317985, + "loss": 0.1429, + "step": 606 + }, + { + "epoch": 1.07, + "grad_norm": 0.09405327588319778, + "learning_rate": 0.0007209527219560119, + "loss": 0.1399, + "step": 607 + }, + { + "epoch": 1.08, + "grad_norm": 0.16369308531284332, + "learning_rate": 0.0007201160766131207, + "loss": 0.1389, + "step": 608 + }, + { + "epoch": 1.08, + "grad_norm": 0.24509336054325104, + "learning_rate": 0.0007192786661114383, + "loss": 0.1376, + "step": 609 + }, + { + "epoch": 1.08, + "grad_norm": 0.29961100220680237, + "learning_rate": 0.0007184404933619377, + "loss": 0.133, + "step": 610 + }, + { + "epoch": 1.08, + "grad_norm": 0.4641360640525818, + "learning_rate": 0.0007176015612782421, + "loss": 0.1491, + "step": 611 + }, + { + "epoch": 1.08, + "grad_norm": 0.059663355350494385, + "learning_rate": 0.0007167618727766138, + "loss": 0.1365, + "step": 612 + }, + { + "epoch": 1.08, + "grad_norm": 0.16221192479133606, + "learning_rate": 0.0007159214307759448, + "loss": 0.1395, + "step": 613 + }, + { + "epoch": 1.09, + "grad_norm": 0.04930780455470085, + "learning_rate": 0.0007150802381977463, + "loss": 0.1368, + "step": 614 + }, + { + "epoch": 1.09, + "grad_norm": 0.6152715086936951, + "learning_rate": 0.0007142382979661386, + "loss": 0.1291, + "step": 615 + }, + { + "epoch": 1.09, + "grad_norm": 0.15247471630573273, + "learning_rate": 0.0007133956130078411, + "loss": 0.1404, + "step": 616 + }, + { + "epoch": 1.09, + "grad_norm": 0.7167736887931824, + "learning_rate": 0.000712552186252162, + "loss": 0.1642, + "step": 617 + }, + { + "epoch": 1.09, + "grad_norm": 0.2419363558292389, + "learning_rate": 0.0007117080206309878, + "loss": 0.1317, + "step": 618 + }, + { + "epoch": 1.1, + "grad_norm": 0.5636677742004395, + "learning_rate": 0.0007108631190787735, + "loss": 0.147, + "step": 619 + }, + { + "epoch": 1.1, + "grad_norm": 0.26012521982192993, + "learning_rate": 0.0007100174845325327, + "loss": 0.1344, + "step": 620 + }, + { + "epoch": 1.1, + "grad_norm": 0.14845141768455505, + "learning_rate": 0.0007091711199318265, + "loss": 0.1299, + "step": 621 + }, + { + "epoch": 1.1, + "grad_norm": 0.1344316601753235, + "learning_rate": 0.0007083240282187542, + "loss": 0.1401, + "step": 622 + }, + { + "epoch": 1.1, + "grad_norm": 0.08974921703338623, + "learning_rate": 0.0007074762123379423, + "loss": 0.1257, + "step": 623 + }, + { + "epoch": 1.1, + "grad_norm": 0.3263636529445648, + "learning_rate": 0.0007066276752365351, + "loss": 0.1587, + "step": 624 + }, + { + "epoch": 1.11, + "grad_norm": 0.12418147176504135, + "learning_rate": 0.0007057784198641835, + "loss": 0.1361, + "step": 625 + }, + { + "epoch": 1.11, + "grad_norm": 0.3086402714252472, + "learning_rate": 0.0007049284491730353, + "loss": 0.1496, + "step": 626 + }, + { + "epoch": 1.11, + "grad_norm": 0.3529713749885559, + "learning_rate": 0.000704077766117725, + "loss": 0.1523, + "step": 627 + }, + { + "epoch": 1.11, + "grad_norm": 0.11717434972524643, + "learning_rate": 0.0007032263736553634, + "loss": 0.1402, + "step": 628 + }, + { + "epoch": 1.11, + "grad_norm": 0.19153334200382233, + "learning_rate": 0.0007023742747455275, + "loss": 0.1407, + "step": 629 + }, + { + "epoch": 1.11, + "grad_norm": 0.22798961400985718, + "learning_rate": 0.0007015214723502495, + "loss": 0.1262, + "step": 630 + }, + { + "epoch": 1.12, + "grad_norm": 0.4415830373764038, + "learning_rate": 0.0007006679694340073, + "loss": 0.1494, + "step": 631 + }, + { + "epoch": 1.12, + "grad_norm": 0.554614782333374, + "learning_rate": 0.0006998137689637142, + "loss": 0.1324, + "step": 632 + }, + { + "epoch": 1.12, + "grad_norm": 0.32604262232780457, + "learning_rate": 0.0006989588739087078, + "loss": 0.1433, + "step": 633 + }, + { + "epoch": 1.12, + "grad_norm": 0.5145484209060669, + "learning_rate": 0.0006981032872407406, + "loss": 0.152, + "step": 634 + }, + { + "epoch": 1.12, + "grad_norm": 0.3538295030593872, + "learning_rate": 0.0006972470119339691, + "loss": 0.137, + "step": 635 + }, + { + "epoch": 1.13, + "grad_norm": 0.2451559156179428, + "learning_rate": 0.0006963900509649435, + "loss": 0.1418, + "step": 636 + }, + { + "epoch": 1.13, + "grad_norm": 0.2240092009305954, + "learning_rate": 0.0006955324073125978, + "loss": 0.1406, + "step": 637 + }, + { + "epoch": 1.13, + "grad_norm": 0.5672935843467712, + "learning_rate": 0.0006946740839582387, + "loss": 0.1682, + "step": 638 + }, + { + "epoch": 1.13, + "grad_norm": 0.3396548628807068, + "learning_rate": 0.000693815083885536, + "loss": 0.1538, + "step": 639 + }, + { + "epoch": 1.13, + "grad_norm": 0.2464788407087326, + "learning_rate": 0.0006929554100805117, + "loss": 0.145, + "step": 640 + }, + { + "epoch": 1.13, + "grad_norm": 0.08380208164453506, + "learning_rate": 0.0006920950655315297, + "loss": 0.1333, + "step": 641 + }, + { + "epoch": 1.14, + "grad_norm": 0.04563472419977188, + "learning_rate": 0.000691234053229286, + "loss": 0.1371, + "step": 642 + }, + { + "epoch": 1.14, + "grad_norm": 0.0336502380669117, + "learning_rate": 0.0006903723761667972, + "loss": 0.1383, + "step": 643 + }, + { + "epoch": 1.14, + "grad_norm": 0.11504160612821579, + "learning_rate": 0.0006895100373393912, + "loss": 0.1366, + "step": 644 + }, + { + "epoch": 1.14, + "grad_norm": 0.4302406311035156, + "learning_rate": 0.0006886470397446957, + "loss": 0.1464, + "step": 645 + }, + { + "epoch": 1.14, + "grad_norm": 0.13670873641967773, + "learning_rate": 0.0006877833863826295, + "loss": 0.1399, + "step": 646 + }, + { + "epoch": 1.14, + "grad_norm": 0.11441440135240555, + "learning_rate": 0.0006869190802553894, + "loss": 0.1389, + "step": 647 + }, + { + "epoch": 1.15, + "grad_norm": 0.07245034724473953, + "learning_rate": 0.0006860541243674426, + "loss": 0.1376, + "step": 648 + }, + { + "epoch": 1.15, + "grad_norm": 0.12628068029880524, + "learning_rate": 0.0006851885217255144, + "loss": 0.1314, + "step": 649 + }, + { + "epoch": 1.15, + "grad_norm": 0.345865935087204, + "learning_rate": 0.0006843222753385784, + "loss": 0.1469, + "step": 650 + }, + { + "epoch": 1.15, + "grad_norm": 0.18721798062324524, + "learning_rate": 0.0006834553882178463, + "loss": 0.129, + "step": 651 + }, + { + "epoch": 1.15, + "grad_norm": 0.1566080003976822, + "learning_rate": 0.0006825878633767564, + "loss": 0.1296, + "step": 652 + }, + { + "epoch": 1.16, + "grad_norm": 0.13990430533885956, + "learning_rate": 0.0006817197038309643, + "loss": 0.1245, + "step": 653 + }, + { + "epoch": 1.16, + "grad_norm": 0.26073744893074036, + "learning_rate": 0.000680850912598332, + "loss": 0.1437, + "step": 654 + }, + { + "epoch": 1.16, + "grad_norm": 0.05034814029932022, + "learning_rate": 0.0006799814926989171, + "loss": 0.1209, + "step": 655 + }, + { + "epoch": 1.16, + "grad_norm": 0.29498428106307983, + "learning_rate": 0.0006791114471549626, + "loss": 0.1476, + "step": 656 + }, + { + "epoch": 1.16, + "grad_norm": 0.24109311401844025, + "learning_rate": 0.0006782407789908863, + "loss": 0.1421, + "step": 657 + }, + { + "epoch": 1.16, + "grad_norm": 0.2070060819387436, + "learning_rate": 0.0006773694912332707, + "loss": 0.1174, + "step": 658 + }, + { + "epoch": 1.17, + "grad_norm": 0.05099210515618324, + "learning_rate": 0.0006764975869108514, + "loss": 0.1325, + "step": 659 + }, + { + "epoch": 1.17, + "grad_norm": 0.03778371214866638, + "learning_rate": 0.0006756250690545078, + "loss": 0.1326, + "step": 660 + }, + { + "epoch": 1.17, + "grad_norm": 0.23074184358119965, + "learning_rate": 0.0006747519406972524, + "loss": 0.1417, + "step": 661 + }, + { + "epoch": 1.17, + "grad_norm": 0.162948340177536, + "learning_rate": 0.0006738782048742187, + "loss": 0.1422, + "step": 662 + }, + { + "epoch": 1.17, + "grad_norm": 0.1257455050945282, + "learning_rate": 0.0006730038646226531, + "loss": 0.1352, + "step": 663 + }, + { + "epoch": 1.17, + "grad_norm": 0.1732119917869568, + "learning_rate": 0.0006721289229819024, + "loss": 0.1313, + "step": 664 + }, + { + "epoch": 1.18, + "grad_norm": 0.15348908305168152, + "learning_rate": 0.0006712533829934043, + "loss": 0.139, + "step": 665 + }, + { + "epoch": 1.18, + "grad_norm": 0.06923094391822815, + "learning_rate": 0.0006703772477006757, + "loss": 0.1381, + "step": 666 + }, + { + "epoch": 1.18, + "grad_norm": 0.307449609041214, + "learning_rate": 0.0006695005201493037, + "loss": 0.1365, + "step": 667 + }, + { + "epoch": 1.18, + "grad_norm": 0.09788268059492111, + "learning_rate": 0.0006686232033869343, + "loss": 0.1358, + "step": 668 + }, + { + "epoch": 1.18, + "grad_norm": 0.23847998678684235, + "learning_rate": 0.0006677453004632608, + "loss": 0.1399, + "step": 669 + }, + { + "epoch": 1.19, + "grad_norm": 0.08175510168075562, + "learning_rate": 0.0006668668144300149, + "loss": 0.1372, + "step": 670 + }, + { + "epoch": 1.19, + "grad_norm": 0.18189309537410736, + "learning_rate": 0.0006659877483409545, + "loss": 0.1401, + "step": 671 + }, + { + "epoch": 1.19, + "grad_norm": 0.08665986359119415, + "learning_rate": 0.000665108105251855, + "loss": 0.1345, + "step": 672 + }, + { + "epoch": 1.19, + "grad_norm": 0.40454381704330444, + "learning_rate": 0.0006642278882204963, + "loss": 0.1342, + "step": 673 + }, + { + "epoch": 1.19, + "grad_norm": 0.27606263756752014, + "learning_rate": 0.0006633471003066543, + "loss": 0.1363, + "step": 674 + }, + { + "epoch": 1.19, + "grad_norm": 0.06796804070472717, + "learning_rate": 0.000662465744572089, + "loss": 0.1353, + "step": 675 + }, + { + "epoch": 1.2, + "grad_norm": 0.4458450376987457, + "learning_rate": 0.0006615838240805343, + "loss": 0.1521, + "step": 676 + }, + { + "epoch": 1.2, + "grad_norm": 0.3369523286819458, + "learning_rate": 0.0006607013418976873, + "loss": 0.1489, + "step": 677 + }, + { + "epoch": 1.2, + "grad_norm": 0.20170435309410095, + "learning_rate": 0.0006598183010911978, + "loss": 0.1263, + "step": 678 + }, + { + "epoch": 1.2, + "grad_norm": 0.11186213046312332, + "learning_rate": 0.0006589347047306571, + "loss": 0.1344, + "step": 679 + }, + { + "epoch": 1.2, + "grad_norm": 0.12327159941196442, + "learning_rate": 0.0006580505558875878, + "loss": 0.1354, + "step": 680 + }, + { + "epoch": 1.2, + "grad_norm": 0.05389246717095375, + "learning_rate": 0.0006571658576354334, + "loss": 0.1333, + "step": 681 + }, + { + "epoch": 1.21, + "grad_norm": 0.20890717208385468, + "learning_rate": 0.0006562806130495466, + "loss": 0.1428, + "step": 682 + }, + { + "epoch": 1.21, + "grad_norm": 0.12948615849018097, + "learning_rate": 0.0006553948252071799, + "loss": 0.1372, + "step": 683 + }, + { + "epoch": 1.21, + "grad_norm": 0.16449519991874695, + "learning_rate": 0.0006545084971874737, + "loss": 0.1418, + "step": 684 + }, + { + "epoch": 1.21, + "grad_norm": 0.04887047037482262, + "learning_rate": 0.0006536216320714466, + "loss": 0.139, + "step": 685 + }, + { + "epoch": 1.21, + "grad_norm": 0.1712215691804886, + "learning_rate": 0.0006527342329419836, + "loss": 0.1389, + "step": 686 + }, + { + "epoch": 1.22, + "grad_norm": 0.14935021102428436, + "learning_rate": 0.000651846302883827, + "loss": 0.1369, + "step": 687 + }, + { + "epoch": 1.22, + "grad_norm": 0.16822853684425354, + "learning_rate": 0.0006509578449835636, + "loss": 0.1393, + "step": 688 + }, + { + "epoch": 1.22, + "grad_norm": 0.04274258390069008, + "learning_rate": 0.0006500688623296158, + "loss": 0.1339, + "step": 689 + }, + { + "epoch": 1.22, + "grad_norm": 0.20485758781433105, + "learning_rate": 0.00064917935801223, + "loss": 0.1232, + "step": 690 + }, + { + "epoch": 1.22, + "grad_norm": 0.16438162326812744, + "learning_rate": 0.0006482893351234658, + "loss": 0.1272, + "step": 691 + }, + { + "epoch": 1.22, + "grad_norm": 0.0820753276348114, + "learning_rate": 0.0006473987967571855, + "loss": 0.1368, + "step": 692 + }, + { + "epoch": 1.23, + "grad_norm": 0.5247365832328796, + "learning_rate": 0.000646507746009043, + "loss": 0.1702, + "step": 693 + }, + { + "epoch": 1.23, + "grad_norm": 0.21259160339832306, + "learning_rate": 0.0006456161859764745, + "loss": 0.1384, + "step": 694 + }, + { + "epoch": 1.23, + "grad_norm": 0.10756111145019531, + "learning_rate": 0.0006447241197586847, + "loss": 0.1316, + "step": 695 + }, + { + "epoch": 1.23, + "grad_norm": 0.32431429624557495, + "learning_rate": 0.0006438315504566397, + "loss": 0.1505, + "step": 696 + }, + { + "epoch": 1.23, + "grad_norm": 0.09354525059461594, + "learning_rate": 0.0006429384811730528, + "loss": 0.1338, + "step": 697 + }, + { + "epoch": 1.23, + "grad_norm": 0.25492650270462036, + "learning_rate": 0.0006420449150123767, + "loss": 0.1391, + "step": 698 + }, + { + "epoch": 1.24, + "grad_norm": 0.28658854961395264, + "learning_rate": 0.0006411508550807905, + "loss": 0.1336, + "step": 699 + }, + { + "epoch": 1.24, + "grad_norm": 0.21230942010879517, + "learning_rate": 0.0006402563044861899, + "loss": 0.1369, + "step": 700 + }, + { + "epoch": 1.24, + "grad_norm": 0.13693292438983917, + "learning_rate": 0.0006393612663381763, + "loss": 0.1347, + "step": 701 + }, + { + "epoch": 1.24, + "grad_norm": 0.20328965783119202, + "learning_rate": 0.0006384657437480457, + "loss": 0.1349, + "step": 702 + }, + { + "epoch": 1.24, + "grad_norm": 0.1463640034198761, + "learning_rate": 0.0006375697398287788, + "loss": 0.1316, + "step": 703 + }, + { + "epoch": 1.25, + "grad_norm": 0.47083455324172974, + "learning_rate": 0.0006366732576950283, + "loss": 0.1538, + "step": 704 + }, + { + "epoch": 1.25, + "grad_norm": 0.18148604035377502, + "learning_rate": 0.0006357763004631105, + "loss": 0.1264, + "step": 705 + }, + { + "epoch": 1.25, + "grad_norm": 0.10440527647733688, + "learning_rate": 0.000634878871250992, + "loss": 0.1209, + "step": 706 + }, + { + "epoch": 1.25, + "grad_norm": 0.42732179164886475, + "learning_rate": 0.000633980973178281, + "loss": 0.1581, + "step": 707 + }, + { + "epoch": 1.25, + "grad_norm": 0.09864400327205658, + "learning_rate": 0.0006330826093662157, + "loss": 0.1398, + "step": 708 + }, + { + "epoch": 1.25, + "grad_norm": 0.2839510142803192, + "learning_rate": 0.000632183782937652, + "loss": 0.1448, + "step": 709 + }, + { + "epoch": 1.26, + "grad_norm": 0.18296539783477783, + "learning_rate": 0.0006312844970170551, + "loss": 0.1369, + "step": 710 + }, + { + "epoch": 1.26, + "eval_loss": 0.13823845982551575, + "eval_runtime": 15.0864, + "eval_samples_per_second": 31.618, + "eval_steps_per_second": 7.954, + "step": 710 + }, + { + "epoch": 1.26, + "grad_norm": 0.2305176705121994, + "learning_rate": 0.0006303847547304872, + "loss": 0.1253, + "step": 711 + }, + { + "epoch": 1.26, + "grad_norm": 0.2792215049266815, + "learning_rate": 0.0006294845592055967, + "loss": 0.1292, + "step": 712 + }, + { + "epoch": 1.26, + "grad_norm": 0.3560260236263275, + "learning_rate": 0.0006285839135716078, + "loss": 0.142, + "step": 713 + }, + { + "epoch": 1.26, + "grad_norm": 0.5769198536872864, + "learning_rate": 0.000627682820959309, + "loss": 0.1629, + "step": 714 + }, + { + "epoch": 1.26, + "grad_norm": 0.24223147332668304, + "learning_rate": 0.000626781284501043, + "loss": 0.1436, + "step": 715 + }, + { + "epoch": 1.27, + "grad_norm": 0.20025451481342316, + "learning_rate": 0.0006258793073306948, + "loss": 0.1271, + "step": 716 + }, + { + "epoch": 1.27, + "grad_norm": 0.5235323905944824, + "learning_rate": 0.0006249768925836822, + "loss": 0.1362, + "step": 717 + }, + { + "epoch": 1.27, + "grad_norm": 0.12457533925771713, + "learning_rate": 0.0006240740433969432, + "loss": 0.1267, + "step": 718 + }, + { + "epoch": 1.27, + "grad_norm": 0.22851230204105377, + "learning_rate": 0.0006231707629089263, + "loss": 0.1368, + "step": 719 + }, + { + "epoch": 1.27, + "grad_norm": 0.12162783741950989, + "learning_rate": 0.0006222670542595799, + "loss": 0.1348, + "step": 720 + }, + { + "epoch": 1.28, + "grad_norm": 0.21811127662658691, + "learning_rate": 0.0006213629205903399, + "loss": 0.1302, + "step": 721 + }, + { + "epoch": 1.28, + "grad_norm": 0.1042797714471817, + "learning_rate": 0.0006204583650441201, + "loss": 0.1227, + "step": 722 + }, + { + "epoch": 1.28, + "grad_norm": 0.5917842388153076, + "learning_rate": 0.0006195533907653003, + "loss": 0.1218, + "step": 723 + }, + { + "epoch": 1.28, + "grad_norm": 0.6369093656539917, + "learning_rate": 0.000618648000899717, + "loss": 0.1309, + "step": 724 + }, + { + "epoch": 1.28, + "grad_norm": 0.298677921295166, + "learning_rate": 0.0006177421985946498, + "loss": 0.1329, + "step": 725 + }, + { + "epoch": 1.28, + "grad_norm": 0.5087531208992004, + "learning_rate": 0.0006168359869988133, + "loss": 0.1619, + "step": 726 + }, + { + "epoch": 1.29, + "grad_norm": 0.5805624723434448, + "learning_rate": 0.0006159293692623443, + "loss": 0.1388, + "step": 727 + }, + { + "epoch": 1.29, + "grad_norm": 0.595432698726654, + "learning_rate": 0.0006150223485367914, + "loss": 0.1363, + "step": 728 + }, + { + "epoch": 1.29, + "grad_norm": 2.0664656162261963, + "learning_rate": 0.0006141149279751042, + "loss": 0.1373, + "step": 729 + }, + { + "epoch": 1.29, + "grad_norm": 1.3190929889678955, + "learning_rate": 0.0006132071107316221, + "loss": 0.1434, + "step": 730 + }, + { + "epoch": 1.29, + "grad_norm": 0.19045250117778778, + "learning_rate": 0.0006122988999620634, + "loss": 0.1177, + "step": 731 + }, + { + "epoch": 1.3, + "grad_norm": 1.3130842447280884, + "learning_rate": 0.0006113902988235145, + "loss": 0.1542, + "step": 732 + }, + { + "epoch": 1.3, + "grad_norm": 0.5767085552215576, + "learning_rate": 0.0006104813104744187, + "loss": 0.1627, + "step": 733 + }, + { + "epoch": 1.3, + "grad_norm": 0.45873621106147766, + "learning_rate": 0.0006095719380745653, + "loss": 0.1369, + "step": 734 + }, + { + "epoch": 1.3, + "grad_norm": 0.4458267092704773, + "learning_rate": 0.0006086621847850788, + "loss": 0.1207, + "step": 735 + }, + { + "epoch": 1.3, + "grad_norm": 0.13178426027297974, + "learning_rate": 0.0006077520537684072, + "loss": 0.1263, + "step": 736 + }, + { + "epoch": 1.3, + "grad_norm": 0.19360630214214325, + "learning_rate": 0.0006068415481883121, + "loss": 0.1366, + "step": 737 + }, + { + "epoch": 1.31, + "grad_norm": 0.12965673208236694, + "learning_rate": 0.0006059306712098571, + "loss": 0.1436, + "step": 738 + }, + { + "epoch": 1.31, + "grad_norm": 0.13222691416740417, + "learning_rate": 0.0006050194259993966, + "loss": 0.1294, + "step": 739 + }, + { + "epoch": 1.31, + "grad_norm": 0.14453792572021484, + "learning_rate": 0.0006041078157245648, + "loss": 0.1273, + "step": 740 + }, + { + "epoch": 1.31, + "grad_norm": 0.27612432837486267, + "learning_rate": 0.0006031958435542659, + "loss": 0.1145, + "step": 741 + }, + { + "epoch": 1.31, + "grad_norm": 0.6110266447067261, + "learning_rate": 0.0006022835126586609, + "loss": 0.1299, + "step": 742 + }, + { + "epoch": 1.31, + "grad_norm": 0.29062649607658386, + "learning_rate": 0.0006013708262091586, + "loss": 0.132, + "step": 743 + }, + { + "epoch": 1.32, + "grad_norm": 0.44451919198036194, + "learning_rate": 0.0006004577873784034, + "loss": 0.1235, + "step": 744 + }, + { + "epoch": 1.32, + "grad_norm": 0.15329335629940033, + "learning_rate": 0.0005995443993402648, + "loss": 0.1462, + "step": 745 + }, + { + "epoch": 1.32, + "grad_norm": 0.7718572616577148, + "learning_rate": 0.000598630665269826, + "loss": 0.1309, + "step": 746 + }, + { + "epoch": 1.32, + "grad_norm": 0.427112340927124, + "learning_rate": 0.0005977165883433733, + "loss": 0.1565, + "step": 747 + }, + { + "epoch": 1.32, + "grad_norm": 0.18447764217853546, + "learning_rate": 0.0005968021717383849, + "loss": 0.1431, + "step": 748 + }, + { + "epoch": 1.33, + "grad_norm": 0.2808150053024292, + "learning_rate": 0.0005958874186335193, + "loss": 0.1429, + "step": 749 + }, + { + "epoch": 1.33, + "grad_norm": 0.29161331057548523, + "learning_rate": 0.0005949723322086053, + "loss": 0.1238, + "step": 750 + }, + { + "epoch": 1.33, + "grad_norm": 0.3760308027267456, + "learning_rate": 0.0005940569156446298, + "loss": 0.1416, + "step": 751 + }, + { + "epoch": 1.33, + "grad_norm": 0.24679379165172577, + "learning_rate": 0.0005931411721237279, + "loss": 0.1366, + "step": 752 + }, + { + "epoch": 1.33, + "grad_norm": 0.8587498664855957, + "learning_rate": 0.0005922251048291707, + "loss": 0.1525, + "step": 753 + }, + { + "epoch": 1.33, + "grad_norm": 0.17934030294418335, + "learning_rate": 0.0005913087169453553, + "loss": 0.1287, + "step": 754 + }, + { + "epoch": 1.34, + "grad_norm": 0.43589112162590027, + "learning_rate": 0.0005903920116577931, + "loss": 0.1472, + "step": 755 + }, + { + "epoch": 1.34, + "grad_norm": 0.061591099947690964, + "learning_rate": 0.0005894749921530983, + "loss": 0.129, + "step": 756 + }, + { + "epoch": 1.34, + "grad_norm": 2.9088215827941895, + "learning_rate": 0.0005885576616189781, + "loss": 0.1379, + "step": 757 + }, + { + "epoch": 1.34, + "grad_norm": 0.5655280351638794, + "learning_rate": 0.0005876400232442205, + "loss": 0.138, + "step": 758 + }, + { + "epoch": 1.34, + "grad_norm": 0.05325045809149742, + "learning_rate": 0.0005867220802186837, + "loss": 0.1269, + "step": 759 + }, + { + "epoch": 1.34, + "grad_norm": 0.08003886044025421, + "learning_rate": 0.000585803835733285, + "loss": 0.1386, + "step": 760 + }, + { + "epoch": 1.35, + "grad_norm": 0.09640295058488846, + "learning_rate": 0.0005848852929799894, + "loss": 0.1364, + "step": 761 + }, + { + "epoch": 1.35, + "grad_norm": 0.5506378412246704, + "learning_rate": 0.0005839664551517988, + "loss": 0.1424, + "step": 762 + }, + { + "epoch": 1.35, + "grad_norm": 0.06252831220626831, + "learning_rate": 0.000583047325442741, + "loss": 0.1384, + "step": 763 + }, + { + "epoch": 1.35, + "grad_norm": 0.11976012587547302, + "learning_rate": 0.0005821279070478583, + "loss": 0.1422, + "step": 764 + }, + { + "epoch": 1.35, + "grad_norm": 0.13022871315479279, + "learning_rate": 0.0005812082031631966, + "loss": 0.1357, + "step": 765 + }, + { + "epoch": 1.36, + "grad_norm": 0.12146733701229095, + "learning_rate": 0.0005802882169857938, + "loss": 0.14, + "step": 766 + }, + { + "epoch": 1.36, + "grad_norm": 0.17471055686473846, + "learning_rate": 0.00057936795171367, + "loss": 0.1432, + "step": 767 + }, + { + "epoch": 1.36, + "grad_norm": 0.07833831012248993, + "learning_rate": 0.0005784474105458143, + "loss": 0.1425, + "step": 768 + }, + { + "epoch": 1.36, + "grad_norm": 0.17084555327892303, + "learning_rate": 0.000577526596682176, + "loss": 0.1437, + "step": 769 + }, + { + "epoch": 1.36, + "grad_norm": 0.112625353038311, + "learning_rate": 0.0005766055133236513, + "loss": 0.1429, + "step": 770 + }, + { + "epoch": 1.36, + "grad_norm": 0.15991152822971344, + "learning_rate": 0.000575684163672074, + "loss": 0.1387, + "step": 771 + }, + { + "epoch": 1.37, + "grad_norm": 0.18027663230895996, + "learning_rate": 0.0005747625509302033, + "loss": 0.1439, + "step": 772 + }, + { + "epoch": 1.37, + "grad_norm": 0.4315040111541748, + "learning_rate": 0.0005738406783017127, + "loss": 0.1524, + "step": 773 + }, + { + "epoch": 1.37, + "grad_norm": 0.3343091905117035, + "learning_rate": 0.0005729185489911797, + "loss": 0.1481, + "step": 774 + }, + { + "epoch": 1.37, + "grad_norm": 0.16762638092041016, + "learning_rate": 0.0005719961662040733, + "loss": 0.1389, + "step": 775 + }, + { + "epoch": 1.37, + "grad_norm": 0.11396286636590958, + "learning_rate": 0.0005710735331467444, + "loss": 0.1351, + "step": 776 + }, + { + "epoch": 1.37, + "grad_norm": 0.11262958496809006, + "learning_rate": 0.0005701506530264132, + "loss": 0.1343, + "step": 777 + }, + { + "epoch": 1.38, + "grad_norm": 0.06293229013681412, + "learning_rate": 0.0005692275290511592, + "loss": 0.1322, + "step": 778 + }, + { + "epoch": 1.38, + "grad_norm": 0.037539321929216385, + "learning_rate": 0.0005683041644299093, + "loss": 0.13, + "step": 779 + }, + { + "epoch": 1.38, + "grad_norm": 0.2424931526184082, + "learning_rate": 0.0005673805623724272, + "loss": 0.1333, + "step": 780 + }, + { + "epoch": 1.38, + "grad_norm": 0.2825300991535187, + "learning_rate": 0.0005664567260893019, + "loss": 0.15, + "step": 781 + }, + { + "epoch": 1.38, + "grad_norm": 0.12494263052940369, + "learning_rate": 0.000565532658791936, + "loss": 0.1317, + "step": 782 + }, + { + "epoch": 1.39, + "grad_norm": 0.29031720757484436, + "learning_rate": 0.0005646083636925362, + "loss": 0.1593, + "step": 783 + }, + { + "epoch": 1.39, + "grad_norm": 0.3038758933544159, + "learning_rate": 0.0005636838440041004, + "loss": 0.1551, + "step": 784 + }, + { + "epoch": 1.39, + "grad_norm": 0.14983759820461273, + "learning_rate": 0.0005627591029404071, + "loss": 0.1402, + "step": 785 + }, + { + "epoch": 1.39, + "grad_norm": 0.25971877574920654, + "learning_rate": 0.0005618341437160049, + "loss": 0.1389, + "step": 786 + }, + { + "epoch": 1.39, + "grad_norm": 0.2425714135169983, + "learning_rate": 0.0005609089695462002, + "loss": 0.1399, + "step": 787 + }, + { + "epoch": 1.39, + "grad_norm": 0.11737050861120224, + "learning_rate": 0.0005599835836470469, + "loss": 0.1237, + "step": 788 + }, + { + "epoch": 1.4, + "grad_norm": 0.6737673878669739, + "learning_rate": 0.0005590579892353348, + "loss": 0.1837, + "step": 789 + }, + { + "epoch": 1.4, + "grad_norm": 0.2361481785774231, + "learning_rate": 0.0005581321895285787, + "loss": 0.146, + "step": 790 + }, + { + "epoch": 1.4, + "grad_norm": 0.47753140330314636, + "learning_rate": 0.0005572061877450068, + "loss": 0.1664, + "step": 791 + }, + { + "epoch": 1.4, + "grad_norm": 0.2968634068965912, + "learning_rate": 0.0005562799871035495, + "loss": 0.1511, + "step": 792 + }, + { + "epoch": 1.4, + "grad_norm": 0.20170801877975464, + "learning_rate": 0.0005553535908238294, + "loss": 0.1408, + "step": 793 + }, + { + "epoch": 1.4, + "grad_norm": 0.11540532112121582, + "learning_rate": 0.0005544270021261482, + "loss": 0.1415, + "step": 794 + }, + { + "epoch": 1.41, + "grad_norm": 0.10350099951028824, + "learning_rate": 0.0005535002242314772, + "loss": 0.1393, + "step": 795 + }, + { + "epoch": 1.41, + "grad_norm": 0.06757602840662003, + "learning_rate": 0.0005525732603614444, + "loss": 0.1335, + "step": 796 + }, + { + "epoch": 1.41, + "grad_norm": 0.11407013237476349, + "learning_rate": 0.0005516461137383254, + "loss": 0.1342, + "step": 797 + }, + { + "epoch": 1.41, + "grad_norm": 0.40271708369255066, + "learning_rate": 0.0005507187875850305, + "loss": 0.1536, + "step": 798 + }, + { + "epoch": 1.41, + "grad_norm": 0.35031354427337646, + "learning_rate": 0.000549791285125094, + "loss": 0.1489, + "step": 799 + }, + { + "epoch": 1.42, + "grad_norm": 0.347901850938797, + "learning_rate": 0.0005488636095826636, + "loss": 0.1463, + "step": 800 + }, + { + "epoch": 1.42, + "grad_norm": 0.17497143149375916, + "learning_rate": 0.0005479357641824877, + "loss": 0.1385, + "step": 801 + }, + { + "epoch": 1.42, + "grad_norm": 0.42803797125816345, + "learning_rate": 0.0005470077521499062, + "loss": 0.1438, + "step": 802 + }, + { + "epoch": 1.42, + "grad_norm": 0.77762371301651, + "learning_rate": 0.0005460795767108378, + "loss": 0.1616, + "step": 803 + }, + { + "epoch": 1.42, + "grad_norm": 0.27612486481666565, + "learning_rate": 0.0005451512410917691, + "loss": 0.1424, + "step": 804 + }, + { + "epoch": 1.42, + "grad_norm": 0.10936840623617172, + "learning_rate": 0.0005442227485197435, + "loss": 0.1379, + "step": 805 + }, + { + "epoch": 1.43, + "grad_norm": 0.19322127103805542, + "learning_rate": 0.0005432941022223503, + "loss": 0.1279, + "step": 806 + }, + { + "epoch": 1.43, + "grad_norm": 0.14601223170757294, + "learning_rate": 0.000542365305427713, + "loss": 0.1394, + "step": 807 + }, + { + "epoch": 1.43, + "grad_norm": 0.05485713109374046, + "learning_rate": 0.0005414363613644781, + "loss": 0.1245, + "step": 808 + }, + { + "epoch": 1.43, + "grad_norm": 3.4448142051696777, + "learning_rate": 0.0005405072732618043, + "loss": 0.3245, + "step": 809 + }, + { + "epoch": 1.43, + "grad_norm": 0.3802805542945862, + "learning_rate": 0.0005395780443493508, + "loss": 0.1617, + "step": 810 + }, + { + "epoch": 1.43, + "grad_norm": 2.19525146484375, + "learning_rate": 0.0005386486778572665, + "loss": 0.3246, + "step": 811 + }, + { + "epoch": 1.44, + "grad_norm": 0.1237780973315239, + "learning_rate": 0.0005377191770161783, + "loss": 0.1348, + "step": 812 + }, + { + "epoch": 1.44, + "grad_norm": 0.17096708714962006, + "learning_rate": 0.0005367895450571801, + "loss": 0.1417, + "step": 813 + }, + { + "epoch": 1.44, + "grad_norm": 0.033038243651390076, + "learning_rate": 0.0005358597852118219, + "loss": 0.1308, + "step": 814 + }, + { + "epoch": 1.44, + "grad_norm": 0.2681437134742737, + "learning_rate": 0.000534929900712098, + "loss": 0.1465, + "step": 815 + }, + { + "epoch": 1.44, + "grad_norm": 0.11117050051689148, + "learning_rate": 0.0005339998947904363, + "loss": 0.1383, + "step": 816 + }, + { + "epoch": 1.45, + "grad_norm": 0.25842952728271484, + "learning_rate": 0.0005330697706796861, + "loss": 0.1397, + "step": 817 + }, + { + "epoch": 1.45, + "grad_norm": 0.08293187618255615, + "learning_rate": 0.0005321395316131083, + "loss": 0.1356, + "step": 818 + }, + { + "epoch": 1.45, + "grad_norm": 0.14762446284294128, + "learning_rate": 0.0005312091808243631, + "loss": 0.1416, + "step": 819 + }, + { + "epoch": 1.45, + "grad_norm": 0.4471145570278168, + "learning_rate": 0.0005302787215474991, + "loss": 0.1461, + "step": 820 + }, + { + "epoch": 1.45, + "grad_norm": 0.2443719059228897, + "learning_rate": 0.0005293481570169421, + "loss": 0.1458, + "step": 821 + }, + { + "epoch": 1.45, + "grad_norm": 0.22830860316753387, + "learning_rate": 0.0005284174904674835, + "loss": 0.139, + "step": 822 + }, + { + "epoch": 1.46, + "grad_norm": 0.5184169411659241, + "learning_rate": 0.0005274867251342694, + "loss": 0.1417, + "step": 823 + }, + { + "epoch": 1.46, + "grad_norm": 0.3812021017074585, + "learning_rate": 0.0005265558642527897, + "loss": 0.1346, + "step": 824 + }, + { + "epoch": 1.46, + "grad_norm": 0.2922486662864685, + "learning_rate": 0.0005256249110588659, + "loss": 0.1294, + "step": 825 + }, + { + "epoch": 1.46, + "grad_norm": 0.29819944500923157, + "learning_rate": 0.0005246938687886409, + "loss": 0.1401, + "step": 826 + }, + { + "epoch": 1.46, + "grad_norm": 0.07050393521785736, + "learning_rate": 0.0005237627406785666, + "loss": 0.1307, + "step": 827 + }, + { + "epoch": 1.46, + "grad_norm": 0.1988169550895691, + "learning_rate": 0.0005228315299653941, + "loss": 0.1359, + "step": 828 + }, + { + "epoch": 1.47, + "grad_norm": 0.31983789801597595, + "learning_rate": 0.0005219002398861611, + "loss": 0.1459, + "step": 829 + }, + { + "epoch": 1.47, + "grad_norm": 0.28960883617401123, + "learning_rate": 0.000520968873678181, + "loss": 0.1447, + "step": 830 + }, + { + "epoch": 1.47, + "grad_norm": 0.36790764331817627, + "learning_rate": 0.0005200374345790325, + "loss": 0.1287, + "step": 831 + }, + { + "epoch": 1.47, + "grad_norm": 0.055655404925346375, + "learning_rate": 0.0005191059258265471, + "loss": 0.1346, + "step": 832 + }, + { + "epoch": 1.47, + "grad_norm": 0.42271995544433594, + "learning_rate": 0.0005181743506587989, + "loss": 0.1445, + "step": 833 + }, + { + "epoch": 1.48, + "grad_norm": 0.4579026997089386, + "learning_rate": 0.0005172427123140923, + "loss": 0.1397, + "step": 834 + }, + { + "epoch": 1.48, + "grad_norm": 0.30265891551971436, + "learning_rate": 0.0005163110140309518, + "loss": 0.1389, + "step": 835 + }, + { + "epoch": 1.48, + "grad_norm": 0.23372715711593628, + "learning_rate": 0.0005153792590481101, + "loss": 0.1426, + "step": 836 + }, + { + "epoch": 1.48, + "grad_norm": 0.22771018743515015, + "learning_rate": 0.0005144474506044969, + "loss": 0.1412, + "step": 837 + }, + { + "epoch": 1.48, + "grad_norm": 0.32557952404022217, + "learning_rate": 0.000513515591939228, + "loss": 0.1382, + "step": 838 + }, + { + "epoch": 1.48, + "grad_norm": 0.4409979283809662, + "learning_rate": 0.0005125836862915934, + "loss": 0.1382, + "step": 839 + }, + { + "epoch": 1.49, + "grad_norm": 112.177978515625, + "learning_rate": 0.0005116517369010466, + "loss": 1.093, + "step": 840 + }, + { + "epoch": 1.49, + "grad_norm": 0.13140363991260529, + "learning_rate": 0.0005107197470071933, + "loss": 0.1344, + "step": 841 + }, + { + "epoch": 1.49, + "grad_norm": 0.0935206264257431, + "learning_rate": 0.00050978771984978, + "loss": 0.1315, + "step": 842 + }, + { + "epoch": 1.49, + "grad_norm": 0.5304569602012634, + "learning_rate": 0.0005088556586686822, + "loss": 0.1549, + "step": 843 + }, + { + "epoch": 1.49, + "grad_norm": 0.07438669353723526, + "learning_rate": 0.0005079235667038944, + "loss": 0.1311, + "step": 844 + }, + { + "epoch": 1.49, + "grad_norm": 0.17763537168502808, + "learning_rate": 0.0005069914471955179, + "loss": 0.1342, + "step": 845 + }, + { + "epoch": 1.5, + "grad_norm": 0.3256682753562927, + "learning_rate": 0.0005060593033837493, + "loss": 0.1435, + "step": 846 + }, + { + "epoch": 1.5, + "grad_norm": 0.3771526515483856, + "learning_rate": 0.0005051271385088701, + "loss": 0.1434, + "step": 847 + }, + { + "epoch": 1.5, + "grad_norm": 0.3716539740562439, + "learning_rate": 0.0005041949558112351, + "loss": 0.1329, + "step": 848 + }, + { + "epoch": 1.5, + "grad_norm": 0.13685204088687897, + "learning_rate": 0.0005032627585312608, + "loss": 0.1415, + "step": 849 + }, + { + "epoch": 1.5, + "grad_norm": 0.21241213381290436, + "learning_rate": 0.0005023305499094144, + "loss": 0.1384, + "step": 850 + }, + { + "epoch": 1.51, + "grad_norm": 0.05780967324972153, + "learning_rate": 0.0005013983331862026, + "loss": 0.1366, + "step": 851 + }, + { + "epoch": 1.51, + "grad_norm": 0.5117526650428772, + "learning_rate": 0.0005004661116021605, + "loss": 0.1537, + "step": 852 + }, + { + "epoch": 1.51, + "eval_loss": 0.14083661139011383, + "eval_runtime": 14.5613, + "eval_samples_per_second": 32.758, + "eval_steps_per_second": 8.241, + "step": 852 + }, + { + "epoch": 1.51, + "grad_norm": 0.6577463150024414, + "learning_rate": 0.0004995338883978395, + "loss": 0.1461, + "step": 853 + }, + { + "epoch": 1.51, + "grad_norm": 0.35039347410202026, + "learning_rate": 0.0004986016668137974, + "loss": 0.1345, + "step": 854 + }, + { + "epoch": 1.51, + "grad_norm": 0.1379460096359253, + "learning_rate": 0.0004976694500905857, + "loss": 0.1425, + "step": 855 + }, + { + "epoch": 1.51, + "grad_norm": 0.23959362506866455, + "learning_rate": 0.0004967372414687393, + "loss": 0.1535, + "step": 856 + }, + { + "epoch": 1.52, + "grad_norm": 0.387977659702301, + "learning_rate": 0.000495805044188765, + "loss": 0.1709, + "step": 857 + }, + { + "epoch": 1.52, + "grad_norm": 0.10591788589954376, + "learning_rate": 0.0004948728614911299, + "loss": 0.137, + "step": 858 + }, + { + "epoch": 1.52, + "grad_norm": 0.1370454728603363, + "learning_rate": 0.0004939406966162507, + "loss": 0.1413, + "step": 859 + }, + { + "epoch": 1.52, + "grad_norm": 0.10982546955347061, + "learning_rate": 0.0004930085528044823, + "loss": 0.1422, + "step": 860 + }, + { + "epoch": 1.52, + "grad_norm": 0.1631615161895752, + "learning_rate": 0.0004920764332961055, + "loss": 0.1439, + "step": 861 + }, + { + "epoch": 1.52, + "grad_norm": 0.4625565707683563, + "learning_rate": 0.0004911443413313179, + "loss": 0.13, + "step": 862 + }, + { + "epoch": 1.53, + "grad_norm": 0.09019370377063751, + "learning_rate": 0.0004902122801502201, + "loss": 0.1367, + "step": 863 + }, + { + "epoch": 1.53, + "grad_norm": 0.058873746544122696, + "learning_rate": 0.0004892802529928067, + "loss": 0.1388, + "step": 864 + }, + { + "epoch": 1.53, + "grad_norm": 0.16651901602745056, + "learning_rate": 0.0004883482630989535, + "loss": 0.1383, + "step": 865 + }, + { + "epoch": 1.53, + "grad_norm": 0.1222059577703476, + "learning_rate": 0.00048741631370840676, + "loss": 0.1391, + "step": 866 + }, + { + "epoch": 1.53, + "grad_norm": 0.15417732298374176, + "learning_rate": 0.00048648440806077226, + "loss": 0.1368, + "step": 867 + }, + { + "epoch": 1.54, + "grad_norm": 0.19719868898391724, + "learning_rate": 0.00048555254939550326, + "loss": 0.1423, + "step": 868 + }, + { + "epoch": 1.54, + "grad_norm": 0.1811167150735855, + "learning_rate": 0.0004846207409518899, + "loss": 0.1382, + "step": 869 + }, + { + "epoch": 1.54, + "grad_norm": 0.12746267020702362, + "learning_rate": 0.0004836889859690483, + "loss": 0.1375, + "step": 870 + }, + { + "epoch": 1.54, + "grad_norm": 0.18294665217399597, + "learning_rate": 0.00048275728768590776, + "loss": 0.1376, + "step": 871 + }, + { + "epoch": 1.54, + "grad_norm": 0.14346922934055328, + "learning_rate": 0.0004818256493412011, + "loss": 0.137, + "step": 872 + }, + { + "epoch": 1.54, + "grad_norm": 0.07995035499334335, + "learning_rate": 0.00048089407417345296, + "loss": 0.1356, + "step": 873 + }, + { + "epoch": 1.55, + "grad_norm": 0.14909203350543976, + "learning_rate": 0.0004799625654209675, + "loss": 0.1374, + "step": 874 + }, + { + "epoch": 1.55, + "grad_norm": 0.06708569079637527, + "learning_rate": 0.00047903112632181904, + "loss": 0.1381, + "step": 875 + }, + { + "epoch": 1.55, + "grad_norm": 0.22370800375938416, + "learning_rate": 0.00047809976011383906, + "loss": 0.1445, + "step": 876 + }, + { + "epoch": 1.55, + "grad_norm": 0.05151727795600891, + "learning_rate": 0.0004771684700346059, + "loss": 0.1371, + "step": 877 + }, + { + "epoch": 1.55, + "grad_norm": 0.12744151055812836, + "learning_rate": 0.0004762372593214335, + "loss": 0.1369, + "step": 878 + }, + { + "epoch": 1.56, + "grad_norm": 0.13104400038719177, + "learning_rate": 0.0004753061312113592, + "loss": 0.1346, + "step": 879 + }, + { + "epoch": 1.56, + "grad_norm": 0.11967893689870834, + "learning_rate": 0.00047437508894113416, + "loss": 0.1318, + "step": 880 + }, + { + "epoch": 1.56, + "grad_norm": 0.035317592322826385, + "learning_rate": 0.00047344413574721046, + "loss": 0.1352, + "step": 881 + }, + { + "epoch": 1.56, + "grad_norm": 0.15099988877773285, + "learning_rate": 0.0004725132748657307, + "loss": 0.1401, + "step": 882 + }, + { + "epoch": 1.56, + "grad_norm": 0.24859024584293365, + "learning_rate": 0.0004715825095325168, + "loss": 0.1277, + "step": 883 + }, + { + "epoch": 1.56, + "grad_norm": 0.11024681478738785, + "learning_rate": 0.00047065184298305797, + "loss": 0.1375, + "step": 884 + }, + { + "epoch": 1.57, + "grad_norm": 0.031196558848023415, + "learning_rate": 0.00046972127845250084, + "loss": 0.133, + "step": 885 + }, + { + "epoch": 1.57, + "grad_norm": 0.05172949284315109, + "learning_rate": 0.00046879081917563695, + "loss": 0.1324, + "step": 886 + }, + { + "epoch": 1.57, + "grad_norm": 0.04595587030053139, + "learning_rate": 0.0004678604683868918, + "loss": 0.1361, + "step": 887 + }, + { + "epoch": 1.57, + "grad_norm": 0.054625846445560455, + "learning_rate": 0.00046693022932031415, + "loss": 0.1334, + "step": 888 + }, + { + "epoch": 1.57, + "grad_norm": 0.18956537544727325, + "learning_rate": 0.0004660001052095639, + "loss": 0.1419, + "step": 889 + }, + { + "epoch": 1.57, + "grad_norm": 0.12293694168329239, + "learning_rate": 0.00046507009928790195, + "loss": 0.1234, + "step": 890 + }, + { + "epoch": 1.58, + "grad_norm": 0.12140147387981415, + "learning_rate": 0.00046414021478817817, + "loss": 0.1282, + "step": 891 + }, + { + "epoch": 1.58, + "grad_norm": 0.20622491836547852, + "learning_rate": 0.00046321045494282, + "loss": 0.1238, + "step": 892 + }, + { + "epoch": 1.58, + "grad_norm": 0.1974942833185196, + "learning_rate": 0.00046228082298382196, + "loss": 0.1511, + "step": 893 + }, + { + "epoch": 1.58, + "grad_norm": 0.13894042372703552, + "learning_rate": 0.0004613513221427337, + "loss": 0.1349, + "step": 894 + }, + { + "epoch": 1.58, + "grad_norm": 0.023365622386336327, + "learning_rate": 0.00046042195565064914, + "loss": 0.1371, + "step": 895 + }, + { + "epoch": 1.59, + "grad_norm": 0.16795076429843903, + "learning_rate": 0.0004594927267381958, + "loss": 0.1399, + "step": 896 + }, + { + "epoch": 1.59, + "grad_norm": 0.06654185056686401, + "learning_rate": 0.00045856363863552195, + "loss": 0.1365, + "step": 897 + }, + { + "epoch": 1.59, + "grad_norm": 0.21354056894779205, + "learning_rate": 0.00045763469457228695, + "loss": 0.1431, + "step": 898 + }, + { + "epoch": 1.59, + "grad_norm": 0.1247892901301384, + "learning_rate": 0.0004567058977776498, + "loss": 0.1391, + "step": 899 + }, + { + "epoch": 1.59, + "grad_norm": 0.1229679062962532, + "learning_rate": 0.00045577725148025647, + "loss": 0.1324, + "step": 900 + }, + { + "epoch": 1.59, + "grad_norm": 0.0285334512591362, + "learning_rate": 0.000454848758908231, + "loss": 0.1417, + "step": 901 + }, + { + "epoch": 1.6, + "grad_norm": 0.11522159725427628, + "learning_rate": 0.0004539204232891622, + "loss": 0.1349, + "step": 902 + }, + { + "epoch": 1.6, + "grad_norm": 0.04999208077788353, + "learning_rate": 0.00045299224785009374, + "loss": 0.1395, + "step": 903 + }, + { + "epoch": 1.6, + "grad_norm": 0.19387230277061462, + "learning_rate": 0.00045206423581751245, + "loss": 0.1367, + "step": 904 + }, + { + "epoch": 1.6, + "grad_norm": 0.030587391927838326, + "learning_rate": 0.0004511363904173366, + "loss": 0.1392, + "step": 905 + }, + { + "epoch": 1.6, + "grad_norm": 0.031090332195162773, + "learning_rate": 0.0004502087148749061, + "loss": 0.137, + "step": 906 + }, + { + "epoch": 1.6, + "grad_norm": 0.07691047340631485, + "learning_rate": 0.0004492812124149696, + "loss": 0.144, + "step": 907 + }, + { + "epoch": 1.61, + "grad_norm": 0.0668129101395607, + "learning_rate": 0.0004483538862616747, + "loss": 0.1337, + "step": 908 + }, + { + "epoch": 1.61, + "grad_norm": 0.3449445962905884, + "learning_rate": 0.00044742673963855576, + "loss": 0.1526, + "step": 909 + }, + { + "epoch": 1.61, + "grad_norm": 0.19746670126914978, + "learning_rate": 0.000446499775768523, + "loss": 0.1319, + "step": 910 + }, + { + "epoch": 1.61, + "grad_norm": 0.11988267302513123, + "learning_rate": 0.0004455729978738517, + "loss": 0.1411, + "step": 911 + }, + { + "epoch": 1.61, + "grad_norm": 0.17063240706920624, + "learning_rate": 0.00044464640917617063, + "loss": 0.1354, + "step": 912 + }, + { + "epoch": 1.62, + "grad_norm": 0.26187554001808167, + "learning_rate": 0.00044372001289645044, + "loss": 0.136, + "step": 913 + }, + { + "epoch": 1.62, + "grad_norm": 0.05965143442153931, + "learning_rate": 0.00044279381225499344, + "loss": 0.1398, + "step": 914 + }, + { + "epoch": 1.62, + "grad_norm": 0.07176820188760757, + "learning_rate": 0.00044186781047142134, + "loss": 0.1388, + "step": 915 + }, + { + "epoch": 1.62, + "grad_norm": 0.038787998259067535, + "learning_rate": 0.0004409420107646652, + "loss": 0.1383, + "step": 916 + }, + { + "epoch": 1.62, + "grad_norm": 0.03987140208482742, + "learning_rate": 0.0004400164163529532, + "loss": 0.1366, + "step": 917 + }, + { + "epoch": 1.62, + "grad_norm": 0.12179240584373474, + "learning_rate": 0.00043909103045379987, + "loss": 0.1306, + "step": 918 + }, + { + "epoch": 1.63, + "grad_norm": 0.09804455190896988, + "learning_rate": 0.0004381658562839953, + "loss": 0.128, + "step": 919 + }, + { + "epoch": 1.63, + "grad_norm": 0.08840085566043854, + "learning_rate": 0.00043724089705959304, + "loss": 0.1364, + "step": 920 + }, + { + "epoch": 1.63, + "grad_norm": 0.18564368784427643, + "learning_rate": 0.00043631615599589964, + "loss": 0.1485, + "step": 921 + }, + { + "epoch": 1.63, + "grad_norm": 0.3653159737586975, + "learning_rate": 0.00043539163630746384, + "loss": 0.1486, + "step": 922 + }, + { + "epoch": 1.63, + "grad_norm": 0.08679798990488052, + "learning_rate": 0.000434467341208064, + "loss": 0.1291, + "step": 923 + }, + { + "epoch": 1.63, + "grad_norm": 0.1024034321308136, + "learning_rate": 0.00043354327391069826, + "loss": 0.1275, + "step": 924 + }, + { + "epoch": 1.64, + "grad_norm": 0.041372958570718765, + "learning_rate": 0.0004326194376275729, + "loss": 0.1328, + "step": 925 + }, + { + "epoch": 1.64, + "grad_norm": 0.06509742885828018, + "learning_rate": 0.0004316958355700906, + "loss": 0.1324, + "step": 926 + }, + { + "epoch": 1.64, + "grad_norm": 0.09408631920814514, + "learning_rate": 0.0004307724709488409, + "loss": 0.1405, + "step": 927 + }, + { + "epoch": 1.64, + "grad_norm": 0.1963924914598465, + "learning_rate": 0.0004298493469735869, + "loss": 0.1436, + "step": 928 + }, + { + "epoch": 1.64, + "grad_norm": 0.10209079831838608, + "learning_rate": 0.0004289264668532557, + "loss": 0.1277, + "step": 929 + }, + { + "epoch": 1.65, + "grad_norm": 0.026920847594738007, + "learning_rate": 0.00042800383379592677, + "loss": 0.1295, + "step": 930 + }, + { + "epoch": 1.65, + "grad_norm": 0.03551056608557701, + "learning_rate": 0.00042708145100882035, + "loss": 0.1281, + "step": 931 + }, + { + "epoch": 1.65, + "grad_norm": 0.14194993674755096, + "learning_rate": 0.00042615932169828743, + "loss": 0.1398, + "step": 932 + }, + { + "epoch": 1.65, + "grad_norm": 0.2725144326686859, + "learning_rate": 0.00042523744906979683, + "loss": 0.1217, + "step": 933 + }, + { + "epoch": 1.65, + "grad_norm": 0.22893387079238892, + "learning_rate": 0.00042431583632792605, + "loss": 0.1517, + "step": 934 + }, + { + "epoch": 1.65, + "grad_norm": 0.20985311269760132, + "learning_rate": 0.00042339448667634886, + "loss": 0.1433, + "step": 935 + }, + { + "epoch": 1.66, + "grad_norm": 0.053967542946338654, + "learning_rate": 0.00042247340331782416, + "loss": 0.12, + "step": 936 + }, + { + "epoch": 1.66, + "grad_norm": 0.22838272154331207, + "learning_rate": 0.0004215525894541856, + "loss": 0.1176, + "step": 937 + }, + { + "epoch": 1.66, + "grad_norm": 0.3237338066101074, + "learning_rate": 0.0004206320482863301, + "loss": 0.1476, + "step": 938 + }, + { + "epoch": 1.66, + "grad_norm": 0.09525377303361893, + "learning_rate": 0.0004197117830142062, + "loss": 0.1342, + "step": 939 + }, + { + "epoch": 1.66, + "grad_norm": 0.05312574282288551, + "learning_rate": 0.0004187917968368036, + "loss": 0.1311, + "step": 940 + }, + { + "epoch": 1.66, + "grad_norm": 0.11625714600086212, + "learning_rate": 0.00041787209295214177, + "loss": 0.1133, + "step": 941 + }, + { + "epoch": 1.67, + "grad_norm": 0.04892723262310028, + "learning_rate": 0.000416952674557259, + "loss": 0.106, + "step": 942 + }, + { + "epoch": 1.67, + "grad_norm": 0.44417399168014526, + "learning_rate": 0.00041603354484820134, + "loss": 0.1653, + "step": 943 + }, + { + "epoch": 1.67, + "grad_norm": 0.07979090511798859, + "learning_rate": 0.00041511470702001074, + "loss": 0.1144, + "step": 944 + }, + { + "epoch": 1.67, + "grad_norm": 3.163567304611206, + "learning_rate": 0.00041419616426671517, + "loss": 0.1576, + "step": 945 + }, + { + "epoch": 1.67, + "grad_norm": 0.11929760128259659, + "learning_rate": 0.0004132779197813164, + "loss": 0.1329, + "step": 946 + }, + { + "epoch": 1.68, + "grad_norm": 0.21021872758865356, + "learning_rate": 0.0004123599767557795, + "loss": 0.0956, + "step": 947 + }, + { + "epoch": 1.68, + "grad_norm": 0.4803867042064667, + "learning_rate": 0.00041144233838102197, + "loss": 0.2027, + "step": 948 + }, + { + "epoch": 1.68, + "grad_norm": 0.0795937329530716, + "learning_rate": 0.0004105250078469018, + "loss": 0.1226, + "step": 949 + }, + { + "epoch": 1.68, + "grad_norm": 0.19914481043815613, + "learning_rate": 0.00040960798834220705, + "loss": 0.1457, + "step": 950 + }, + { + "epoch": 1.68, + "grad_norm": 1.662695288658142, + "learning_rate": 0.00040869128305464475, + "loss": 0.1465, + "step": 951 + }, + { + "epoch": 1.68, + "grad_norm": 0.1512700617313385, + "learning_rate": 0.00040777489517082924, + "loss": 0.1391, + "step": 952 + }, + { + "epoch": 1.69, + "grad_norm": 0.3317195773124695, + "learning_rate": 0.00040685882787627227, + "loss": 0.1397, + "step": 953 + }, + { + "epoch": 1.69, + "grad_norm": 0.2609153985977173, + "learning_rate": 0.00040594308435537026, + "loss": 0.1217, + "step": 954 + }, + { + "epoch": 1.69, + "grad_norm": 0.40559151768684387, + "learning_rate": 0.00040502766779139485, + "loss": 0.1317, + "step": 955 + }, + { + "epoch": 1.69, + "grad_norm": 0.21796320378780365, + "learning_rate": 0.0004041125813664808, + "loss": 0.1364, + "step": 956 + }, + { + "epoch": 1.69, + "grad_norm": 0.6307505369186401, + "learning_rate": 0.0004031978282616151, + "loss": 0.1553, + "step": 957 + }, + { + "epoch": 1.69, + "grad_norm": 0.28565332293510437, + "learning_rate": 0.00040228341165662683, + "loss": 0.1344, + "step": 958 + }, + { + "epoch": 1.7, + "grad_norm": 0.4165158271789551, + "learning_rate": 0.0004013693347301741, + "loss": 0.1455, + "step": 959 + }, + { + "epoch": 1.7, + "grad_norm": 0.821273922920227, + "learning_rate": 0.0004004556006597353, + "loss": 0.1287, + "step": 960 + }, + { + "epoch": 1.7, + "grad_norm": 0.11364096403121948, + "learning_rate": 0.0003995422126215967, + "loss": 0.1177, + "step": 961 + }, + { + "epoch": 1.7, + "grad_norm": 0.349627822637558, + "learning_rate": 0.0003986291737908414, + "loss": 0.1217, + "step": 962 + }, + { + "epoch": 1.7, + "grad_norm": 0.10462171584367752, + "learning_rate": 0.0003977164873413391, + "loss": 0.1168, + "step": 963 + }, + { + "epoch": 1.71, + "grad_norm": 0.11335984617471695, + "learning_rate": 0.0003968041564457342, + "loss": 0.1313, + "step": 964 + }, + { + "epoch": 1.71, + "grad_norm": 0.37488850951194763, + "learning_rate": 0.0003958921842754351, + "loss": 0.133, + "step": 965 + }, + { + "epoch": 1.71, + "grad_norm": 0.09337367117404938, + "learning_rate": 0.00039498057400060363, + "loss": 0.1464, + "step": 966 + }, + { + "epoch": 1.71, + "grad_norm": 0.27405792474746704, + "learning_rate": 0.000394069328790143, + "loss": 0.1322, + "step": 967 + }, + { + "epoch": 1.71, + "grad_norm": 0.5987095832824707, + "learning_rate": 0.00039315845181168784, + "loss": 0.1307, + "step": 968 + }, + { + "epoch": 1.71, + "grad_norm": 0.3096538484096527, + "learning_rate": 0.00039224794623159294, + "loss": 0.1349, + "step": 969 + }, + { + "epoch": 1.72, + "grad_norm": 0.5547122359275818, + "learning_rate": 0.0003913378152149214, + "loss": 0.1455, + "step": 970 + }, + { + "epoch": 1.72, + "grad_norm": 0.4886229634284973, + "learning_rate": 0.0003904280619254348, + "loss": 0.1251, + "step": 971 + }, + { + "epoch": 1.72, + "grad_norm": 0.18052807450294495, + "learning_rate": 0.0003895186895255814, + "loss": 0.1407, + "step": 972 + }, + { + "epoch": 1.72, + "grad_norm": 0.09001462161540985, + "learning_rate": 0.0003886097011764855, + "loss": 0.1143, + "step": 973 + }, + { + "epoch": 1.72, + "grad_norm": 0.3248112201690674, + "learning_rate": 0.0003877011000379367, + "loss": 0.1212, + "step": 974 + }, + { + "epoch": 1.72, + "grad_norm": 0.11648620665073395, + "learning_rate": 0.000386792889268378, + "loss": 0.1167, + "step": 975 + }, + { + "epoch": 1.73, + "grad_norm": 1.3816261291503906, + "learning_rate": 0.00038588507202489585, + "loss": 0.1518, + "step": 976 + }, + { + "epoch": 1.73, + "grad_norm": 0.6389634013175964, + "learning_rate": 0.00038497765146320873, + "loss": 0.1372, + "step": 977 + }, + { + "epoch": 1.73, + "grad_norm": 0.6133326888084412, + "learning_rate": 0.0003840706307376557, + "loss": 0.1252, + "step": 978 + }, + { + "epoch": 1.73, + "grad_norm": 0.20734143257141113, + "learning_rate": 0.00038316401300118674, + "loss": 0.1115, + "step": 979 + }, + { + "epoch": 1.73, + "grad_norm": 0.06368093192577362, + "learning_rate": 0.0003822578014053502, + "loss": 0.145, + "step": 980 + }, + { + "epoch": 1.74, + "grad_norm": 0.07665737718343735, + "learning_rate": 0.0003813519991002831, + "loss": 0.1557, + "step": 981 + }, + { + "epoch": 1.74, + "grad_norm": 0.09085717052221298, + "learning_rate": 0.00038044660923469963, + "loss": 0.1251, + "step": 982 + }, + { + "epoch": 1.74, + "grad_norm": 0.08342912048101425, + "learning_rate": 0.00037954163495588, + "loss": 0.1256, + "step": 983 + }, + { + "epoch": 1.74, + "grad_norm": 0.6068560481071472, + "learning_rate": 0.00037863707940966024, + "loss": 0.1506, + "step": 984 + }, + { + "epoch": 1.74, + "grad_norm": 0.22720251977443695, + "learning_rate": 0.00037773294574042015, + "loss": 0.1151, + "step": 985 + }, + { + "epoch": 1.74, + "grad_norm": 0.5528678297996521, + "learning_rate": 0.00037682923709107363, + "loss": 0.1436, + "step": 986 + }, + { + "epoch": 1.75, + "grad_norm": 0.6791836619377136, + "learning_rate": 0.00037592595660305707, + "loss": 0.1405, + "step": 987 + }, + { + "epoch": 1.75, + "grad_norm": 0.07115644961595535, + "learning_rate": 0.0003750231074163179, + "loss": 0.0997, + "step": 988 + }, + { + "epoch": 1.75, + "grad_norm": 0.36401447653770447, + "learning_rate": 0.00037412069266930514, + "loss": 0.1471, + "step": 989 + }, + { + "epoch": 1.75, + "grad_norm": 0.7831732630729675, + "learning_rate": 0.00037321871549895715, + "loss": 0.1314, + "step": 990 + }, + { + "epoch": 1.75, + "grad_norm": 0.12779557704925537, + "learning_rate": 0.00037231717904069096, + "loss": 0.1446, + "step": 991 + }, + { + "epoch": 1.75, + "grad_norm": 0.41478636860847473, + "learning_rate": 0.0003714160864283923, + "loss": 0.1429, + "step": 992 + }, + { + "epoch": 1.76, + "grad_norm": 0.1117364913225174, + "learning_rate": 0.00037051544079440334, + "loss": 0.1148, + "step": 993 + }, + { + "epoch": 1.76, + "grad_norm": 0.5920963287353516, + "learning_rate": 0.00036961524526951277, + "loss": 0.1204, + "step": 994 + }, + { + "epoch": 1.76, + "eval_loss": 0.13537168502807617, + "eval_runtime": 14.0251, + "eval_samples_per_second": 34.01, + "eval_steps_per_second": 8.556, + "step": 994 + }, + { + "epoch": 1.76, + "grad_norm": 0.43262025713920593, + "learning_rate": 0.000368715502982945, + "loss": 0.1302, + "step": 995 + }, + { + "epoch": 1.76, + "grad_norm": 0.10716990381479263, + "learning_rate": 0.00036781621706234816, + "loss": 0.133, + "step": 996 + }, + { + "epoch": 1.76, + "grad_norm": 0.0918804183602333, + "learning_rate": 0.0003669173906337846, + "loss": 0.1354, + "step": 997 + }, + { + "epoch": 1.77, + "grad_norm": 0.13421286642551422, + "learning_rate": 0.0003660190268217189, + "loss": 0.121, + "step": 998 + }, + { + "epoch": 1.77, + "grad_norm": 0.15904036164283752, + "learning_rate": 0.00036512112874900797, + "loss": 0.1341, + "step": 999 + }, + { + "epoch": 1.77, + "grad_norm": 0.3201177418231964, + "learning_rate": 0.00036422369953688973, + "loss": 0.1454, + "step": 1000 + }, + { + "epoch": 1.77, + "grad_norm": 0.3361368477344513, + "learning_rate": 0.0003633267423049717, + "loss": 0.152, + "step": 1001 + }, + { + "epoch": 1.77, + "grad_norm": 0.09073235839605331, + "learning_rate": 0.0003624302601712213, + "loss": 0.1331, + "step": 1002 + }, + { + "epoch": 1.77, + "grad_norm": 0.2717398405075073, + "learning_rate": 0.0003615342562519542, + "loss": 0.1373, + "step": 1003 + }, + { + "epoch": 1.78, + "grad_norm": 0.0666472539305687, + "learning_rate": 0.0003606387336618237, + "loss": 0.1496, + "step": 1004 + }, + { + "epoch": 1.78, + "grad_norm": 0.15450868010520935, + "learning_rate": 0.0003597436955138102, + "loss": 0.1464, + "step": 1005 + }, + { + "epoch": 1.78, + "grad_norm": 0.236429825425148, + "learning_rate": 0.0003588491449192096, + "loss": 0.1342, + "step": 1006 + }, + { + "epoch": 1.78, + "grad_norm": 0.06421036273241043, + "learning_rate": 0.0003579550849876233, + "loss": 0.1352, + "step": 1007 + }, + { + "epoch": 1.78, + "grad_norm": 0.07432877272367477, + "learning_rate": 0.00035706151882694727, + "loss": 0.131, + "step": 1008 + }, + { + "epoch": 1.79, + "grad_norm": 0.10126742720603943, + "learning_rate": 0.00035616844954336046, + "loss": 0.1471, + "step": 1009 + }, + { + "epoch": 1.79, + "grad_norm": 0.15761522948741913, + "learning_rate": 0.0003552758802413154, + "loss": 0.1358, + "step": 1010 + }, + { + "epoch": 1.79, + "grad_norm": 0.02369426190853119, + "learning_rate": 0.0003543838140235257, + "loss": 0.1296, + "step": 1011 + }, + { + "epoch": 1.79, + "grad_norm": 0.27005845308303833, + "learning_rate": 0.0003534922539909569, + "loss": 0.1412, + "step": 1012 + }, + { + "epoch": 1.79, + "grad_norm": 0.0638512596487999, + "learning_rate": 0.00035260120324281474, + "loss": 0.1366, + "step": 1013 + }, + { + "epoch": 1.79, + "grad_norm": 0.5134268403053284, + "learning_rate": 0.00035171066487653423, + "loss": 0.146, + "step": 1014 + }, + { + "epoch": 1.8, + "grad_norm": 0.2569257915019989, + "learning_rate": 0.00035082064198776997, + "loss": 0.1507, + "step": 1015 + }, + { + "epoch": 1.8, + "grad_norm": 0.16015255451202393, + "learning_rate": 0.0003499311376703842, + "loss": 0.1297, + "step": 1016 + }, + { + "epoch": 1.8, + "grad_norm": 0.22499139606952667, + "learning_rate": 0.0003490421550164364, + "loss": 0.1357, + "step": 1017 + }, + { + "epoch": 1.8, + "grad_norm": 0.5117542743682861, + "learning_rate": 0.0003481536971161732, + "loss": 0.1418, + "step": 1018 + }, + { + "epoch": 1.8, + "grad_norm": 0.27242857217788696, + "learning_rate": 0.00034726576705801636, + "loss": 0.1358, + "step": 1019 + }, + { + "epoch": 1.8, + "grad_norm": 0.3779907822608948, + "learning_rate": 0.0003463783679285535, + "loss": 0.1512, + "step": 1020 + }, + { + "epoch": 1.81, + "grad_norm": 0.2148500680923462, + "learning_rate": 0.00034549150281252633, + "loss": 0.1445, + "step": 1021 + }, + { + "epoch": 1.81, + "grad_norm": 0.08610748499631882, + "learning_rate": 0.0003446051747928202, + "loss": 0.1333, + "step": 1022 + }, + { + "epoch": 1.81, + "grad_norm": 0.07877200841903687, + "learning_rate": 0.0003437193869504535, + "loss": 0.1333, + "step": 1023 + }, + { + "epoch": 1.81, + "grad_norm": 0.1410919725894928, + "learning_rate": 0.0003428341423645668, + "loss": 0.1303, + "step": 1024 + }, + { + "epoch": 1.81, + "grad_norm": 0.1181708350777626, + "learning_rate": 0.00034194944411241213, + "loss": 0.1234, + "step": 1025 + }, + { + "epoch": 1.82, + "grad_norm": 0.13002073764801025, + "learning_rate": 0.00034106529526934303, + "loss": 0.1405, + "step": 1026 + }, + { + "epoch": 1.82, + "grad_norm": 0.14099909365177155, + "learning_rate": 0.00034018169890880225, + "loss": 0.1342, + "step": 1027 + }, + { + "epoch": 1.82, + "grad_norm": 0.24455852806568146, + "learning_rate": 0.00033929865810231264, + "loss": 0.1482, + "step": 1028 + }, + { + "epoch": 1.82, + "grad_norm": 0.23751892149448395, + "learning_rate": 0.0003384161759194658, + "loss": 0.1509, + "step": 1029 + }, + { + "epoch": 1.82, + "grad_norm": 0.40307796001434326, + "learning_rate": 0.00033753425542791104, + "loss": 0.1542, + "step": 1030 + }, + { + "epoch": 1.82, + "grad_norm": 0.123422771692276, + "learning_rate": 0.0003366528996933458, + "loss": 0.129, + "step": 1031 + }, + { + "epoch": 1.83, + "grad_norm": 0.15652324259281158, + "learning_rate": 0.00033577211177950386, + "loss": 0.1277, + "step": 1032 + }, + { + "epoch": 1.83, + "grad_norm": 0.2987327575683594, + "learning_rate": 0.0003348918947481452, + "loss": 0.1395, + "step": 1033 + }, + { + "epoch": 1.83, + "grad_norm": 0.2202194184064865, + "learning_rate": 0.00033401225165904556, + "loss": 0.1287, + "step": 1034 + }, + { + "epoch": 1.83, + "grad_norm": 0.10783470422029495, + "learning_rate": 0.0003331331855699852, + "loss": 0.1423, + "step": 1035 + }, + { + "epoch": 1.83, + "grad_norm": 0.26680612564086914, + "learning_rate": 0.0003322546995367394, + "loss": 0.1307, + "step": 1036 + }, + { + "epoch": 1.83, + "grad_norm": 0.14138604700565338, + "learning_rate": 0.00033137679661306575, + "loss": 0.1273, + "step": 1037 + }, + { + "epoch": 1.84, + "grad_norm": 0.2755129039287567, + "learning_rate": 0.0003304994798506962, + "loss": 0.1485, + "step": 1038 + }, + { + "epoch": 1.84, + "grad_norm": 0.15637388825416565, + "learning_rate": 0.00032962275229932446, + "loss": 0.1233, + "step": 1039 + }, + { + "epoch": 1.84, + "grad_norm": 0.10934972018003464, + "learning_rate": 0.00032874661700659587, + "loss": 0.1438, + "step": 1040 + }, + { + "epoch": 1.84, + "grad_norm": 0.16785453259944916, + "learning_rate": 0.00032787107701809755, + "loss": 0.1257, + "step": 1041 + }, + { + "epoch": 1.84, + "grad_norm": 0.2813141644001007, + "learning_rate": 0.0003269961353773469, + "loss": 0.1594, + "step": 1042 + }, + { + "epoch": 1.85, + "grad_norm": 0.321236789226532, + "learning_rate": 0.00032612179512578126, + "loss": 0.1476, + "step": 1043 + }, + { + "epoch": 1.85, + "grad_norm": 0.35957199335098267, + "learning_rate": 0.0003252480593027478, + "loss": 0.1614, + "step": 1044 + }, + { + "epoch": 1.85, + "grad_norm": 0.2529314160346985, + "learning_rate": 0.0003243749309454922, + "loss": 0.1436, + "step": 1045 + }, + { + "epoch": 1.85, + "grad_norm": 0.36090412735939026, + "learning_rate": 0.00032350241308914864, + "loss": 0.1578, + "step": 1046 + }, + { + "epoch": 1.85, + "grad_norm": 0.47541487216949463, + "learning_rate": 0.0003226305087667295, + "loss": 0.1247, + "step": 1047 + }, + { + "epoch": 1.85, + "grad_norm": 0.23453806340694427, + "learning_rate": 0.0003217592210091137, + "loss": 0.1435, + "step": 1048 + }, + { + "epoch": 1.86, + "grad_norm": 0.12492989748716354, + "learning_rate": 0.0003208885528450376, + "loss": 0.1229, + "step": 1049 + }, + { + "epoch": 1.86, + "grad_norm": 0.19712020456790924, + "learning_rate": 0.00032001850730108307, + "loss": 0.1292, + "step": 1050 + }, + { + "epoch": 1.86, + "grad_norm": 0.09731408208608627, + "learning_rate": 0.00031914908740166795, + "loss": 0.1333, + "step": 1051 + }, + { + "epoch": 1.86, + "grad_norm": 0.06944354623556137, + "learning_rate": 0.0003182802961690357, + "loss": 0.1292, + "step": 1052 + }, + { + "epoch": 1.86, + "grad_norm": 0.07448045909404755, + "learning_rate": 0.00031741213662324363, + "loss": 0.1349, + "step": 1053 + }, + { + "epoch": 1.86, + "grad_norm": 0.28523683547973633, + "learning_rate": 0.0003165446117821538, + "loss": 0.1452, + "step": 1054 + }, + { + "epoch": 1.87, + "grad_norm": 0.09108186513185501, + "learning_rate": 0.0003156777246614215, + "loss": 0.1361, + "step": 1055 + }, + { + "epoch": 1.87, + "grad_norm": 0.13375020027160645, + "learning_rate": 0.0003148114782744855, + "loss": 0.1381, + "step": 1056 + }, + { + "epoch": 1.87, + "grad_norm": 0.06716307252645493, + "learning_rate": 0.00031394587563255755, + "loss": 0.1383, + "step": 1057 + }, + { + "epoch": 1.87, + "grad_norm": 0.11596639454364777, + "learning_rate": 0.00031308091974461064, + "loss": 0.1313, + "step": 1058 + }, + { + "epoch": 1.87, + "grad_norm": 0.39337942004203796, + "learning_rate": 0.00031221661361737065, + "loss": 0.1359, + "step": 1059 + }, + { + "epoch": 1.88, + "grad_norm": 0.07525162398815155, + "learning_rate": 0.00031135296025530424, + "loss": 0.1326, + "step": 1060 + }, + { + "epoch": 1.88, + "grad_norm": 0.020530417561531067, + "learning_rate": 0.0003104899626606088, + "loss": 0.1368, + "step": 1061 + }, + { + "epoch": 1.88, + "grad_norm": 0.17400570213794708, + "learning_rate": 0.00030962762383320285, + "loss": 0.1309, + "step": 1062 + }, + { + "epoch": 1.88, + "grad_norm": 0.1574063003063202, + "learning_rate": 0.00030876594677071404, + "loss": 0.1365, + "step": 1063 + }, + { + "epoch": 1.88, + "grad_norm": 0.21009187400341034, + "learning_rate": 0.0003079049344684702, + "loss": 0.1382, + "step": 1064 + }, + { + "epoch": 1.88, + "grad_norm": 0.047014713287353516, + "learning_rate": 0.00030704458991948844, + "loss": 0.1327, + "step": 1065 + }, + { + "epoch": 1.89, + "grad_norm": 0.11032029241323471, + "learning_rate": 0.0003061849161144641, + "loss": 0.1252, + "step": 1066 + }, + { + "epoch": 1.89, + "grad_norm": 0.08818018436431885, + "learning_rate": 0.0003053259160417613, + "loss": 0.1495, + "step": 1067 + }, + { + "epoch": 1.89, + "grad_norm": 0.08112979680299759, + "learning_rate": 0.0003044675926874023, + "loss": 0.1408, + "step": 1068 + }, + { + "epoch": 1.89, + "grad_norm": 0.11870189756155014, + "learning_rate": 0.00030360994903505653, + "loss": 0.1342, + "step": 1069 + }, + { + "epoch": 1.89, + "grad_norm": 0.16511432826519012, + "learning_rate": 0.000302752988066031, + "loss": 0.148, + "step": 1070 + }, + { + "epoch": 1.89, + "grad_norm": 0.31104427576065063, + "learning_rate": 0.0003018967127592595, + "loss": 0.15, + "step": 1071 + }, + { + "epoch": 1.9, + "grad_norm": 0.1434139758348465, + "learning_rate": 0.0003010411260912922, + "loss": 0.142, + "step": 1072 + }, + { + "epoch": 1.9, + "grad_norm": 0.29530733823776245, + "learning_rate": 0.00030018623103628594, + "loss": 0.1284, + "step": 1073 + }, + { + "epoch": 1.9, + "grad_norm": 0.06294587254524231, + "learning_rate": 0.00029933203056599274, + "loss": 0.1379, + "step": 1074 + }, + { + "epoch": 1.9, + "grad_norm": 0.0581539049744606, + "learning_rate": 0.0002984785276497507, + "loss": 0.1278, + "step": 1075 + }, + { + "epoch": 1.9, + "grad_norm": 0.09157228469848633, + "learning_rate": 0.0002976257252544726, + "loss": 0.1348, + "step": 1076 + }, + { + "epoch": 1.91, + "grad_norm": 0.10196894407272339, + "learning_rate": 0.00029677362634463643, + "loss": 0.1386, + "step": 1077 + }, + { + "epoch": 1.91, + "grad_norm": 0.1152050793170929, + "learning_rate": 0.00029592223388227504, + "loss": 0.1337, + "step": 1078 + }, + { + "epoch": 1.91, + "grad_norm": 0.34997934103012085, + "learning_rate": 0.0002950715508269648, + "loss": 0.1346, + "step": 1079 + }, + { + "epoch": 1.91, + "grad_norm": 0.21548382937908173, + "learning_rate": 0.00029422158013581656, + "loss": 0.1309, + "step": 1080 + }, + { + "epoch": 1.91, + "grad_norm": 0.3693360686302185, + "learning_rate": 0.000293372324763465, + "loss": 0.1587, + "step": 1081 + }, + { + "epoch": 1.91, + "grad_norm": 0.2655669152736664, + "learning_rate": 0.0002925237876620576, + "loss": 0.1285, + "step": 1082 + }, + { + "epoch": 1.92, + "grad_norm": 0.08570755273103714, + "learning_rate": 0.00029167597178124583, + "loss": 0.1359, + "step": 1083 + }, + { + "epoch": 1.92, + "grad_norm": 0.2622168958187103, + "learning_rate": 0.00029082888006817364, + "loss": 0.1315, + "step": 1084 + }, + { + "epoch": 1.92, + "grad_norm": 0.020985718816518784, + "learning_rate": 0.0002899825154674674, + "loss": 0.1308, + "step": 1085 + }, + { + "epoch": 1.92, + "grad_norm": 0.15370753407478333, + "learning_rate": 0.00028913688092122665, + "loss": 0.1269, + "step": 1086 + }, + { + "epoch": 1.92, + "grad_norm": 0.1030414029955864, + "learning_rate": 0.0002882919793690123, + "loss": 0.1143, + "step": 1087 + }, + { + "epoch": 1.92, + "grad_norm": 0.31633636355400085, + "learning_rate": 0.00028744781374783813, + "loss": 0.1563, + "step": 1088 + }, + { + "epoch": 1.93, + "grad_norm": 0.03694160282611847, + "learning_rate": 0.00028660438699215895, + "loss": 0.1276, + "step": 1089 + }, + { + "epoch": 1.93, + "grad_norm": 0.14471565186977386, + "learning_rate": 0.0002857617020338614, + "loss": 0.137, + "step": 1090 + }, + { + "epoch": 1.93, + "grad_norm": 0.08222481608390808, + "learning_rate": 0.0002849197618022539, + "loss": 0.1279, + "step": 1091 + }, + { + "epoch": 1.93, + "grad_norm": 0.11603690683841705, + "learning_rate": 0.00028407856922405526, + "loss": 0.138, + "step": 1092 + }, + { + "epoch": 1.93, + "grad_norm": 0.18144792318344116, + "learning_rate": 0.0002832381272233864, + "loss": 0.1481, + "step": 1093 + }, + { + "epoch": 1.94, + "grad_norm": 0.05054265260696411, + "learning_rate": 0.00028239843872175814, + "loss": 0.1363, + "step": 1094 + }, + { + "epoch": 1.94, + "grad_norm": 2.8396074771881104, + "learning_rate": 0.00028155950663806236, + "loss": 0.1345, + "step": 1095 + }, + { + "epoch": 1.94, + "grad_norm": 0.30984073877334595, + "learning_rate": 0.0002807213338885619, + "loss": 0.1338, + "step": 1096 + }, + { + "epoch": 1.94, + "grad_norm": 0.34585434198379517, + "learning_rate": 0.00027988392338687925, + "loss": 0.1389, + "step": 1097 + }, + { + "epoch": 1.94, + "grad_norm": 0.0312834158539772, + "learning_rate": 0.0002790472780439881, + "loss": 0.1338, + "step": 1098 + }, + { + "epoch": 1.94, + "grad_norm": 0.5297988057136536, + "learning_rate": 0.0002782114007682016, + "loss": 0.1321, + "step": 1099 + }, + { + "epoch": 1.95, + "grad_norm": 0.10174748301506042, + "learning_rate": 0.0002773762944651632, + "loss": 0.1202, + "step": 1100 + }, + { + "epoch": 1.95, + "grad_norm": 0.0418776273727417, + "learning_rate": 0.0002765419620378366, + "loss": 0.1257, + "step": 1101 + }, + { + "epoch": 1.95, + "grad_norm": 0.3508782386779785, + "learning_rate": 0.0002757084063864949, + "loss": 0.1612, + "step": 1102 + }, + { + "epoch": 1.95, + "grad_norm": 0.2867041230201721, + "learning_rate": 0.00027487563040871145, + "loss": 0.1574, + "step": 1103 + }, + { + "epoch": 1.95, + "grad_norm": 0.24160172045230865, + "learning_rate": 0.00027404363699934907, + "loss": 0.1175, + "step": 1104 + }, + { + "epoch": 1.95, + "grad_norm": 0.03382538631558418, + "learning_rate": 0.0002732124290505501, + "loss": 0.1363, + "step": 1105 + }, + { + "epoch": 1.96, + "grad_norm": 0.05089818313717842, + "learning_rate": 0.000272382009451727, + "loss": 0.1384, + "step": 1106 + }, + { + "epoch": 1.96, + "grad_norm": 0.08688928186893463, + "learning_rate": 0.0002715523810895515, + "loss": 0.1415, + "step": 1107 + }, + { + "epoch": 1.96, + "grad_norm": 0.03926026448607445, + "learning_rate": 0.00027072354684794486, + "loss": 0.1357, + "step": 1108 + }, + { + "epoch": 1.96, + "grad_norm": 0.05058757960796356, + "learning_rate": 0.0002698955096080677, + "loss": 0.1356, + "step": 1109 + }, + { + "epoch": 1.96, + "grad_norm": 0.0489107221364975, + "learning_rate": 0.00026906827224831023, + "loss": 0.1394, + "step": 1110 + }, + { + "epoch": 1.97, + "grad_norm": 0.09765997529029846, + "learning_rate": 0.00026824183764428223, + "loss": 0.1388, + "step": 1111 + }, + { + "epoch": 1.97, + "grad_norm": 0.06232646107673645, + "learning_rate": 0.00026741620866880335, + "loss": 0.1322, + "step": 1112 + }, + { + "epoch": 1.97, + "grad_norm": 0.08469201624393463, + "learning_rate": 0.0002665913881918921, + "loss": 0.1369, + "step": 1113 + }, + { + "epoch": 1.97, + "grad_norm": 0.08312228322029114, + "learning_rate": 0.00026576737908075667, + "loss": 0.137, + "step": 1114 + }, + { + "epoch": 1.97, + "grad_norm": 0.13206493854522705, + "learning_rate": 0.00026494418419978485, + "loss": 0.127, + "step": 1115 + }, + { + "epoch": 1.97, + "grad_norm": 0.050156012177467346, + "learning_rate": 0.0002641218064105341, + "loss": 0.1304, + "step": 1116 + }, + { + "epoch": 1.98, + "grad_norm": 0.22207175195217133, + "learning_rate": 0.0002633002485717219, + "loss": 0.1406, + "step": 1117 + }, + { + "epoch": 1.98, + "grad_norm": 0.25118494033813477, + "learning_rate": 0.0002624795135392148, + "loss": 0.1373, + "step": 1118 + }, + { + "epoch": 1.98, + "grad_norm": 0.09831628948450089, + "learning_rate": 0.00026165960416601943, + "loss": 0.1459, + "step": 1119 + }, + { + "epoch": 1.98, + "grad_norm": 0.037262722849845886, + "learning_rate": 0.00026084052330227237, + "loss": 0.1314, + "step": 1120 + }, + { + "epoch": 1.98, + "grad_norm": 0.05748564377427101, + "learning_rate": 0.0002600222737952299, + "loss": 0.1439, + "step": 1121 + }, + { + "epoch": 1.98, + "grad_norm": 0.06702205538749695, + "learning_rate": 0.00025920485848925914, + "loss": 0.1265, + "step": 1122 + }, + { + "epoch": 1.99, + "grad_norm": 0.11122670769691467, + "learning_rate": 0.00025838828022582596, + "loss": 0.1275, + "step": 1123 + }, + { + "epoch": 1.99, + "grad_norm": 0.1843162626028061, + "learning_rate": 0.0002575725418434878, + "loss": 0.1419, + "step": 1124 + }, + { + "epoch": 1.99, + "grad_norm": 0.11396101117134094, + "learning_rate": 0.00025675764617788234, + "loss": 0.1342, + "step": 1125 + }, + { + "epoch": 1.99, + "grad_norm": 0.043184638023376465, + "learning_rate": 0.00025594359606171725, + "loss": 0.1295, + "step": 1126 + }, + { + "epoch": 1.99, + "grad_norm": 0.15845736861228943, + "learning_rate": 0.0002551303943247619, + "loss": 0.1429, + "step": 1127 + }, + { + "epoch": 2.0, + "grad_norm": 0.1975352019071579, + "learning_rate": 0.0002543180437938352, + "loss": 0.1153, + "step": 1128 + }, + { + "epoch": 2.0, + "grad_norm": 0.13739655911922455, + "learning_rate": 0.00025350654729279834, + "loss": 0.1347, + "step": 1129 + }, + { + "epoch": 2.0, + "grad_norm": 0.2761506140232086, + "learning_rate": 0.0002526959076425434, + "loss": 0.147, + "step": 1130 + } + ], + "logging_steps": 1, + "max_steps": 1695, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 565, + "total_flos": 1.0339891388035891e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}