{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9986731534719153, "eval_steps": 142, "global_step": 1695, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.6921162605285645, "learning_rate": 5e-05, "loss": 3.3182, "step": 1 }, { "epoch": 0.0, "eval_loss": 3.3362529277801514, "eval_runtime": 14.4412, "eval_samples_per_second": 33.031, "eval_steps_per_second": 8.31, "step": 1 }, { "epoch": 0.0, "grad_norm": 4.620742321014404, "learning_rate": 0.0001, "loss": 3.2788, "step": 2 }, { "epoch": 0.01, "grad_norm": 4.650161266326904, "learning_rate": 0.00015, "loss": 3.2271, "step": 3 }, { "epoch": 0.01, "grad_norm": 4.024933815002441, "learning_rate": 0.0002, "loss": 2.402, "step": 4 }, { "epoch": 0.01, "grad_norm": 2.751981496810913, "learning_rate": 0.00025, "loss": 1.0544, "step": 5 }, { "epoch": 0.01, "grad_norm": 1.4766970872879028, "learning_rate": 0.0003, "loss": 0.3549, "step": 6 }, { "epoch": 0.01, "grad_norm": 0.8064658641815186, "learning_rate": 0.00035, "loss": 0.1533, "step": 7 }, { "epoch": 0.01, "grad_norm": 2.232205390930176, "learning_rate": 0.0004, "loss": 0.31, "step": 8 }, { "epoch": 0.02, "grad_norm": 1.1898847818374634, "learning_rate": 0.00045000000000000004, "loss": 0.1818, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.7394833564758301, "learning_rate": 0.0005, "loss": 0.1751, "step": 10 }, { "epoch": 0.02, "grad_norm": 0.16317571699619293, "learning_rate": 0.0004999995654799487, "loss": 0.1411, "step": 11 }, { "epoch": 0.02, "grad_norm": 0.10235322266817093, "learning_rate": 0.0004999982619213052, "loss": 0.1363, "step": 12 }, { "epoch": 0.02, "grad_norm": 0.19907887279987335, "learning_rate": 0.0004999960893286008, "loss": 0.128, "step": 13 }, { "epoch": 0.02, "grad_norm": 0.6823816299438477, "learning_rate": 0.0004999930477093878, "loss": 0.143, "step": 14 }, { "epoch": 0.03, "grad_norm": 0.10187644511461258, "learning_rate": 0.0004999891370742394, "loss": 0.1322, "step": 15 }, { "epoch": 0.03, "grad_norm": 0.09401004016399384, "learning_rate": 0.0004999843574367498, "loss": 0.1361, "step": 16 }, { "epoch": 0.03, "grad_norm": 0.17946797609329224, "learning_rate": 0.0004999787088135334, "loss": 0.1412, "step": 17 }, { "epoch": 0.03, "grad_norm": 0.890545666217804, "learning_rate": 0.0004999721912242259, "loss": 0.1593, "step": 18 }, { "epoch": 0.03, "grad_norm": 0.434042751789093, "learning_rate": 0.0004999648046914836, "loss": 0.1548, "step": 19 }, { "epoch": 0.04, "grad_norm": 0.28103551268577576, "learning_rate": 0.0004999565492409831, "loss": 0.1459, "step": 20 }, { "epoch": 0.04, "grad_norm": 0.2690610885620117, "learning_rate": 0.0004999474249014217, "loss": 0.1248, "step": 21 }, { "epoch": 0.04, "grad_norm": 0.37668731808662415, "learning_rate": 0.0004999374317045172, "loss": 0.1481, "step": 22 }, { "epoch": 0.04, "grad_norm": 0.23762015998363495, "learning_rate": 0.0004999265696850074, "loss": 0.1407, "step": 23 }, { "epoch": 0.04, "grad_norm": 0.1988176554441452, "learning_rate": 0.0004999148388806504, "loss": 0.1398, "step": 24 }, { "epoch": 0.04, "grad_norm": 0.3805619776248932, "learning_rate": 0.0004999022393322246, "loss": 0.1474, "step": 25 }, { "epoch": 0.05, "grad_norm": 0.5069771409034729, "learning_rate": 0.0004998887710835278, "loss": 0.1509, "step": 26 }, { "epoch": 0.05, "grad_norm": 0.42066043615341187, "learning_rate": 0.0004998744341813779, "loss": 0.1341, "step": 27 }, { "epoch": 0.05, "grad_norm": 0.0944904088973999, "learning_rate": 0.0004998592286756123, "loss": 0.1233, "step": 28 }, { "epoch": 0.05, "grad_norm": 0.849244236946106, "learning_rate": 0.0004998431546190875, "loss": 0.1999, "step": 29 }, { "epoch": 0.05, "grad_norm": 0.09785456210374832, "learning_rate": 0.00049982621206768, "loss": 0.1272, "step": 30 }, { "epoch": 0.05, "grad_norm": 0.38225457072257996, "learning_rate": 0.0004998084010802845, "loss": 0.1634, "step": 31 }, { "epoch": 0.06, "grad_norm": 0.08135183900594711, "learning_rate": 0.0004997897217188149, "loss": 0.1383, "step": 32 }, { "epoch": 0.06, "grad_norm": 0.17299437522888184, "learning_rate": 0.0004997701740482036, "loss": 0.1427, "step": 33 }, { "epoch": 0.06, "grad_norm": 0.11125747114419937, "learning_rate": 0.0004997497581364015, "loss": 0.1379, "step": 34 }, { "epoch": 0.06, "grad_norm": 0.08914893865585327, "learning_rate": 0.0004997284740543776, "loss": 0.1388, "step": 35 }, { "epoch": 0.06, "grad_norm": 0.034590039402246475, "learning_rate": 0.0004997063218761188, "loss": 0.1387, "step": 36 }, { "epoch": 0.07, "grad_norm": 0.08675777167081833, "learning_rate": 0.0004996833016786296, "loss": 0.1384, "step": 37 }, { "epoch": 0.07, "grad_norm": 0.4440009295940399, "learning_rate": 0.0004996594135419318, "loss": 0.152, "step": 38 }, { "epoch": 0.07, "grad_norm": 0.0814109519124031, "learning_rate": 0.0004996346575490646, "loss": 0.1373, "step": 39 }, { "epoch": 0.07, "grad_norm": 0.37724560499191284, "learning_rate": 0.0004996090337860836, "loss": 0.1362, "step": 40 }, { "epoch": 0.07, "grad_norm": 0.21177273988723755, "learning_rate": 0.0004995825423420613, "loss": 0.1423, "step": 41 }, { "epoch": 0.07, "grad_norm": 0.12168041616678238, "learning_rate": 0.000499555183309086, "loss": 0.1381, "step": 42 }, { "epoch": 0.08, "grad_norm": 0.21096466481685638, "learning_rate": 0.0004995269567822623, "loss": 0.139, "step": 43 }, { "epoch": 0.08, "grad_norm": 0.49395841360092163, "learning_rate": 0.0004994978628597099, "loss": 0.1016, "step": 44 }, { "epoch": 0.08, "grad_norm": 0.1108216792345047, "learning_rate": 0.0004994679016425642, "loss": 0.1334, "step": 45 }, { "epoch": 0.08, "grad_norm": 0.5518127679824829, "learning_rate": 0.000499437073234975, "loss": 0.1568, "step": 46 }, { "epoch": 0.08, "grad_norm": 0.4762812852859497, "learning_rate": 0.0004994053777441069, "loss": 0.1543, "step": 47 }, { "epoch": 0.08, "grad_norm": 0.3477722108364105, "learning_rate": 0.0004993728152801384, "loss": 0.1464, "step": 48 }, { "epoch": 0.09, "grad_norm": 0.4996407628059387, "learning_rate": 0.0004993393859562621, "loss": 0.154, "step": 49 }, { "epoch": 0.09, "grad_norm": 0.20425601303577423, "learning_rate": 0.0004993050898886833, "loss": 0.1372, "step": 50 }, { "epoch": 0.09, "grad_norm": 0.034631408751010895, "learning_rate": 0.000499269927196621, "loss": 0.137, "step": 51 }, { "epoch": 0.09, "grad_norm": 0.24027873575687408, "learning_rate": 0.0004992338980023062, "loss": 0.1468, "step": 52 }, { "epoch": 0.09, "grad_norm": 0.22242723405361176, "learning_rate": 0.000499197002430982, "loss": 0.1418, "step": 53 }, { "epoch": 0.1, "grad_norm": 0.6540514826774597, "learning_rate": 0.0004991592406109036, "loss": 0.1564, "step": 54 }, { "epoch": 0.1, "grad_norm": 0.030118577182292938, "learning_rate": 0.000499120612673337, "loss": 0.1365, "step": 55 }, { "epoch": 0.1, "grad_norm": 0.07544097304344177, "learning_rate": 0.0004990811187525592, "loss": 0.1334, "step": 56 }, { "epoch": 0.1, "grad_norm": 0.37415480613708496, "learning_rate": 0.0004990407589858572, "loss": 0.155, "step": 57 }, { "epoch": 0.1, "grad_norm": 0.557809054851532, "learning_rate": 0.0004989995335135282, "loss": 0.1603, "step": 58 }, { "epoch": 0.1, "grad_norm": 0.14802873134613037, "learning_rate": 0.0004989574424788787, "loss": 0.1387, "step": 59 }, { "epoch": 0.11, "grad_norm": 0.3581993281841278, "learning_rate": 0.0004989144860282236, "loss": 0.1475, "step": 60 }, { "epoch": 0.11, "grad_norm": 0.04818522185087204, "learning_rate": 0.0004988706643108864, "loss": 0.1362, "step": 61 }, { "epoch": 0.11, "grad_norm": 0.21908174455165863, "learning_rate": 0.0004988259774791987, "loss": 0.1386, "step": 62 }, { "epoch": 0.11, "grad_norm": 0.1852695643901825, "learning_rate": 0.0004987804256884988, "loss": 0.1387, "step": 63 }, { "epoch": 0.11, "grad_norm": 0.025747304782271385, "learning_rate": 0.0004987340090971323, "loss": 0.1393, "step": 64 }, { "epoch": 0.11, "grad_norm": 0.045346710830926895, "learning_rate": 0.0004986867278664504, "loss": 0.1354, "step": 65 }, { "epoch": 0.12, "grad_norm": 0.34946465492248535, "learning_rate": 0.0004986385821608106, "loss": 0.152, "step": 66 }, { "epoch": 0.12, "grad_norm": 0.2552882432937622, "learning_rate": 0.0004985895721475748, "loss": 0.1463, "step": 67 }, { "epoch": 0.12, "grad_norm": 0.0560542456805706, "learning_rate": 0.0004985396979971099, "loss": 0.1391, "step": 68 }, { "epoch": 0.12, "grad_norm": 0.14347511529922485, "learning_rate": 0.0004984889598827863, "loss": 0.1353, "step": 69 }, { "epoch": 0.12, "grad_norm": 0.12386342883110046, "learning_rate": 0.0004984373579809778, "loss": 0.1343, "step": 70 }, { "epoch": 0.13, "grad_norm": 0.03070697747170925, "learning_rate": 0.000498384892471061, "loss": 0.1356, "step": 71 }, { "epoch": 0.13, "grad_norm": 0.0531514473259449, "learning_rate": 0.0004983315635354144, "loss": 0.1346, "step": 72 }, { "epoch": 0.13, "grad_norm": 0.24197503924369812, "learning_rate": 0.0004982773713594178, "loss": 0.1217, "step": 73 }, { "epoch": 0.13, "grad_norm": 0.08417380601167679, "learning_rate": 0.0004982223161314522, "loss": 0.1223, "step": 74 }, { "epoch": 0.13, "grad_norm": 0.40045711398124695, "learning_rate": 0.000498166398042898, "loss": 0.1513, "step": 75 }, { "epoch": 0.13, "grad_norm": 0.12452740222215652, "learning_rate": 0.0004981096172881358, "loss": 0.1296, "step": 76 }, { "epoch": 0.14, "grad_norm": 0.21590262651443481, "learning_rate": 0.0004980519740645444, "loss": 0.1375, "step": 77 }, { "epoch": 0.14, "grad_norm": 0.07704459875822067, "learning_rate": 0.0004979934685725011, "loss": 0.1299, "step": 78 }, { "epoch": 0.14, "grad_norm": 0.28334081172943115, "learning_rate": 0.0004979341010153801, "loss": 0.1387, "step": 79 }, { "epoch": 0.14, "grad_norm": 0.12374007701873779, "learning_rate": 0.0004978738715995526, "loss": 0.1383, "step": 80 }, { "epoch": 0.14, "grad_norm": 0.040613267570734024, "learning_rate": 0.000497812780534386, "loss": 0.1367, "step": 81 }, { "epoch": 0.15, "grad_norm": 0.09974126517772675, "learning_rate": 0.0004977508280322423, "loss": 0.1248, "step": 82 }, { "epoch": 0.15, "grad_norm": 0.2616259753704071, "learning_rate": 0.0004976880143084786, "loss": 0.1311, "step": 83 }, { "epoch": 0.15, "grad_norm": 0.15635579824447632, "learning_rate": 0.0004976243395814452, "loss": 0.1189, "step": 84 }, { "epoch": 0.15, "grad_norm": 0.259250670671463, "learning_rate": 0.000497559804072486, "loss": 0.1099, "step": 85 }, { "epoch": 0.15, "grad_norm": 1.2778699398040771, "learning_rate": 0.0004974944080059365, "loss": 0.1416, "step": 86 }, { "epoch": 0.15, "grad_norm": 0.2155281901359558, "learning_rate": 0.000497428151609124, "loss": 0.1253, "step": 87 }, { "epoch": 0.16, "grad_norm": 0.17533721029758453, "learning_rate": 0.0004973610351123664, "loss": 0.1446, "step": 88 }, { "epoch": 0.16, "grad_norm": 0.07620590180158615, "learning_rate": 0.0004972930587489715, "loss": 0.1309, "step": 89 }, { "epoch": 0.16, "grad_norm": 0.2370779663324356, "learning_rate": 0.0004972242227552358, "loss": 0.149, "step": 90 }, { "epoch": 0.16, "grad_norm": 0.06374065577983856, "learning_rate": 0.0004971545273704444, "loss": 0.1307, "step": 91 }, { "epoch": 0.16, "grad_norm": 0.22728750109672546, "learning_rate": 0.0004970839728368697, "loss": 0.1438, "step": 92 }, { "epoch": 0.16, "grad_norm": 0.16872233152389526, "learning_rate": 0.0004970125593997706, "loss": 0.1364, "step": 93 }, { "epoch": 0.17, "grad_norm": 0.18773947656154633, "learning_rate": 0.0004969402873073914, "loss": 0.146, "step": 94 }, { "epoch": 0.17, "grad_norm": 0.1468167006969452, "learning_rate": 0.0004968671568109616, "loss": 0.1401, "step": 95 }, { "epoch": 0.17, "grad_norm": 0.09030504524707794, "learning_rate": 0.0004967931681646948, "loss": 0.1318, "step": 96 }, { "epoch": 0.17, "grad_norm": 0.061796192079782486, "learning_rate": 0.000496718321625787, "loss": 0.1244, "step": 97 }, { "epoch": 0.17, "grad_norm": 0.045495226979255676, "learning_rate": 0.0004966426174544171, "loss": 0.1265, "step": 98 }, { "epoch": 0.18, "grad_norm": 0.08449025452136993, "learning_rate": 0.0004965660559137448, "loss": 0.1276, "step": 99 }, { "epoch": 0.18, "grad_norm": 0.09982559829950333, "learning_rate": 0.0004964886372699102, "loss": 0.1253, "step": 100 }, { "epoch": 0.18, "grad_norm": 0.05831208825111389, "learning_rate": 0.0004964103617920332, "loss": 0.1271, "step": 101 }, { "epoch": 0.18, "grad_norm": 0.20548835396766663, "learning_rate": 0.0004963312297522116, "loss": 0.1415, "step": 102 }, { "epoch": 0.18, "grad_norm": 0.09664470702409744, "learning_rate": 0.0004962512414255214, "loss": 0.1083, "step": 103 }, { "epoch": 0.18, "grad_norm": 0.16931602358818054, "learning_rate": 0.0004961703970900145, "loss": 0.1431, "step": 104 }, { "epoch": 0.19, "grad_norm": 0.10859667509794235, "learning_rate": 0.000496088697026719, "loss": 0.12, "step": 105 }, { "epoch": 0.19, "grad_norm": 0.21958191692829132, "learning_rate": 0.0004960061415196374, "loss": 0.1492, "step": 106 }, { "epoch": 0.19, "grad_norm": 0.06437578052282333, "learning_rate": 0.0004959227308557459, "loss": 0.1083, "step": 107 }, { "epoch": 0.19, "grad_norm": 0.14975550770759583, "learning_rate": 0.0004958384653249932, "loss": 0.1155, "step": 108 }, { "epoch": 0.19, "grad_norm": 0.11868852376937866, "learning_rate": 0.0004957533452203, "loss": 0.1237, "step": 109 }, { "epoch": 0.19, "grad_norm": 0.2610260546207428, "learning_rate": 0.0004956673708375574, "loss": 0.1264, "step": 110 }, { "epoch": 0.2, "grad_norm": 0.378467321395874, "learning_rate": 0.000495580542475626, "loss": 0.1222, "step": 111 }, { "epoch": 0.2, "grad_norm": 0.0926096960902214, "learning_rate": 0.0004954928604363352, "loss": 0.1096, "step": 112 }, { "epoch": 0.2, "grad_norm": 0.06858692318201065, "learning_rate": 0.0004954043250244819, "loss": 0.1144, "step": 113 }, { "epoch": 0.2, "grad_norm": 0.3068992495536804, "learning_rate": 0.0004953149365478293, "loss": 0.1563, "step": 114 }, { "epoch": 0.2, "grad_norm": 0.15458936989307404, "learning_rate": 0.0004952246953171061, "loss": 0.1216, "step": 115 }, { "epoch": 0.21, "grad_norm": 0.10287577658891678, "learning_rate": 0.0004951336016460053, "loss": 0.0893, "step": 116 }, { "epoch": 0.21, "grad_norm": 0.1215134710073471, "learning_rate": 0.0004950416558511833, "loss": 0.1016, "step": 117 }, { "epoch": 0.21, "grad_norm": 0.1392650604248047, "learning_rate": 0.000494948858252258, "loss": 0.1111, "step": 118 }, { "epoch": 0.21, "grad_norm": 0.4350431263446808, "learning_rate": 0.0004948552091718092, "loss": 0.1192, "step": 119 }, { "epoch": 0.21, "grad_norm": 0.21448662877082825, "learning_rate": 0.0004947607089353758, "loss": 0.07, "step": 120 }, { "epoch": 0.21, "grad_norm": 1.6086686849594116, "learning_rate": 0.0004946653578714559, "loss": 0.1352, "step": 121 }, { "epoch": 0.22, "grad_norm": 0.25963085889816284, "learning_rate": 0.0004945691563115051, "loss": 0.1447, "step": 122 }, { "epoch": 0.22, "grad_norm": 0.11575956642627716, "learning_rate": 0.0004944721045899356, "loss": 0.1055, "step": 123 }, { "epoch": 0.22, "grad_norm": 0.11230157315731049, "learning_rate": 0.0004943742030441145, "loss": 0.0917, "step": 124 }, { "epoch": 0.22, "grad_norm": 0.3376341760158539, "learning_rate": 0.0004942754520143634, "loss": 0.1364, "step": 125 }, { "epoch": 0.22, "grad_norm": 0.2757412791252136, "learning_rate": 0.0004941758518439566, "loss": 0.1418, "step": 126 }, { "epoch": 0.22, "grad_norm": 0.1438644975423813, "learning_rate": 0.0004940754028791205, "loss": 0.1162, "step": 127 }, { "epoch": 0.23, "grad_norm": 0.14210884273052216, "learning_rate": 0.0004939741054690316, "loss": 0.1312, "step": 128 }, { "epoch": 0.23, "grad_norm": 0.1861649751663208, "learning_rate": 0.0004938719599658162, "loss": 0.1447, "step": 129 }, { "epoch": 0.23, "grad_norm": 0.19665485620498657, "learning_rate": 0.0004937689667245481, "loss": 0.1439, "step": 130 }, { "epoch": 0.23, "grad_norm": 0.22447055578231812, "learning_rate": 0.0004936651261032486, "loss": 0.1568, "step": 131 }, { "epoch": 0.23, "grad_norm": 0.10008269548416138, "learning_rate": 0.0004935604384628843, "loss": 0.1081, "step": 132 }, { "epoch": 0.24, "grad_norm": 0.0549234002828598, "learning_rate": 0.0004934549041673661, "loss": 0.1216, "step": 133 }, { "epoch": 0.24, "grad_norm": 0.11616212874650955, "learning_rate": 0.0004933485235835483, "loss": 0.1108, "step": 134 }, { "epoch": 0.24, "grad_norm": 0.08554813265800476, "learning_rate": 0.0004932412970812269, "loss": 0.135, "step": 135 }, { "epoch": 0.24, "grad_norm": 0.08642842620611191, "learning_rate": 0.0004931332250331382, "loss": 0.1205, "step": 136 }, { "epoch": 0.24, "grad_norm": 0.20417262613773346, "learning_rate": 0.0004930243078149582, "loss": 0.1169, "step": 137 }, { "epoch": 0.24, "grad_norm": 0.11088764667510986, "learning_rate": 0.0004929145458053005, "loss": 0.1014, "step": 138 }, { "epoch": 0.25, "grad_norm": 0.3510516881942749, "learning_rate": 0.0004928039393857155, "loss": 0.0967, "step": 139 }, { "epoch": 0.25, "grad_norm": 0.2401883453130722, "learning_rate": 0.0004926924889406888, "loss": 0.106, "step": 140 }, { "epoch": 0.25, "grad_norm": 0.28403300046920776, "learning_rate": 0.0004925801948576402, "loss": 0.079, "step": 141 }, { "epoch": 0.25, "grad_norm": 0.46027252078056335, "learning_rate": 0.0004924670575269217, "loss": 0.0899, "step": 142 }, { "epoch": 0.25, "eval_loss": 0.09421269595623016, "eval_runtime": 14.7696, "eval_samples_per_second": 32.296, "eval_steps_per_second": 8.125, "step": 142 }, { "epoch": 0.25, "grad_norm": 0.29767730832099915, "learning_rate": 0.0004923530773418169, "loss": 0.1265, "step": 143 }, { "epoch": 0.25, "grad_norm": 0.37391072511672974, "learning_rate": 0.0004922382546985394, "loss": 0.1244, "step": 144 }, { "epoch": 0.26, "grad_norm": 0.8874172568321228, "learning_rate": 0.0004921225899962308, "loss": 0.1796, "step": 145 }, { "epoch": 0.26, "grad_norm": 0.2554258704185486, "learning_rate": 0.0004920060836369603, "loss": 0.0528, "step": 146 }, { "epoch": 0.26, "grad_norm": 0.1981816440820694, "learning_rate": 0.0004918887360257228, "loss": 0.1159, "step": 147 }, { "epoch": 0.26, "grad_norm": 0.14500874280929565, "learning_rate": 0.0004917705475704373, "loss": 0.0992, "step": 148 }, { "epoch": 0.26, "grad_norm": 0.1315220594406128, "learning_rate": 0.000491651518681946, "loss": 0.1248, "step": 149 }, { "epoch": 0.27, "grad_norm": 0.0798826813697815, "learning_rate": 0.0004915316497740121, "loss": 0.1151, "step": 150 }, { "epoch": 0.27, "grad_norm": 0.10213784873485565, "learning_rate": 0.0004914109412633194, "loss": 0.1098, "step": 151 }, { "epoch": 0.27, "grad_norm": 0.23167072236537933, "learning_rate": 0.00049128939356947, "loss": 0.1236, "step": 152 }, { "epoch": 0.27, "grad_norm": 0.173340305685997, "learning_rate": 0.0004911670071149831, "loss": 0.1098, "step": 153 }, { "epoch": 0.27, "grad_norm": 0.1079009547829628, "learning_rate": 0.0004910437823252937, "loss": 0.1014, "step": 154 }, { "epoch": 0.27, "grad_norm": 0.320765882730484, "learning_rate": 0.0004909197196287509, "loss": 0.1285, "step": 155 }, { "epoch": 0.28, "grad_norm": 0.40041017532348633, "learning_rate": 0.0004907948194566166, "loss": 0.1421, "step": 156 }, { "epoch": 0.28, "grad_norm": 0.4091287851333618, "learning_rate": 0.0004906690822430638, "loss": 0.1451, "step": 157 }, { "epoch": 0.28, "grad_norm": 0.39893922209739685, "learning_rate": 0.0004905425084251753, "loss": 0.1289, "step": 158 }, { "epoch": 0.28, "grad_norm": 0.14173893630504608, "learning_rate": 0.0004904150984429419, "loss": 0.0712, "step": 159 }, { "epoch": 0.28, "grad_norm": 0.4723054766654968, "learning_rate": 0.0004902868527392611, "loss": 0.2141, "step": 160 }, { "epoch": 0.28, "grad_norm": 0.13493523001670837, "learning_rate": 0.0004901577717599355, "loss": 0.0881, "step": 161 }, { "epoch": 0.29, "grad_norm": 0.10770414024591446, "learning_rate": 0.0004900278559536716, "loss": 0.0746, "step": 162 }, { "epoch": 0.29, "grad_norm": 0.5121994614601135, "learning_rate": 0.0004898971057720773, "loss": 0.1705, "step": 163 }, { "epoch": 0.29, "grad_norm": 0.09419309347867966, "learning_rate": 0.0004897655216696612, "loss": 0.1085, "step": 164 }, { "epoch": 0.29, "grad_norm": 0.3557867407798767, "learning_rate": 0.0004896331041038309, "loss": 0.1027, "step": 165 }, { "epoch": 0.29, "grad_norm": 0.082126185297966, "learning_rate": 0.000489499853534891, "loss": 0.1113, "step": 166 }, { "epoch": 0.3, "grad_norm": 0.8520584106445312, "learning_rate": 0.0004893657704260419, "loss": 0.1291, "step": 167 }, { "epoch": 0.3, "grad_norm": 0.4607222080230713, "learning_rate": 0.000489230855243378, "loss": 0.1241, "step": 168 }, { "epoch": 0.3, "grad_norm": 0.5181136727333069, "learning_rate": 0.0004890951084558859, "loss": 0.0957, "step": 169 }, { "epoch": 0.3, "grad_norm": 0.42894089221954346, "learning_rate": 0.0004889585305354435, "loss": 0.0895, "step": 170 }, { "epoch": 0.3, "grad_norm": 0.14509521424770355, "learning_rate": 0.0004888211219568175, "loss": 0.0732, "step": 171 }, { "epoch": 0.3, "grad_norm": 0.24262909591197968, "learning_rate": 0.0004886828831976621, "loss": 0.0917, "step": 172 }, { "epoch": 0.31, "grad_norm": 0.44387635588645935, "learning_rate": 0.0004885438147385175, "loss": 0.0636, "step": 173 }, { "epoch": 0.31, "grad_norm": 0.1804012507200241, "learning_rate": 0.0004884039170628077, "loss": 0.0295, "step": 174 }, { "epoch": 0.31, "grad_norm": 0.5566735863685608, "learning_rate": 0.0004882631906568398, "loss": 0.1104, "step": 175 }, { "epoch": 0.31, "grad_norm": 0.9653083682060242, "learning_rate": 0.0004881216360098012, "loss": 0.2236, "step": 176 }, { "epoch": 0.31, "grad_norm": 0.27046507596969604, "learning_rate": 0.0004879792536137585, "loss": 0.1082, "step": 177 }, { "epoch": 0.31, "grad_norm": 0.47974228858947754, "learning_rate": 0.00048783604396365586, "loss": 0.0884, "step": 178 }, { "epoch": 0.32, "grad_norm": 0.23638087511062622, "learning_rate": 0.0004876920075573129, "loss": 0.0968, "step": 179 }, { "epoch": 0.32, "grad_norm": 0.12476328015327454, "learning_rate": 0.0004875471448954234, "loss": 0.1078, "step": 180 }, { "epoch": 0.32, "grad_norm": 0.2455732375383377, "learning_rate": 0.00048740145648155307, "loss": 0.1124, "step": 181 }, { "epoch": 0.32, "grad_norm": 0.2744804620742798, "learning_rate": 0.0004872549428221384, "loss": 0.0797, "step": 182 }, { "epoch": 0.32, "grad_norm": 0.19536937773227692, "learning_rate": 0.00048710760442648415, "loss": 0.1091, "step": 183 }, { "epoch": 0.33, "grad_norm": 0.5277348160743713, "learning_rate": 0.0004869594418067623, "loss": 0.1261, "step": 184 }, { "epoch": 0.33, "grad_norm": 0.13960392773151398, "learning_rate": 0.00048681045547801003, "loss": 0.0879, "step": 185 }, { "epoch": 0.33, "grad_norm": 0.2567049562931061, "learning_rate": 0.00048666064595812746, "loss": 0.083, "step": 186 }, { "epoch": 0.33, "grad_norm": 0.3075740337371826, "learning_rate": 0.00048651001376787676, "loss": 0.1167, "step": 187 }, { "epoch": 0.33, "grad_norm": 0.5257586240768433, "learning_rate": 0.0004863585594308794, "loss": 0.1019, "step": 188 }, { "epoch": 0.33, "grad_norm": 0.41611766815185547, "learning_rate": 0.00048620628347361496, "loss": 0.1392, "step": 189 }, { "epoch": 0.34, "grad_norm": 0.30399614572525024, "learning_rate": 0.00048605318642541917, "loss": 0.1339, "step": 190 }, { "epoch": 0.34, "grad_norm": 0.41276878118515015, "learning_rate": 0.00048589926881848194, "loss": 0.1028, "step": 191 }, { "epoch": 0.34, "grad_norm": 0.19717253744602203, "learning_rate": 0.0004857445311878456, "loss": 0.1032, "step": 192 }, { "epoch": 0.34, "grad_norm": 0.3766873776912689, "learning_rate": 0.0004855889740714028, "loss": 0.1486, "step": 193 }, { "epoch": 0.34, "grad_norm": 0.17443525791168213, "learning_rate": 0.0004854325980098951, "loss": 0.096, "step": 194 }, { "epoch": 0.34, "grad_norm": 0.1278471201658249, "learning_rate": 0.0004852754035469109, "loss": 0.0746, "step": 195 }, { "epoch": 0.35, "grad_norm": 0.14356929063796997, "learning_rate": 0.0004851173912288833, "loss": 0.0857, "step": 196 }, { "epoch": 0.35, "grad_norm": 0.20514866709709167, "learning_rate": 0.0004849585616050884, "loss": 0.0833, "step": 197 }, { "epoch": 0.35, "grad_norm": 0.4683605134487152, "learning_rate": 0.0004847989152276435, "loss": 0.1538, "step": 198 }, { "epoch": 0.35, "grad_norm": 0.29194721579551697, "learning_rate": 0.00048463845265150495, "loss": 0.1035, "step": 199 }, { "epoch": 0.35, "grad_norm": 0.22838515043258667, "learning_rate": 0.0004844771744344666, "loss": 0.0762, "step": 200 }, { "epoch": 0.36, "grad_norm": 0.3635599911212921, "learning_rate": 0.0004843150811371572, "loss": 0.1165, "step": 201 }, { "epoch": 0.36, "grad_norm": 0.2508073151111603, "learning_rate": 0.0004841521733230391, "loss": 0.0736, "step": 202 }, { "epoch": 0.36, "grad_norm": 0.24161550402641296, "learning_rate": 0.000483988451558406, "loss": 0.1309, "step": 203 }, { "epoch": 0.36, "grad_norm": 0.4697308838367462, "learning_rate": 0.0004838239164123811, "loss": 0.1731, "step": 204 }, { "epoch": 0.36, "grad_norm": 0.17773008346557617, "learning_rate": 0.0004836585684569148, "loss": 0.1158, "step": 205 }, { "epoch": 0.36, "grad_norm": 0.21285519003868103, "learning_rate": 0.0004834924082667833, "loss": 0.0949, "step": 206 }, { "epoch": 0.37, "grad_norm": 0.2403111308813095, "learning_rate": 0.0004833254364195859, "loss": 0.0801, "step": 207 }, { "epoch": 0.37, "grad_norm": 0.2033465951681137, "learning_rate": 0.0004831576534957437, "loss": 0.069, "step": 208 }, { "epoch": 0.37, "grad_norm": 0.5510303378105164, "learning_rate": 0.000482989060078497, "loss": 0.1766, "step": 209 }, { "epoch": 0.37, "grad_norm": 0.32342344522476196, "learning_rate": 0.0004828196567539034, "loss": 0.1229, "step": 210 }, { "epoch": 0.37, "grad_norm": 0.3102104663848877, "learning_rate": 0.00048264944411083625, "loss": 0.1297, "step": 211 }, { "epoch": 0.38, "grad_norm": 0.32639122009277344, "learning_rate": 0.00048247842274098187, "loss": 0.1011, "step": 212 }, { "epoch": 0.38, "grad_norm": 0.4720034897327423, "learning_rate": 0.00048230659323883804, "loss": 0.1282, "step": 213 }, { "epoch": 0.38, "grad_norm": 0.5249712467193604, "learning_rate": 0.00048213395620171166, "loss": 0.1376, "step": 214 }, { "epoch": 0.38, "grad_norm": 0.3953443467617035, "learning_rate": 0.00048196051222971673, "loss": 0.1186, "step": 215 }, { "epoch": 0.38, "grad_norm": 0.15697738528251648, "learning_rate": 0.0004817862619257723, "loss": 0.1079, "step": 216 }, { "epoch": 0.38, "grad_norm": 0.32511651515960693, "learning_rate": 0.0004816112058956005, "loss": 0.1052, "step": 217 }, { "epoch": 0.39, "grad_norm": 0.1850031018257141, "learning_rate": 0.00048143534474772397, "loss": 0.1236, "step": 218 }, { "epoch": 0.39, "grad_norm": 0.10901057720184326, "learning_rate": 0.0004812586790934645, "loss": 0.1094, "step": 219 }, { "epoch": 0.39, "grad_norm": 0.23395784199237823, "learning_rate": 0.00048108120954694014, "loss": 0.0556, "step": 220 }, { "epoch": 0.39, "grad_norm": 0.21469372510910034, "learning_rate": 0.00048090293672506347, "loss": 0.0594, "step": 221 }, { "epoch": 0.39, "grad_norm": 0.17289988696575165, "learning_rate": 0.00048072386124753944, "loss": 0.0219, "step": 222 }, { "epoch": 0.39, "grad_norm": 0.21490757167339325, "learning_rate": 0.0004805439837368631, "loss": 0.0203, "step": 223 }, { "epoch": 0.4, "grad_norm": 1.1259506940841675, "learning_rate": 0.0004803633048183176, "loss": 0.1576, "step": 224 }, { "epoch": 0.4, "grad_norm": 1.2934038639068604, "learning_rate": 0.00048018182511997185, "loss": 0.1233, "step": 225 }, { "epoch": 0.4, "grad_norm": 0.4250846207141876, "learning_rate": 0.0004799995452726783, "loss": 0.1023, "step": 226 }, { "epoch": 0.4, "grad_norm": 1.4675579071044922, "learning_rate": 0.000479816465910071, "loss": 0.1242, "step": 227 }, { "epoch": 0.4, "grad_norm": 0.7030429840087891, "learning_rate": 0.0004796325876685632, "loss": 0.0514, "step": 228 }, { "epoch": 0.41, "grad_norm": 0.5683910846710205, "learning_rate": 0.00047944791118734517, "loss": 0.0923, "step": 229 }, { "epoch": 0.41, "grad_norm": 0.8425244092941284, "learning_rate": 0.0004792624371083819, "loss": 0.0976, "step": 230 }, { "epoch": 0.41, "grad_norm": 0.21189981698989868, "learning_rate": 0.00047907616607641113, "loss": 0.1016, "step": 231 }, { "epoch": 0.41, "grad_norm": 0.36100390553474426, "learning_rate": 0.0004788890987389408, "loss": 0.1015, "step": 232 }, { "epoch": 0.41, "grad_norm": 0.42600420117378235, "learning_rate": 0.000478701235746247, "loss": 0.1401, "step": 233 }, { "epoch": 0.41, "grad_norm": 0.649318516254425, "learning_rate": 0.0004785125777513716, "loss": 0.1012, "step": 234 }, { "epoch": 0.42, "grad_norm": 0.3490477204322815, "learning_rate": 0.00047832312541012007, "loss": 0.1015, "step": 235 }, { "epoch": 0.42, "grad_norm": 0.6937799453735352, "learning_rate": 0.0004781328793810592, "loss": 0.1188, "step": 236 }, { "epoch": 0.42, "grad_norm": 1.0924077033996582, "learning_rate": 0.0004779418403255146, "loss": 0.1093, "step": 237 }, { "epoch": 0.42, "grad_norm": 0.36075183749198914, "learning_rate": 0.0004777500089075687, "loss": 0.0971, "step": 238 }, { "epoch": 0.42, "grad_norm": 0.41673243045806885, "learning_rate": 0.00047755738579405836, "loss": 0.0953, "step": 239 }, { "epoch": 0.42, "grad_norm": 0.13159583508968353, "learning_rate": 0.0004773639716545723, "loss": 0.0571, "step": 240 }, { "epoch": 0.43, "grad_norm": 0.9338862895965576, "learning_rate": 0.00047716976716144917, "loss": 0.202, "step": 241 }, { "epoch": 0.43, "grad_norm": 0.3190581798553467, "learning_rate": 0.0004769747729897749, "loss": 0.1071, "step": 242 }, { "epoch": 0.43, "grad_norm": 0.23796042799949646, "learning_rate": 0.0004767789898173806, "loss": 0.0659, "step": 243 }, { "epoch": 0.43, "grad_norm": 0.19194231927394867, "learning_rate": 0.0004765824183248399, "loss": 0.0611, "step": 244 }, { "epoch": 0.43, "grad_norm": 0.16703608632087708, "learning_rate": 0.0004763850591954668, "loss": 0.0855, "step": 245 }, { "epoch": 0.44, "grad_norm": 0.3395439684391022, "learning_rate": 0.0004761869131153135, "loss": 0.0926, "step": 246 }, { "epoch": 0.44, "grad_norm": 0.2820179760456085, "learning_rate": 0.0004759879807731673, "loss": 0.0508, "step": 247 }, { "epoch": 0.44, "grad_norm": 0.20656561851501465, "learning_rate": 0.00047578826286054897, "loss": 0.068, "step": 248 }, { "epoch": 0.44, "grad_norm": 0.4477837383747101, "learning_rate": 0.00047558776007171024, "loss": 0.0918, "step": 249 }, { "epoch": 0.44, "grad_norm": 0.18997950851917267, "learning_rate": 0.0004753864731036307, "loss": 0.0734, "step": 250 }, { "epoch": 0.44, "grad_norm": 0.2841518521308899, "learning_rate": 0.0004751844026560163, "loss": 0.1194, "step": 251 }, { "epoch": 0.45, "grad_norm": 0.29770052433013916, "learning_rate": 0.0004749815494312963, "loss": 0.0996, "step": 252 }, { "epoch": 0.45, "grad_norm": 0.2982254922389984, "learning_rate": 0.00047477791413462104, "loss": 0.0945, "step": 253 }, { "epoch": 0.45, "grad_norm": 0.4625980854034424, "learning_rate": 0.00047457349747385936, "loss": 0.131, "step": 254 }, { "epoch": 0.45, "grad_norm": 0.29756709933280945, "learning_rate": 0.00047436830015959653, "loss": 0.1057, "step": 255 }, { "epoch": 0.45, "grad_norm": 0.19971434772014618, "learning_rate": 0.00047416232290513127, "loss": 0.0794, "step": 256 }, { "epoch": 0.45, "grad_norm": 0.12171836197376251, "learning_rate": 0.0004739555664264736, "loss": 0.0527, "step": 257 }, { "epoch": 0.46, "grad_norm": 0.23848529160022736, "learning_rate": 0.00047374803144234213, "loss": 0.134, "step": 258 }, { "epoch": 0.46, "grad_norm": 0.12673752009868622, "learning_rate": 0.0004735397186741618, "loss": 0.0774, "step": 259 }, { "epoch": 0.46, "grad_norm": 0.11961629241704941, "learning_rate": 0.00047333062884606116, "loss": 0.0661, "step": 260 }, { "epoch": 0.46, "grad_norm": 0.18004140257835388, "learning_rate": 0.00047312076268487, "loss": 0.1132, "step": 261 }, { "epoch": 0.46, "grad_norm": 0.1698005348443985, "learning_rate": 0.00047291012092011685, "loss": 0.057, "step": 262 }, { "epoch": 0.47, "grad_norm": 0.1949334442615509, "learning_rate": 0.0004726987042840263, "loss": 0.0703, "step": 263 }, { "epoch": 0.47, "grad_norm": 0.4016534686088562, "learning_rate": 0.0004724865135115163, "loss": 0.1178, "step": 264 }, { "epoch": 0.47, "grad_norm": 0.36885496973991394, "learning_rate": 0.00047227354934019605, "loss": 0.1303, "step": 265 }, { "epoch": 0.47, "grad_norm": 0.3214585483074188, "learning_rate": 0.00047205981251036334, "loss": 0.1019, "step": 266 }, { "epoch": 0.47, "grad_norm": 0.15313082933425903, "learning_rate": 0.0004718453037650016, "loss": 0.0581, "step": 267 }, { "epoch": 0.47, "grad_norm": 0.3251878321170807, "learning_rate": 0.0004716300238497775, "loss": 0.099, "step": 268 }, { "epoch": 0.48, "grad_norm": 0.20356950163841248, "learning_rate": 0.0004714139735130388, "loss": 0.0767, "step": 269 }, { "epoch": 0.48, "grad_norm": 0.2644464373588562, "learning_rate": 0.00047119715350581095, "loss": 0.1003, "step": 270 }, { "epoch": 0.48, "grad_norm": 0.22035302221775055, "learning_rate": 0.000470979564581795, "loss": 0.0722, "step": 271 }, { "epoch": 0.48, "grad_norm": 0.5284466743469238, "learning_rate": 0.0004707612074973653, "loss": 0.1282, "step": 272 }, { "epoch": 0.48, "grad_norm": 0.34032565355300903, "learning_rate": 0.0004705420830115658, "loss": 0.099, "step": 273 }, { "epoch": 0.48, "grad_norm": 0.26527565717697144, "learning_rate": 0.00047032219188610836, "loss": 0.0911, "step": 274 }, { "epoch": 0.49, "grad_norm": 0.2254990190267563, "learning_rate": 0.0004701015348853699, "loss": 0.0667, "step": 275 }, { "epoch": 0.49, "grad_norm": 0.21334387362003326, "learning_rate": 0.0004698801127763895, "loss": 0.0659, "step": 276 }, { "epoch": 0.49, "grad_norm": 0.2917044758796692, "learning_rate": 0.0004696579263288661, "loss": 0.1159, "step": 277 }, { "epoch": 0.49, "grad_norm": 0.14027804136276245, "learning_rate": 0.00046943497631515526, "loss": 0.0323, "step": 278 }, { "epoch": 0.49, "grad_norm": 0.3988366425037384, "learning_rate": 0.00046921126351026697, "loss": 0.0887, "step": 279 }, { "epoch": 0.5, "grad_norm": 0.36629319190979004, "learning_rate": 0.00046898678869186297, "loss": 0.1079, "step": 280 }, { "epoch": 0.5, "grad_norm": 0.35548141598701477, "learning_rate": 0.0004687615526402536, "loss": 0.1056, "step": 281 }, { "epoch": 0.5, "grad_norm": 0.21030637621879578, "learning_rate": 0.0004685355561383956, "loss": 0.0717, "step": 282 }, { "epoch": 0.5, "grad_norm": 0.24192889034748077, "learning_rate": 0.000468308799971889, "loss": 0.1047, "step": 283 }, { "epoch": 0.5, "grad_norm": 0.16289295256137848, "learning_rate": 0.00046808128492897464, "loss": 0.0519, "step": 284 }, { "epoch": 0.5, "eval_loss": 0.08938124030828476, "eval_runtime": 14.7518, "eval_samples_per_second": 32.335, "eval_steps_per_second": 8.135, "step": 284 }, { "epoch": 0.5, "grad_norm": 0.23021583259105682, "learning_rate": 0.00046785301180053126, "loss": 0.1161, "step": 285 }, { "epoch": 0.51, "grad_norm": 0.3577558398246765, "learning_rate": 0.0004676239813800729, "loss": 0.1239, "step": 286 }, { "epoch": 0.51, "grad_norm": 0.15293735265731812, "learning_rate": 0.0004673941944637461, "loss": 0.0401, "step": 287 }, { "epoch": 0.51, "grad_norm": 0.342631459236145, "learning_rate": 0.00046716365185032696, "loss": 0.1358, "step": 288 }, { "epoch": 0.51, "grad_norm": 0.4987104833126068, "learning_rate": 0.0004669323543412186, "loss": 0.1312, "step": 289 }, { "epoch": 0.51, "grad_norm": 0.21678434312343597, "learning_rate": 0.0004667003027404483, "loss": 0.0791, "step": 290 }, { "epoch": 0.51, "grad_norm": 0.2781723141670227, "learning_rate": 0.00046646749785466464, "loss": 0.0809, "step": 291 }, { "epoch": 0.52, "grad_norm": 0.3997693359851837, "learning_rate": 0.00046623394049313474, "loss": 0.0938, "step": 292 }, { "epoch": 0.52, "grad_norm": 0.2478984147310257, "learning_rate": 0.00046599963146774136, "loss": 0.0671, "step": 293 }, { "epoch": 0.52, "grad_norm": 0.35655421018600464, "learning_rate": 0.0004657645715929805, "loss": 0.107, "step": 294 }, { "epoch": 0.52, "grad_norm": 0.41986069083213806, "learning_rate": 0.0004655287616859577, "loss": 0.1381, "step": 295 }, { "epoch": 0.52, "grad_norm": 0.2831580340862274, "learning_rate": 0.00046529220256638626, "loss": 0.1012, "step": 296 }, { "epoch": 0.53, "grad_norm": 0.2183172106742859, "learning_rate": 0.0004650548950565835, "loss": 0.0883, "step": 297 }, { "epoch": 0.53, "grad_norm": 0.1485687792301178, "learning_rate": 0.0004648168399814684, "loss": 0.094, "step": 298 }, { "epoch": 0.53, "grad_norm": 0.3192533552646637, "learning_rate": 0.0004645780381685586, "loss": 0.1144, "step": 299 }, { "epoch": 0.53, "grad_norm": 0.20768460631370544, "learning_rate": 0.0004643384904479675, "loss": 0.1119, "step": 300 }, { "epoch": 0.53, "grad_norm": 0.16704390943050385, "learning_rate": 0.00046409819765240147, "loss": 0.0852, "step": 301 }, { "epoch": 0.53, "grad_norm": 0.33123648166656494, "learning_rate": 0.0004638571606171567, "loss": 0.1608, "step": 302 }, { "epoch": 0.54, "grad_norm": 0.408978134393692, "learning_rate": 0.0004636153801801167, "loss": 0.0906, "step": 303 }, { "epoch": 0.54, "grad_norm": 0.29201096296310425, "learning_rate": 0.00046337285718174896, "loss": 0.1237, "step": 304 }, { "epoch": 0.54, "grad_norm": 0.45836058259010315, "learning_rate": 0.00046312959246510237, "loss": 0.0926, "step": 305 }, { "epoch": 0.54, "grad_norm": 0.5405777096748352, "learning_rate": 0.0004628855868758041, "loss": 0.0727, "step": 306 }, { "epoch": 0.54, "grad_norm": 0.3068138062953949, "learning_rate": 0.00046264084126205676, "loss": 0.1006, "step": 307 }, { "epoch": 0.54, "grad_norm": 0.2990975081920624, "learning_rate": 0.00046239535647463534, "loss": 0.1033, "step": 308 }, { "epoch": 0.55, "grad_norm": 0.2938540279865265, "learning_rate": 0.00046214913336688424, "loss": 0.1084, "step": 309 }, { "epoch": 0.55, "grad_norm": 0.49840983748435974, "learning_rate": 0.00046190217279471466, "loss": 0.1066, "step": 310 }, { "epoch": 0.55, "grad_norm": 0.4558626711368561, "learning_rate": 0.000461654475616601, "loss": 0.121, "step": 311 }, { "epoch": 0.55, "grad_norm": 0.20964759588241577, "learning_rate": 0.0004614060426935786, "loss": 0.0843, "step": 312 }, { "epoch": 0.55, "grad_norm": 0.2254151701927185, "learning_rate": 0.00046115687488923983, "loss": 0.0781, "step": 313 }, { "epoch": 0.56, "grad_norm": 0.6117066740989685, "learning_rate": 0.0004609069730697322, "loss": 0.1208, "step": 314 }, { "epoch": 0.56, "grad_norm": 0.5352897644042969, "learning_rate": 0.0004606563381037544, "loss": 0.1056, "step": 315 }, { "epoch": 0.56, "grad_norm": 0.31001269817352295, "learning_rate": 0.00046040497086255385, "loss": 0.1213, "step": 316 }, { "epoch": 0.56, "grad_norm": 0.22637054324150085, "learning_rate": 0.0004601528722199234, "loss": 0.105, "step": 317 }, { "epoch": 0.56, "grad_norm": 0.20077432692050934, "learning_rate": 0.0004599000430521984, "loss": 0.0837, "step": 318 }, { "epoch": 0.56, "grad_norm": 0.24702684581279755, "learning_rate": 0.0004596464842382534, "loss": 0.0695, "step": 319 }, { "epoch": 0.57, "grad_norm": 0.253387987613678, "learning_rate": 0.0004593921966594997, "loss": 0.1184, "step": 320 }, { "epoch": 0.57, "grad_norm": 0.2703799605369568, "learning_rate": 0.0004591371811998817, "loss": 0.117, "step": 321 }, { "epoch": 0.57, "grad_norm": 0.23513701558113098, "learning_rate": 0.00045888143874587396, "loss": 0.1359, "step": 322 }, { "epoch": 0.57, "grad_norm": 0.25604313611984253, "learning_rate": 0.00045862497018647833, "loss": 0.1018, "step": 323 }, { "epoch": 0.57, "grad_norm": 0.16947636008262634, "learning_rate": 0.0004583677764132207, "loss": 0.0958, "step": 324 }, { "epoch": 0.57, "grad_norm": 0.20054908096790314, "learning_rate": 0.0004581098583201478, "loss": 0.0803, "step": 325 }, { "epoch": 0.58, "grad_norm": 0.12656472623348236, "learning_rate": 0.00045785121680382436, "loss": 0.0679, "step": 326 }, { "epoch": 0.58, "grad_norm": 0.1423499882221222, "learning_rate": 0.0004575918527633297, "loss": 0.0959, "step": 327 }, { "epoch": 0.58, "grad_norm": 0.36370569467544556, "learning_rate": 0.0004573317671002549, "loss": 0.1088, "step": 328 }, { "epoch": 0.58, "grad_norm": 0.18775340914726257, "learning_rate": 0.0004570709607186995, "loss": 0.0905, "step": 329 }, { "epoch": 0.58, "grad_norm": 0.14833571016788483, "learning_rate": 0.0004568094345252681, "loss": 0.0661, "step": 330 }, { "epoch": 0.59, "grad_norm": 0.2987270653247833, "learning_rate": 0.00045654718942906794, "loss": 0.0872, "step": 331 }, { "epoch": 0.59, "grad_norm": 0.21985827386379242, "learning_rate": 0.000456284226341705, "loss": 0.0882, "step": 332 }, { "epoch": 0.59, "grad_norm": 0.2726268470287323, "learning_rate": 0.00045602054617728093, "loss": 0.0864, "step": 333 }, { "epoch": 0.59, "grad_norm": 0.2882244884967804, "learning_rate": 0.00045575614985239057, "loss": 0.1032, "step": 334 }, { "epoch": 0.59, "grad_norm": 0.427500456571579, "learning_rate": 0.0004554910382861178, "loss": 0.1309, "step": 335 }, { "epoch": 0.59, "grad_norm": 0.43029338121414185, "learning_rate": 0.000455225212400033, "loss": 0.1071, "step": 336 }, { "epoch": 0.6, "grad_norm": 0.2297673523426056, "learning_rate": 0.0004549586731181896, "loss": 0.0526, "step": 337 }, { "epoch": 0.6, "grad_norm": 0.4533613920211792, "learning_rate": 0.0004546914213671209, "loss": 0.1154, "step": 338 }, { "epoch": 0.6, "grad_norm": 0.3973630666732788, "learning_rate": 0.0004544234580758367, "loss": 0.0707, "step": 339 }, { "epoch": 0.6, "grad_norm": 0.40036290884017944, "learning_rate": 0.0004541547841758207, "loss": 0.0932, "step": 340 }, { "epoch": 0.6, "grad_norm": 0.4273395240306854, "learning_rate": 0.0004538854006010262, "loss": 0.1112, "step": 341 }, { "epoch": 0.61, "grad_norm": 0.28109779953956604, "learning_rate": 0.0004536153082878738, "loss": 0.1003, "step": 342 }, { "epoch": 0.61, "grad_norm": 0.21950216591358185, "learning_rate": 0.00045334450817524776, "loss": 0.0538, "step": 343 }, { "epoch": 0.61, "grad_norm": 0.2968471646308899, "learning_rate": 0.00045307300120449263, "loss": 0.0775, "step": 344 }, { "epoch": 0.61, "grad_norm": 0.1488364040851593, "learning_rate": 0.00045280078831941024, "loss": 0.0513, "step": 345 }, { "epoch": 0.61, "grad_norm": 0.22750218212604523, "learning_rate": 0.00045252787046625624, "loss": 0.0943, "step": 346 }, { "epoch": 0.61, "grad_norm": 0.3048767149448395, "learning_rate": 0.0004522542485937369, "loss": 0.079, "step": 347 }, { "epoch": 0.62, "grad_norm": 0.33520030975341797, "learning_rate": 0.0004519799236530057, "loss": 0.1584, "step": 348 }, { "epoch": 0.62, "grad_norm": 0.20777581632137299, "learning_rate": 0.00045170489659766003, "loss": 0.0903, "step": 349 }, { "epoch": 0.62, "grad_norm": 0.1602245271205902, "learning_rate": 0.00045142916838373826, "loss": 0.0446, "step": 350 }, { "epoch": 0.62, "grad_norm": 0.2512218952178955, "learning_rate": 0.0004511527399697158, "loss": 0.069, "step": 351 }, { "epoch": 0.62, "grad_norm": 0.17349962890148163, "learning_rate": 0.0004508756123165021, "loss": 0.0765, "step": 352 }, { "epoch": 0.62, "grad_norm": 0.26563215255737305, "learning_rate": 0.00045059778638743744, "loss": 0.0966, "step": 353 }, { "epoch": 0.63, "grad_norm": 0.23987066745758057, "learning_rate": 0.00045031926314828926, "loss": 0.0702, "step": 354 }, { "epoch": 0.63, "grad_norm": 0.21901372075080872, "learning_rate": 0.000450040043567249, "loss": 0.0457, "step": 355 }, { "epoch": 0.63, "grad_norm": 0.24179872870445251, "learning_rate": 0.00044976012861492877, "loss": 0.0651, "step": 356 }, { "epoch": 0.63, "grad_norm": 0.3544818162918091, "learning_rate": 0.0004494795192643578, "loss": 0.0622, "step": 357 }, { "epoch": 0.63, "grad_norm": 0.4363332986831665, "learning_rate": 0.00044919821649097916, "loss": 0.0972, "step": 358 }, { "epoch": 0.64, "grad_norm": 0.43788430094718933, "learning_rate": 0.0004489162212726465, "loss": 0.0843, "step": 359 }, { "epoch": 0.64, "grad_norm": 0.5084832906723022, "learning_rate": 0.00044863353458962044, "loss": 0.0888, "step": 360 }, { "epoch": 0.64, "grad_norm": 0.44660842418670654, "learning_rate": 0.0004483501574245652, "loss": 0.113, "step": 361 }, { "epoch": 0.64, "grad_norm": 0.7528813481330872, "learning_rate": 0.0004480660907625452, "loss": 0.0512, "step": 362 }, { "epoch": 0.64, "grad_norm": 0.9723535776138306, "learning_rate": 0.0004477813355910219, "loss": 0.1154, "step": 363 }, { "epoch": 0.64, "grad_norm": 0.2641480565071106, "learning_rate": 0.0004474958928998498, "loss": 0.0575, "step": 364 }, { "epoch": 0.65, "grad_norm": 0.12234170734882355, "learning_rate": 0.00044720976368127355, "loss": 0.0441, "step": 365 }, { "epoch": 0.65, "grad_norm": 0.26976636052131653, "learning_rate": 0.00044692294892992416, "loss": 0.0676, "step": 366 }, { "epoch": 0.65, "grad_norm": 0.22729526460170746, "learning_rate": 0.00044663544964281573, "loss": 0.098, "step": 367 }, { "epoch": 0.65, "grad_norm": 0.2270442545413971, "learning_rate": 0.0004463472668193419, "loss": 0.0842, "step": 368 }, { "epoch": 0.65, "grad_norm": 0.19249562919139862, "learning_rate": 0.0004460584014612724, "loss": 0.0537, "step": 369 }, { "epoch": 0.65, "grad_norm": 0.22312623262405396, "learning_rate": 0.0004457688545727496, "loss": 0.0547, "step": 370 }, { "epoch": 0.66, "grad_norm": 0.281658411026001, "learning_rate": 0.0004454786271602849, "loss": 0.089, "step": 371 }, { "epoch": 0.66, "grad_norm": 0.49952250719070435, "learning_rate": 0.00044518772023275526, "loss": 0.1298, "step": 372 }, { "epoch": 0.66, "grad_norm": 0.186232328414917, "learning_rate": 0.0004448961348013999, "loss": 0.0628, "step": 373 }, { "epoch": 0.66, "grad_norm": 0.2980823814868927, "learning_rate": 0.0004446038718798166, "loss": 0.0828, "step": 374 }, { "epoch": 0.66, "grad_norm": 0.3794187605381012, "learning_rate": 0.00044431093248395806, "loss": 0.0776, "step": 375 }, { "epoch": 0.67, "grad_norm": 0.29262277483940125, "learning_rate": 0.0004440173176321287, "loss": 0.0924, "step": 376 }, { "epoch": 0.67, "grad_norm": 0.30543988943099976, "learning_rate": 0.0004437230283449808, "loss": 0.1264, "step": 377 }, { "epoch": 0.67, "grad_norm": 0.3436485826969147, "learning_rate": 0.0004434280656455111, "loss": 0.1066, "step": 378 }, { "epoch": 0.67, "grad_norm": 0.23679965734481812, "learning_rate": 0.0004431324305590572, "loss": 0.075, "step": 379 }, { "epoch": 0.67, "grad_norm": 0.4399561882019043, "learning_rate": 0.0004428361241132943, "loss": 0.1445, "step": 380 }, { "epoch": 0.67, "grad_norm": 0.39203163981437683, "learning_rate": 0.0004425391473382309, "loss": 0.0995, "step": 381 }, { "epoch": 0.68, "grad_norm": 0.4687665104866028, "learning_rate": 0.0004422415012662061, "loss": 0.1489, "step": 382 }, { "epoch": 0.68, "grad_norm": 0.2634904086589813, "learning_rate": 0.00044194318693188526, "loss": 0.1164, "step": 383 }, { "epoch": 0.68, "grad_norm": 0.23031170666217804, "learning_rate": 0.0004416442053722569, "loss": 0.0742, "step": 384 }, { "epoch": 0.68, "grad_norm": 0.30467960238456726, "learning_rate": 0.00044134455762662894, "loss": 0.0984, "step": 385 }, { "epoch": 0.68, "grad_norm": 0.16692829132080078, "learning_rate": 0.0004410442447366249, "loss": 0.0732, "step": 386 }, { "epoch": 0.68, "grad_norm": 0.20100833475589752, "learning_rate": 0.00044074326774618065, "loss": 0.1082, "step": 387 }, { "epoch": 0.69, "grad_norm": 0.29799607396125793, "learning_rate": 0.0004404416277015404, "loss": 0.0761, "step": 388 }, { "epoch": 0.69, "grad_norm": 0.647639274597168, "learning_rate": 0.0004401393256512534, "loss": 0.1218, "step": 389 }, { "epoch": 0.69, "grad_norm": 0.2610540986061096, "learning_rate": 0.00043983636264617013, "loss": 0.0923, "step": 390 }, { "epoch": 0.69, "grad_norm": 0.4049086570739746, "learning_rate": 0.0004395327397394384, "loss": 0.1091, "step": 391 }, { "epoch": 0.69, "grad_norm": 0.36092105507850647, "learning_rate": 0.00043922845798650034, "loss": 0.0927, "step": 392 }, { "epoch": 0.7, "grad_norm": 0.542421281337738, "learning_rate": 0.00043892351844508805, "loss": 0.1014, "step": 393 }, { "epoch": 0.7, "grad_norm": 0.291595995426178, "learning_rate": 0.0004386179221752202, "loss": 0.0902, "step": 394 }, { "epoch": 0.7, "grad_norm": 0.17152707278728485, "learning_rate": 0.0004383116702391987, "loss": 0.0651, "step": 395 }, { "epoch": 0.7, "grad_norm": 0.16654878854751587, "learning_rate": 0.00043800476370160416, "loss": 0.0824, "step": 396 }, { "epoch": 0.7, "grad_norm": 0.18530108034610748, "learning_rate": 0.000437697203629293, "loss": 0.0549, "step": 397 }, { "epoch": 0.7, "grad_norm": 0.5760988593101501, "learning_rate": 0.0004373889910913934, "loss": 0.0803, "step": 398 }, { "epoch": 0.71, "grad_norm": 0.4253963232040405, "learning_rate": 0.00043708012715930154, "loss": 0.0728, "step": 399 }, { "epoch": 0.71, "grad_norm": 0.7932385206222534, "learning_rate": 0.00043677061290667805, "loss": 0.1442, "step": 400 }, { "epoch": 0.71, "grad_norm": 0.5904709696769714, "learning_rate": 0.00043646044940944407, "loss": 0.0999, "step": 401 }, { "epoch": 0.71, "grad_norm": 0.9570127129554749, "learning_rate": 0.0004361496377457777, "loss": 0.1298, "step": 402 }, { "epoch": 0.71, "grad_norm": 0.5049470663070679, "learning_rate": 0.00043583817899611017, "loss": 0.0263, "step": 403 }, { "epoch": 0.71, "grad_norm": 0.589408814907074, "learning_rate": 0.00043552607424312195, "loss": 0.1051, "step": 404 }, { "epoch": 0.72, "grad_norm": 0.43722283840179443, "learning_rate": 0.0004352133245717393, "loss": 0.0715, "step": 405 }, { "epoch": 0.72, "grad_norm": 1.3537758588790894, "learning_rate": 0.00043489993106913036, "loss": 0.0322, "step": 406 }, { "epoch": 0.72, "grad_norm": 0.3382836580276489, "learning_rate": 0.000434585894824701, "loss": 0.0818, "step": 407 }, { "epoch": 0.72, "grad_norm": 0.9946733713150024, "learning_rate": 0.00043427121693009164, "loss": 0.1536, "step": 408 }, { "epoch": 0.72, "grad_norm": 0.9138526320457458, "learning_rate": 0.0004339558984791732, "loss": 0.1299, "step": 409 }, { "epoch": 0.73, "grad_norm": 0.35993850231170654, "learning_rate": 0.0004336399405680432, "loss": 0.0654, "step": 410 }, { "epoch": 0.73, "grad_norm": 0.30418309569358826, "learning_rate": 0.0004333233442950219, "loss": 0.1026, "step": 411 }, { "epoch": 0.73, "grad_norm": 0.256728857755661, "learning_rate": 0.00043300611076064886, "loss": 0.083, "step": 412 }, { "epoch": 0.73, "grad_norm": 0.3472314774990082, "learning_rate": 0.00043268824106767865, "loss": 0.0637, "step": 413 }, { "epoch": 0.73, "grad_norm": 0.5615578293800354, "learning_rate": 0.00043236973632107735, "loss": 0.1028, "step": 414 }, { "epoch": 0.73, "grad_norm": 0.35775747895240784, "learning_rate": 0.00043205059762801854, "loss": 0.0829, "step": 415 }, { "epoch": 0.74, "grad_norm": 0.287270724773407, "learning_rate": 0.0004317308260978795, "loss": 0.0718, "step": 416 }, { "epoch": 0.74, "grad_norm": 0.3237059414386749, "learning_rate": 0.00043141042284223737, "loss": 0.0797, "step": 417 }, { "epoch": 0.74, "grad_norm": 0.2153153419494629, "learning_rate": 0.0004310893889748653, "loss": 0.0778, "step": 418 }, { "epoch": 0.74, "grad_norm": 0.33600860834121704, "learning_rate": 0.00043076772561172845, "loss": 0.0594, "step": 419 }, { "epoch": 0.74, "grad_norm": 0.8778895139694214, "learning_rate": 0.00043044543387098027, "loss": 0.1722, "step": 420 }, { "epoch": 0.74, "grad_norm": 0.503434419631958, "learning_rate": 0.0004301225148729586, "loss": 0.1228, "step": 421 }, { "epoch": 0.75, "grad_norm": 0.3694842457771301, "learning_rate": 0.00042979896974018166, "loss": 0.1033, "step": 422 }, { "epoch": 0.75, "grad_norm": 0.329818457365036, "learning_rate": 0.00042947479959734423, "loss": 0.0471, "step": 423 }, { "epoch": 0.75, "grad_norm": 0.19436267018318176, "learning_rate": 0.0004291500055713138, "loss": 0.0409, "step": 424 }, { "epoch": 0.75, "grad_norm": 0.3269858658313751, "learning_rate": 0.0004288245887911263, "loss": 0.096, "step": 425 }, { "epoch": 0.75, "grad_norm": 0.3542991578578949, "learning_rate": 0.00042849855038798283, "loss": 0.0986, "step": 426 }, { "epoch": 0.75, "eval_loss": 0.07792978733778, "eval_runtime": 14.8115, "eval_samples_per_second": 32.205, "eval_steps_per_second": 8.102, "step": 426 }, { "epoch": 0.76, "grad_norm": 0.29685521125793457, "learning_rate": 0.00042817189149524517, "loss": 0.11, "step": 427 }, { "epoch": 0.76, "grad_norm": 0.2116887867450714, "learning_rate": 0.00042784461324843194, "loss": 0.0686, "step": 428 }, { "epoch": 0.76, "grad_norm": 0.4866883456707001, "learning_rate": 0.00042751671678521486, "loss": 0.0824, "step": 429 }, { "epoch": 0.76, "grad_norm": 0.1293468475341797, "learning_rate": 0.00042718820324541475, "loss": 0.0464, "step": 430 }, { "epoch": 0.76, "grad_norm": 0.3253125250339508, "learning_rate": 0.0004268590737709972, "loss": 0.0996, "step": 431 }, { "epoch": 0.76, "grad_norm": 0.25559771060943604, "learning_rate": 0.00042652932950606917, "loss": 0.0545, "step": 432 }, { "epoch": 0.77, "grad_norm": 0.2788093686103821, "learning_rate": 0.0004261989715968746, "loss": 0.0502, "step": 433 }, { "epoch": 0.77, "grad_norm": 0.6902124285697937, "learning_rate": 0.00042586800119179046, "loss": 0.1598, "step": 434 }, { "epoch": 0.77, "grad_norm": 0.4788605570793152, "learning_rate": 0.00042553641944132316, "loss": 0.1552, "step": 435 }, { "epoch": 0.77, "grad_norm": 0.43495067954063416, "learning_rate": 0.00042520422749810395, "loss": 0.0907, "step": 436 }, { "epoch": 0.77, "grad_norm": 0.3549440801143646, "learning_rate": 0.0004248714265168853, "loss": 0.1152, "step": 437 }, { "epoch": 0.77, "grad_norm": 0.7210204601287842, "learning_rate": 0.00042453801765453687, "loss": 0.1891, "step": 438 }, { "epoch": 0.78, "grad_norm": 0.4578750729560852, "learning_rate": 0.00042420400207004126, "loss": 0.1383, "step": 439 }, { "epoch": 0.78, "grad_norm": 0.3323976993560791, "learning_rate": 0.00042386938092449036, "loss": 0.0936, "step": 440 }, { "epoch": 0.78, "grad_norm": 0.15145371854305267, "learning_rate": 0.00042353415538108076, "loss": 0.0608, "step": 441 }, { "epoch": 0.78, "grad_norm": 0.10744435340166092, "learning_rate": 0.00042319832660511037, "loss": 0.0865, "step": 442 }, { "epoch": 0.78, "grad_norm": 0.13599476218223572, "learning_rate": 0.0004228618957639738, "loss": 0.0763, "step": 443 }, { "epoch": 0.79, "grad_norm": 0.18250028789043427, "learning_rate": 0.00042252486402715865, "loss": 0.0813, "step": 444 }, { "epoch": 0.79, "grad_norm": 0.5180188417434692, "learning_rate": 0.00042218723256624136, "loss": 0.1603, "step": 445 }, { "epoch": 0.79, "grad_norm": 0.2943187355995178, "learning_rate": 0.000421849002554883, "loss": 0.1031, "step": 446 }, { "epoch": 0.79, "grad_norm": 0.14898087084293365, "learning_rate": 0.0004215101751688253, "loss": 0.071, "step": 447 }, { "epoch": 0.79, "grad_norm": 0.2951905131340027, "learning_rate": 0.00042117075158588663, "loss": 0.0772, "step": 448 }, { "epoch": 0.79, "grad_norm": 0.39807453751564026, "learning_rate": 0.00042083073298595787, "loss": 0.0561, "step": 449 }, { "epoch": 0.8, "grad_norm": 0.45217999815940857, "learning_rate": 0.0004204901205509981, "loss": 0.1076, "step": 450 }, { "epoch": 0.8, "grad_norm": 0.24114732444286346, "learning_rate": 0.000420148915465031, "loss": 0.1169, "step": 451 }, { "epoch": 0.8, "grad_norm": 0.6120204329490662, "learning_rate": 0.00041980711891413994, "loss": 0.1144, "step": 452 }, { "epoch": 0.8, "grad_norm": 0.3900619447231293, "learning_rate": 0.0004194647320864646, "loss": 0.0806, "step": 453 }, { "epoch": 0.8, "grad_norm": 0.3331635296344757, "learning_rate": 0.0004191217561721967, "loss": 0.0655, "step": 454 }, { "epoch": 0.8, "grad_norm": 0.29893186688423157, "learning_rate": 0.0004187781923635753, "loss": 0.0482, "step": 455 }, { "epoch": 0.81, "grad_norm": 0.20024164021015167, "learning_rate": 0.00041843404185488346, "loss": 0.0773, "step": 456 }, { "epoch": 0.81, "grad_norm": 0.3644329905509949, "learning_rate": 0.0004180893058424435, "loss": 0.1062, "step": 457 }, { "epoch": 0.81, "grad_norm": 0.5457159280776978, "learning_rate": 0.0004177439855246132, "loss": 0.1901, "step": 458 }, { "epoch": 0.81, "grad_norm": 0.282032310962677, "learning_rate": 0.0004173980821017812, "loss": 0.0656, "step": 459 }, { "epoch": 0.81, "grad_norm": 0.1957680881023407, "learning_rate": 0.00041705159677636334, "loss": 0.0725, "step": 460 }, { "epoch": 0.82, "grad_norm": 0.2736223042011261, "learning_rate": 0.00041670453075279827, "loss": 0.0897, "step": 461 }, { "epoch": 0.82, "grad_norm": 0.2145017832517624, "learning_rate": 0.0004163568852375431, "loss": 0.046, "step": 462 }, { "epoch": 0.82, "grad_norm": 0.1434750258922577, "learning_rate": 0.00041600866143906947, "loss": 0.0483, "step": 463 }, { "epoch": 0.82, "grad_norm": 0.2438279092311859, "learning_rate": 0.000415659860567859, "loss": 0.0935, "step": 464 }, { "epoch": 0.82, "grad_norm": 0.24830487370491028, "learning_rate": 0.00041531048383639966, "loss": 0.1061, "step": 465 }, { "epoch": 0.82, "grad_norm": 0.25185227394104004, "learning_rate": 0.000414960532459181, "loss": 0.082, "step": 466 }, { "epoch": 0.83, "grad_norm": 0.391631156206131, "learning_rate": 0.00041461000765269, "loss": 0.1274, "step": 467 }, { "epoch": 0.83, "grad_norm": 0.30484774708747864, "learning_rate": 0.0004142589106354071, "loss": 0.0672, "step": 468 }, { "epoch": 0.83, "grad_norm": 0.2584599554538727, "learning_rate": 0.0004139072426278021, "loss": 0.0863, "step": 469 }, { "epoch": 0.83, "grad_norm": 0.27182772755622864, "learning_rate": 0.0004135550048523292, "loss": 0.0996, "step": 470 }, { "epoch": 0.83, "grad_norm": 0.2670001685619354, "learning_rate": 0.00041320219853342347, "loss": 0.0592, "step": 471 }, { "epoch": 0.84, "grad_norm": 0.19571639597415924, "learning_rate": 0.0004128488248974962, "loss": 0.0618, "step": 472 }, { "epoch": 0.84, "grad_norm": 0.436814546585083, "learning_rate": 0.00041249488517293095, "loss": 0.1131, "step": 473 }, { "epoch": 0.84, "grad_norm": 0.21684250235557556, "learning_rate": 0.0004121403805900789, "loss": 0.0759, "step": 474 }, { "epoch": 0.84, "grad_norm": 0.39313605427742004, "learning_rate": 0.0004117853123812549, "loss": 0.0992, "step": 475 }, { "epoch": 0.84, "grad_norm": 0.3653202950954437, "learning_rate": 0.00041142968178073294, "loss": 0.099, "step": 476 }, { "epoch": 0.84, "grad_norm": 0.36615628004074097, "learning_rate": 0.00041107349002474206, "loss": 0.06, "step": 477 }, { "epoch": 0.85, "grad_norm": 0.2431243658065796, "learning_rate": 0.00041071673835146194, "loss": 0.0689, "step": 478 }, { "epoch": 0.85, "grad_norm": 0.7869367599487305, "learning_rate": 0.00041035942800101864, "loss": 0.1308, "step": 479 }, { "epoch": 0.85, "grad_norm": 0.2831230163574219, "learning_rate": 0.0004100015602154802, "loss": 0.087, "step": 480 }, { "epoch": 0.85, "grad_norm": 0.3709629774093628, "learning_rate": 0.0004096431362388525, "loss": 0.0822, "step": 481 }, { "epoch": 0.85, "grad_norm": 0.4082586467266083, "learning_rate": 0.0004092841573170748, "loss": 0.1114, "step": 482 }, { "epoch": 0.85, "grad_norm": 0.2919554114341736, "learning_rate": 0.0004089246246980154, "loss": 0.1059, "step": 483 }, { "epoch": 0.86, "grad_norm": 0.3750731945037842, "learning_rate": 0.0004085645396314673, "loss": 0.082, "step": 484 }, { "epoch": 0.86, "grad_norm": 0.21013659238815308, "learning_rate": 0.000408203903369144, "loss": 0.0819, "step": 485 }, { "epoch": 0.86, "grad_norm": 0.20771674811840057, "learning_rate": 0.00040784271716467503, "loss": 0.0687, "step": 486 }, { "epoch": 0.86, "grad_norm": 0.157434344291687, "learning_rate": 0.00040748098227360154, "loss": 0.0826, "step": 487 }, { "epoch": 0.86, "grad_norm": 0.40467727184295654, "learning_rate": 0.000407118699953372, "loss": 0.1131, "step": 488 }, { "epoch": 0.87, "grad_norm": 0.17521728575229645, "learning_rate": 0.0004067558714633378, "loss": 0.116, "step": 489 }, { "epoch": 0.87, "grad_norm": 0.2975709140300751, "learning_rate": 0.0004063924980647492, "loss": 0.0787, "step": 490 }, { "epoch": 0.87, "grad_norm": 0.22513332962989807, "learning_rate": 0.0004060285810207503, "loss": 0.0754, "step": 491 }, { "epoch": 0.87, "grad_norm": 0.2939409613609314, "learning_rate": 0.00040566412159637514, "loss": 0.0505, "step": 492 }, { "epoch": 0.87, "grad_norm": 0.21415212750434875, "learning_rate": 0.000405299121058543, "loss": 0.0486, "step": 493 }, { "epoch": 0.87, "grad_norm": 0.24846945703029633, "learning_rate": 0.00040493358067605445, "loss": 0.0645, "step": 494 }, { "epoch": 0.88, "grad_norm": 0.42928287386894226, "learning_rate": 0.00040456750171958655, "loss": 0.1455, "step": 495 }, { "epoch": 0.88, "grad_norm": 0.30920714139938354, "learning_rate": 0.0004042008854616883, "loss": 0.0743, "step": 496 }, { "epoch": 0.88, "grad_norm": 0.43211719393730164, "learning_rate": 0.00040383373317677687, "loss": 0.1037, "step": 497 }, { "epoch": 0.88, "grad_norm": 0.49942275881767273, "learning_rate": 0.00040346604614113215, "loss": 0.123, "step": 498 }, { "epoch": 0.88, "grad_norm": 0.18615621328353882, "learning_rate": 0.00040309782563289353, "loss": 0.0783, "step": 499 }, { "epoch": 0.88, "grad_norm": 0.22238926589488983, "learning_rate": 0.0004027290729320545, "loss": 0.0698, "step": 500 }, { "epoch": 0.89, "grad_norm": 0.31746548414230347, "learning_rate": 0.0004023597893204586, "loss": 0.1682, "step": 501 }, { "epoch": 0.89, "grad_norm": 0.19328100979328156, "learning_rate": 0.00040198997608179477, "loss": 0.1028, "step": 502 }, { "epoch": 0.89, "grad_norm": 0.15466806292533875, "learning_rate": 0.00040161963450159333, "loss": 0.065, "step": 503 }, { "epoch": 0.89, "grad_norm": 0.3000398874282837, "learning_rate": 0.00040124876586722103, "loss": 0.1071, "step": 504 }, { "epoch": 0.89, "grad_norm": 0.16753748059272766, "learning_rate": 0.00040087737146787654, "loss": 0.056, "step": 505 }, { "epoch": 0.9, "grad_norm": 0.17570586502552032, "learning_rate": 0.00040050545259458654, "loss": 0.0732, "step": 506 }, { "epoch": 0.9, "grad_norm": 0.19240190088748932, "learning_rate": 0.00040013301054020055, "loss": 0.0444, "step": 507 }, { "epoch": 0.9, "grad_norm": 0.23935984075069427, "learning_rate": 0.00039976004659938714, "loss": 0.0583, "step": 508 }, { "epoch": 0.9, "grad_norm": 0.22633028030395508, "learning_rate": 0.00039938656206862857, "loss": 0.065, "step": 509 }, { "epoch": 0.9, "grad_norm": 0.18621531128883362, "learning_rate": 0.000399012558246217, "loss": 0.0489, "step": 510 }, { "epoch": 0.9, "grad_norm": 0.37711310386657715, "learning_rate": 0.0003986380364322498, "loss": 0.1367, "step": 511 }, { "epoch": 0.91, "grad_norm": 0.26448771357536316, "learning_rate": 0.00039826299792862475, "loss": 0.076, "step": 512 }, { "epoch": 0.91, "grad_norm": 0.22461633384227753, "learning_rate": 0.00039788744403903604, "loss": 0.0734, "step": 513 }, { "epoch": 0.91, "grad_norm": 0.23908165097236633, "learning_rate": 0.00039751137606896907, "loss": 0.0718, "step": 514 }, { "epoch": 0.91, "grad_norm": 0.37807080149650574, "learning_rate": 0.00039713479532569646, "loss": 0.1495, "step": 515 }, { "epoch": 0.91, "grad_norm": 0.16840259730815887, "learning_rate": 0.00039675770311827337, "loss": 0.0491, "step": 516 }, { "epoch": 0.91, "grad_norm": 0.35179728269577026, "learning_rate": 0.00039638010075753274, "loss": 0.0839, "step": 517 }, { "epoch": 0.92, "grad_norm": 0.3631207048892975, "learning_rate": 0.00039600198955608084, "loss": 0.1348, "step": 518 }, { "epoch": 0.92, "grad_norm": 0.38650691509246826, "learning_rate": 0.00039562337082829304, "loss": 0.15, "step": 519 }, { "epoch": 0.92, "grad_norm": 0.2523843050003052, "learning_rate": 0.00039524424589030866, "loss": 0.1172, "step": 520 }, { "epoch": 0.92, "grad_norm": 0.2690166234970093, "learning_rate": 0.00039486461606002686, "loss": 0.0619, "step": 521 }, { "epoch": 0.92, "grad_norm": 0.31193405389785767, "learning_rate": 0.0003944844826571018, "loss": 0.0834, "step": 522 }, { "epoch": 0.93, "grad_norm": 0.21751855313777924, "learning_rate": 0.00039410384700293814, "loss": 0.068, "step": 523 }, { "epoch": 0.93, "grad_norm": 0.34191232919692993, "learning_rate": 0.0003937227104206865, "loss": 0.1337, "step": 524 }, { "epoch": 0.93, "grad_norm": 0.34457269310951233, "learning_rate": 0.0003933410742352388, "loss": 0.0929, "step": 525 }, { "epoch": 0.93, "grad_norm": 0.22599942982196808, "learning_rate": 0.0003929589397732236, "loss": 0.0899, "step": 526 }, { "epoch": 0.93, "grad_norm": 0.23162932693958282, "learning_rate": 0.0003925763083630017, "loss": 0.0869, "step": 527 }, { "epoch": 0.93, "grad_norm": 0.19502510130405426, "learning_rate": 0.00039219318133466104, "loss": 0.0834, "step": 528 }, { "epoch": 0.94, "grad_norm": 0.2539670169353485, "learning_rate": 0.0003918095600200128, "loss": 0.0589, "step": 529 }, { "epoch": 0.94, "grad_norm": 0.15578749775886536, "learning_rate": 0.00039142544575258614, "loss": 0.0471, "step": 530 }, { "epoch": 0.94, "grad_norm": 0.41006144881248474, "learning_rate": 0.00039104083986762396, "loss": 0.1215, "step": 531 }, { "epoch": 0.94, "grad_norm": 0.3161672055721283, "learning_rate": 0.00039065574370207785, "loss": 0.0599, "step": 532 }, { "epoch": 0.94, "grad_norm": 0.2556127607822418, "learning_rate": 0.00039027015859460394, "loss": 0.0882, "step": 533 }, { "epoch": 0.94, "grad_norm": 0.5484500527381897, "learning_rate": 0.000389884085885558, "loss": 0.1342, "step": 534 }, { "epoch": 0.95, "grad_norm": 0.3688224256038666, "learning_rate": 0.0003894975269169906, "loss": 0.062, "step": 535 }, { "epoch": 0.95, "grad_norm": 0.6328185796737671, "learning_rate": 0.0003891104830326427, "loss": 0.1068, "step": 536 }, { "epoch": 0.95, "grad_norm": 0.5094593167304993, "learning_rate": 0.00038872295557794103, "loss": 0.0593, "step": 537 }, { "epoch": 0.95, "grad_norm": 0.44920942187309265, "learning_rate": 0.0003883349458999931, "loss": 0.1134, "step": 538 }, { "epoch": 0.95, "grad_norm": 0.25559201836586, "learning_rate": 0.0003879464553475828, "loss": 0.0842, "step": 539 }, { "epoch": 0.96, "grad_norm": 0.24992522597312927, "learning_rate": 0.0003875574852711656, "loss": 0.0684, "step": 540 }, { "epoch": 0.96, "grad_norm": 0.7482407093048096, "learning_rate": 0.0003871680370228639, "loss": 0.1698, "step": 541 }, { "epoch": 0.96, "grad_norm": 0.42716777324676514, "learning_rate": 0.00038677811195646233, "loss": 0.1335, "step": 542 }, { "epoch": 0.96, "grad_norm": 0.5867021083831787, "learning_rate": 0.0003863877114274029, "loss": 0.153, "step": 543 }, { "epoch": 0.96, "grad_norm": 0.14882822334766388, "learning_rate": 0.0003859968367927805, "loss": 0.0548, "step": 544 }, { "epoch": 0.96, "grad_norm": 0.16213174164295197, "learning_rate": 0.0003856054894113381, "loss": 0.0859, "step": 545 }, { "epoch": 0.97, "grad_norm": 0.13216906785964966, "learning_rate": 0.0003852136706434619, "loss": 0.0837, "step": 546 }, { "epoch": 0.97, "grad_norm": 0.28230682015419006, "learning_rate": 0.00038482138185117685, "loss": 0.0746, "step": 547 }, { "epoch": 0.97, "grad_norm": 0.15776745975017548, "learning_rate": 0.0003844286243981417, "loss": 0.0758, "step": 548 }, { "epoch": 0.97, "grad_norm": 0.38748612999916077, "learning_rate": 0.0003840353996496444, "loss": 0.0946, "step": 549 }, { "epoch": 0.97, "grad_norm": 0.4377779960632324, "learning_rate": 0.0003836417089725971, "loss": 0.078, "step": 550 }, { "epoch": 0.97, "grad_norm": 0.4776962101459503, "learning_rate": 0.0003832475537355319, "loss": 0.0996, "step": 551 }, { "epoch": 0.98, "grad_norm": 0.16078083217144012, "learning_rate": 0.00038285293530859553, "loss": 0.0813, "step": 552 }, { "epoch": 0.98, "grad_norm": 0.19620949029922485, "learning_rate": 0.00038245785506354514, "loss": 0.0716, "step": 553 }, { "epoch": 0.98, "grad_norm": 0.23539945483207703, "learning_rate": 0.0003820623143737427, "loss": 0.0727, "step": 554 }, { "epoch": 0.98, "grad_norm": 0.2797366678714752, "learning_rate": 0.0003816663146141514, "loss": 0.0307, "step": 555 }, { "epoch": 0.98, "grad_norm": 0.31704849004745483, "learning_rate": 0.00038126985716132976, "loss": 0.0522, "step": 556 }, { "epoch": 0.99, "grad_norm": 1.038294792175293, "learning_rate": 0.00038087294339342765, "loss": 0.1602, "step": 557 }, { "epoch": 0.99, "grad_norm": 0.39535316824913025, "learning_rate": 0.00038047557469018077, "loss": 0.0672, "step": 558 }, { "epoch": 0.99, "grad_norm": 0.5337291359901428, "learning_rate": 0.00038007775243290666, "loss": 0.238, "step": 559 }, { "epoch": 0.99, "grad_norm": 0.7618711590766907, "learning_rate": 0.0003796794780044992, "loss": 0.0741, "step": 560 }, { "epoch": 0.99, "grad_norm": 0.3507292568683624, "learning_rate": 0.0003792807527894242, "loss": 0.1035, "step": 561 }, { "epoch": 0.99, "grad_norm": 0.29699352383613586, "learning_rate": 0.00037888157817371455, "loss": 0.0732, "step": 562 }, { "epoch": 1.0, "grad_norm": 0.1690889596939087, "learning_rate": 0.0003784819555449651, "loss": 0.0625, "step": 563 }, { "epoch": 1.0, "grad_norm": 0.28516581654548645, "learning_rate": 0.0003780818862923284, "loss": 0.0705, "step": 564 }, { "epoch": 1.0, "grad_norm": 0.3408360481262207, "learning_rate": 0.00037768137180650913, "loss": 0.1025, "step": 565 }, { "epoch": 1.0, "grad_norm": 0.28147757053375244, "learning_rate": 0.00037728041347976005, "loss": 0.0495, "step": 566 }, { "epoch": 1.0, "grad_norm": 0.31090235710144043, "learning_rate": 0.00037687901270587655, "loss": 0.0874, "step": 567 }, { "epoch": 1.0, "grad_norm": 0.29111558198928833, "learning_rate": 0.00037647717088019217, "loss": 0.0589, "step": 568 }, { "epoch": 1.0, "eval_loss": 0.08437130600214005, "eval_runtime": 14.722, "eval_samples_per_second": 32.4, "eval_steps_per_second": 8.151, "step": 568 }, { "epoch": 1.01, "grad_norm": 0.14942067861557007, "learning_rate": 0.0003760748893995736, "loss": 0.0391, "step": 569 }, { "epoch": 1.01, "grad_norm": 0.28915029764175415, "learning_rate": 0.0003756721696624156, "loss": 0.0522, "step": 570 }, { "epoch": 1.01, "grad_norm": 0.1540856510400772, "learning_rate": 0.0003752690130686367, "loss": 0.0473, "step": 571 }, { "epoch": 1.01, "grad_norm": 0.31572067737579346, "learning_rate": 0.0003748654210196739, "loss": 0.058, "step": 572 }, { "epoch": 1.01, "grad_norm": 0.4497004449367523, "learning_rate": 0.0003744613949184779, "loss": 0.0937, "step": 573 }, { "epoch": 1.02, "grad_norm": 0.48680734634399414, "learning_rate": 0.0003740569361695082, "loss": 0.0925, "step": 574 }, { "epoch": 1.02, "grad_norm": 0.3604874610900879, "learning_rate": 0.00037365204617872836, "loss": 0.0273, "step": 575 }, { "epoch": 1.02, "grad_norm": 0.31378456950187683, "learning_rate": 0.0003732467263536008, "loss": 0.048, "step": 576 }, { "epoch": 1.02, "grad_norm": 0.37197205424308777, "learning_rate": 0.0003728409781030824, "loss": 0.0445, "step": 577 }, { "epoch": 1.02, "grad_norm": 0.09396038949489594, "learning_rate": 0.00037243480283761913, "loss": 0.0102, "step": 578 }, { "epoch": 1.02, "grad_norm": 0.4994851052761078, "learning_rate": 0.00037202820196914133, "loss": 0.074, "step": 579 }, { "epoch": 1.03, "grad_norm": 0.20099425315856934, "learning_rate": 0.0003716211769110589, "loss": 0.0239, "step": 580 }, { "epoch": 1.03, "grad_norm": 0.33086463809013367, "learning_rate": 0.0003712137290782561, "loss": 0.0305, "step": 581 }, { "epoch": 1.03, "grad_norm": 0.3871704041957855, "learning_rate": 0.0003708058598870871, "loss": 0.0309, "step": 582 }, { "epoch": 1.03, "grad_norm": 0.514127790927887, "learning_rate": 0.0003703975707553706, "loss": 0.0639, "step": 583 }, { "epoch": 1.03, "grad_norm": 0.29386666417121887, "learning_rate": 0.000369988863102385, "loss": 0.0778, "step": 584 }, { "epoch": 1.03, "grad_norm": 0.4717571437358856, "learning_rate": 0.0003695797383488638, "loss": 0.0414, "step": 585 }, { "epoch": 1.04, "grad_norm": 0.61000657081604, "learning_rate": 0.0003691701979169903, "loss": 0.0687, "step": 586 }, { "epoch": 1.04, "grad_norm": 0.3639252483844757, "learning_rate": 0.0003687602432303926, "loss": 0.0337, "step": 587 }, { "epoch": 1.04, "grad_norm": 0.20936115086078644, "learning_rate": 0.0003683498757141391, "loss": 0.0232, "step": 588 }, { "epoch": 1.04, "grad_norm": 0.5917474031448364, "learning_rate": 0.00036793909679473294, "loss": 0.0564, "step": 589 }, { "epoch": 1.04, "grad_norm": 0.23065000772476196, "learning_rate": 0.00036752790790010767, "loss": 0.0246, "step": 590 }, { "epoch": 1.05, "grad_norm": 0.1876888871192932, "learning_rate": 0.00036711631045962173, "loss": 0.0351, "step": 591 }, { "epoch": 1.05, "grad_norm": 0.6483283042907715, "learning_rate": 0.000366704305904054, "loss": 0.0628, "step": 592 }, { "epoch": 1.05, "grad_norm": 0.8450531363487244, "learning_rate": 0.0003662918956655983, "loss": 0.0922, "step": 593 }, { "epoch": 1.05, "grad_norm": 0.5190649628639221, "learning_rate": 0.00036587908117785887, "loss": 0.0715, "step": 594 }, { "epoch": 1.05, "grad_norm": 0.597562849521637, "learning_rate": 0.000365465863875845, "loss": 0.0728, "step": 595 }, { "epoch": 1.05, "grad_norm": 0.5079246759414673, "learning_rate": 0.0003650522451959663, "loss": 0.1145, "step": 596 }, { "epoch": 1.06, "grad_norm": 0.4016817808151245, "learning_rate": 0.0003646382265760276, "loss": 0.1373, "step": 597 }, { "epoch": 1.06, "grad_norm": 0.2420119345188141, "learning_rate": 0.00036422380945522426, "loss": 0.0428, "step": 598 }, { "epoch": 1.06, "grad_norm": 0.2923775017261505, "learning_rate": 0.00036380899527413646, "loss": 0.0407, "step": 599 }, { "epoch": 1.06, "grad_norm": 0.2968994379043579, "learning_rate": 0.00036339378547472497, "loss": 0.039, "step": 600 }, { "epoch": 1.06, "grad_norm": 0.3694530129432678, "learning_rate": 0.0003629781815003256, "loss": 0.0765, "step": 601 }, { "epoch": 1.07, "grad_norm": 0.19854502379894257, "learning_rate": 0.0003625621847956443, "loss": 0.0283, "step": 602 }, { "epoch": 1.07, "grad_norm": 0.16821172833442688, "learning_rate": 0.0003621457968067526, "loss": 0.0551, "step": 603 }, { "epoch": 1.07, "grad_norm": 0.5689147114753723, "learning_rate": 0.00036172901898108177, "loss": 0.0818, "step": 604 }, { "epoch": 1.07, "grad_norm": 0.16156058013439178, "learning_rate": 0.0003613118527674185, "loss": 0.0368, "step": 605 }, { "epoch": 1.07, "grad_norm": 0.21022377908229828, "learning_rate": 0.00036089429961589926, "loss": 0.0614, "step": 606 }, { "epoch": 1.07, "grad_norm": 0.3432522714138031, "learning_rate": 0.00036047636097800593, "loss": 0.0393, "step": 607 }, { "epoch": 1.08, "grad_norm": 0.2537219524383545, "learning_rate": 0.00036005803830656036, "loss": 0.0852, "step": 608 }, { "epoch": 1.08, "grad_norm": 0.209491565823555, "learning_rate": 0.00035963933305571916, "loss": 0.0476, "step": 609 }, { "epoch": 1.08, "grad_norm": 0.2286662459373474, "learning_rate": 0.00035922024668096883, "loss": 0.0614, "step": 610 }, { "epoch": 1.08, "grad_norm": 0.5772972106933594, "learning_rate": 0.00035880078063912105, "loss": 0.0546, "step": 611 }, { "epoch": 1.08, "grad_norm": 0.37829965353012085, "learning_rate": 0.0003583809363883069, "loss": 0.0526, "step": 612 }, { "epoch": 1.08, "grad_norm": 0.1876819133758545, "learning_rate": 0.0003579607153879724, "loss": 0.0339, "step": 613 }, { "epoch": 1.09, "grad_norm": 0.42904049158096313, "learning_rate": 0.0003575401190988732, "loss": 0.0705, "step": 614 }, { "epoch": 1.09, "grad_norm": 0.2780819833278656, "learning_rate": 0.0003571191489830693, "loss": 0.0425, "step": 615 }, { "epoch": 1.09, "grad_norm": 0.3338189721107483, "learning_rate": 0.00035669780650392056, "loss": 0.0713, "step": 616 }, { "epoch": 1.09, "grad_norm": 0.2791332006454468, "learning_rate": 0.000356276093126081, "loss": 0.0392, "step": 617 }, { "epoch": 1.09, "grad_norm": 0.4691467881202698, "learning_rate": 0.0003558540103154939, "loss": 0.0756, "step": 618 }, { "epoch": 1.1, "grad_norm": 0.34194234013557434, "learning_rate": 0.00035543155953938674, "loss": 0.057, "step": 619 }, { "epoch": 1.1, "grad_norm": 0.43898969888687134, "learning_rate": 0.00035500874226626633, "loss": 0.1484, "step": 620 }, { "epoch": 1.1, "grad_norm": 0.3562189042568207, "learning_rate": 0.00035458555996591325, "loss": 0.0801, "step": 621 }, { "epoch": 1.1, "grad_norm": 0.2978869080543518, "learning_rate": 0.0003541620141093771, "loss": 0.0422, "step": 622 }, { "epoch": 1.1, "grad_norm": 0.415714293718338, "learning_rate": 0.00035373810616897116, "loss": 0.042, "step": 623 }, { "epoch": 1.1, "grad_norm": 0.28547269105911255, "learning_rate": 0.00035331383761826756, "loss": 0.0722, "step": 624 }, { "epoch": 1.11, "grad_norm": 0.2831112742424011, "learning_rate": 0.00035288920993209173, "loss": 0.0339, "step": 625 }, { "epoch": 1.11, "grad_norm": 0.372010201215744, "learning_rate": 0.00035246422458651766, "loss": 0.0573, "step": 626 }, { "epoch": 1.11, "grad_norm": 0.07014724612236023, "learning_rate": 0.0003520388830588625, "loss": 0.0108, "step": 627 }, { "epoch": 1.11, "grad_norm": 0.5464847087860107, "learning_rate": 0.0003516131868276817, "loss": 0.0871, "step": 628 }, { "epoch": 1.11, "grad_norm": 0.118097685277462, "learning_rate": 0.00035118713737276376, "loss": 0.0176, "step": 629 }, { "epoch": 1.11, "grad_norm": 0.43580326437950134, "learning_rate": 0.00035076073617512475, "loss": 0.0817, "step": 630 }, { "epoch": 1.12, "grad_norm": 0.42866209149360657, "learning_rate": 0.00035033398471700367, "loss": 0.1195, "step": 631 }, { "epoch": 1.12, "grad_norm": 0.42996305227279663, "learning_rate": 0.0003499068844818571, "loss": 0.12, "step": 632 }, { "epoch": 1.12, "grad_norm": 0.4283413290977478, "learning_rate": 0.0003494794369543539, "loss": 0.085, "step": 633 }, { "epoch": 1.12, "grad_norm": 0.693706214427948, "learning_rate": 0.0003490516436203703, "loss": 0.126, "step": 634 }, { "epoch": 1.12, "grad_norm": 0.264961838722229, "learning_rate": 0.00034862350596698456, "loss": 0.0556, "step": 635 }, { "epoch": 1.13, "grad_norm": 0.2530398368835449, "learning_rate": 0.00034819502548247175, "loss": 0.0514, "step": 636 }, { "epoch": 1.13, "grad_norm": 0.18521098792552948, "learning_rate": 0.0003477662036562989, "loss": 0.0387, "step": 637 }, { "epoch": 1.13, "grad_norm": 0.34398314356803894, "learning_rate": 0.00034733704197911937, "loss": 0.1047, "step": 638 }, { "epoch": 1.13, "grad_norm": 0.16019423305988312, "learning_rate": 0.000346907541942768, "loss": 0.0299, "step": 639 }, { "epoch": 1.13, "grad_norm": 0.3269899785518646, "learning_rate": 0.00034647770504025587, "loss": 0.0405, "step": 640 }, { "epoch": 1.13, "grad_norm": 0.46410876512527466, "learning_rate": 0.00034604753276576487, "loss": 0.0855, "step": 641 }, { "epoch": 1.14, "grad_norm": 0.33000048995018005, "learning_rate": 0.000345617026614643, "loss": 0.0759, "step": 642 }, { "epoch": 1.14, "grad_norm": 0.31162315607070923, "learning_rate": 0.0003451861880833986, "loss": 0.0558, "step": 643 }, { "epoch": 1.14, "grad_norm": 0.42918407917022705, "learning_rate": 0.0003447550186696956, "loss": 0.0365, "step": 644 }, { "epoch": 1.14, "grad_norm": 0.2732023298740387, "learning_rate": 0.00034432351987234786, "loss": 0.0616, "step": 645 }, { "epoch": 1.14, "grad_norm": 0.1899593621492386, "learning_rate": 0.00034389169319131476, "loss": 0.0286, "step": 646 }, { "epoch": 1.14, "grad_norm": 0.447968065738678, "learning_rate": 0.0003434595401276947, "loss": 0.0701, "step": 647 }, { "epoch": 1.15, "grad_norm": 0.15938018262386322, "learning_rate": 0.0003430270621837213, "loss": 0.026, "step": 648 }, { "epoch": 1.15, "grad_norm": 0.17606832087039948, "learning_rate": 0.0003425942608627572, "loss": 0.0245, "step": 649 }, { "epoch": 1.15, "grad_norm": 0.49266988039016724, "learning_rate": 0.0003421611376692892, "loss": 0.0823, "step": 650 }, { "epoch": 1.15, "grad_norm": 0.3935730755329132, "learning_rate": 0.0003417276941089232, "loss": 0.0426, "step": 651 }, { "epoch": 1.15, "grad_norm": 0.5984533429145813, "learning_rate": 0.0003412939316883782, "loss": 0.0833, "step": 652 }, { "epoch": 1.16, "grad_norm": 0.3196690082550049, "learning_rate": 0.00034085985191548217, "loss": 0.0337, "step": 653 }, { "epoch": 1.16, "grad_norm": 0.39022788405418396, "learning_rate": 0.000340425456299166, "loss": 0.0235, "step": 654 }, { "epoch": 1.16, "grad_norm": 0.29681891202926636, "learning_rate": 0.00033999074634945856, "loss": 0.0155, "step": 655 }, { "epoch": 1.16, "grad_norm": 0.5547076463699341, "learning_rate": 0.0003395557235774813, "loss": 0.0942, "step": 656 }, { "epoch": 1.16, "grad_norm": 0.9071078300476074, "learning_rate": 0.00033912038949544316, "loss": 0.1004, "step": 657 }, { "epoch": 1.16, "grad_norm": 0.5410562753677368, "learning_rate": 0.00033868474561663534, "loss": 0.0743, "step": 658 }, { "epoch": 1.17, "grad_norm": 0.5785720348358154, "learning_rate": 0.0003382487934554257, "loss": 0.1017, "step": 659 }, { "epoch": 1.17, "grad_norm": 0.60345858335495, "learning_rate": 0.0003378125345272539, "loss": 0.117, "step": 660 }, { "epoch": 1.17, "grad_norm": 0.23607215285301208, "learning_rate": 0.0003373759703486262, "loss": 0.0149, "step": 661 }, { "epoch": 1.17, "grad_norm": 0.3551620543003082, "learning_rate": 0.0003369391024371093, "loss": 0.0435, "step": 662 }, { "epoch": 1.17, "grad_norm": 0.4280162453651428, "learning_rate": 0.00033650193231132657, "loss": 0.1019, "step": 663 }, { "epoch": 1.17, "grad_norm": 0.12040708214044571, "learning_rate": 0.0003360644614909512, "loss": 0.0165, "step": 664 }, { "epoch": 1.18, "grad_norm": 0.6838027238845825, "learning_rate": 0.00033562669149670213, "loss": 0.0909, "step": 665 }, { "epoch": 1.18, "grad_norm": 0.2861780524253845, "learning_rate": 0.00033518862385033786, "loss": 0.0719, "step": 666 }, { "epoch": 1.18, "grad_norm": 0.24380998313426971, "learning_rate": 0.00033475026007465184, "loss": 0.0388, "step": 667 }, { "epoch": 1.18, "grad_norm": 0.41201332211494446, "learning_rate": 0.00033431160169346714, "loss": 0.0442, "step": 668 }, { "epoch": 1.18, "grad_norm": 0.3734363615512848, "learning_rate": 0.0003338726502316304, "loss": 0.0687, "step": 669 }, { "epoch": 1.19, "grad_norm": 0.21814176440238953, "learning_rate": 0.00033343340721500743, "loss": 0.0743, "step": 670 }, { "epoch": 1.19, "grad_norm": 0.17123498022556305, "learning_rate": 0.00033299387417047723, "loss": 0.0446, "step": 671 }, { "epoch": 1.19, "grad_norm": 0.3857256770133972, "learning_rate": 0.0003325540526259275, "loss": 0.0524, "step": 672 }, { "epoch": 1.19, "grad_norm": 0.7980711460113525, "learning_rate": 0.00033211394411024813, "loss": 0.0786, "step": 673 }, { "epoch": 1.19, "grad_norm": 0.31176111102104187, "learning_rate": 0.00033167355015332713, "loss": 0.0499, "step": 674 }, { "epoch": 1.19, "grad_norm": 0.6255938410758972, "learning_rate": 0.0003312328722860445, "loss": 0.0664, "step": 675 }, { "epoch": 1.2, "grad_norm": 0.3685753047466278, "learning_rate": 0.00033079191204026713, "loss": 0.0495, "step": 676 }, { "epoch": 1.2, "grad_norm": 0.4045291841030121, "learning_rate": 0.00033035067094884366, "loss": 0.0697, "step": 677 }, { "epoch": 1.2, "grad_norm": 0.6035248637199402, "learning_rate": 0.0003299091505455989, "loss": 0.1206, "step": 678 }, { "epoch": 1.2, "grad_norm": 0.3399547338485718, "learning_rate": 0.00032946735236532855, "loss": 0.035, "step": 679 }, { "epoch": 1.2, "grad_norm": 0.3604506552219391, "learning_rate": 0.0003290252779437939, "loss": 0.1087, "step": 680 }, { "epoch": 1.2, "grad_norm": 0.28665006160736084, "learning_rate": 0.0003285829288177167, "loss": 0.0858, "step": 681 }, { "epoch": 1.21, "grad_norm": 0.41967740654945374, "learning_rate": 0.0003281403065247733, "loss": 0.0851, "step": 682 }, { "epoch": 1.21, "grad_norm": 0.43989259004592896, "learning_rate": 0.00032769741260358997, "loss": 0.0793, "step": 683 }, { "epoch": 1.21, "grad_norm": 0.29274123907089233, "learning_rate": 0.00032725424859373687, "loss": 0.0538, "step": 684 }, { "epoch": 1.21, "grad_norm": 0.27231287956237793, "learning_rate": 0.0003268108160357233, "loss": 0.0692, "step": 685 }, { "epoch": 1.21, "grad_norm": 0.3030160963535309, "learning_rate": 0.0003263671164709918, "loss": 0.0786, "step": 686 }, { "epoch": 1.22, "grad_norm": 0.19824832677841187, "learning_rate": 0.0003259231514419135, "loss": 0.0699, "step": 687 }, { "epoch": 1.22, "grad_norm": 0.23121508955955505, "learning_rate": 0.0003254789224917818, "loss": 0.0499, "step": 688 }, { "epoch": 1.22, "grad_norm": 0.15240328013896942, "learning_rate": 0.0003250344311648079, "loss": 0.0431, "step": 689 }, { "epoch": 1.22, "grad_norm": 0.16165931522846222, "learning_rate": 0.000324589679006115, "loss": 0.0554, "step": 690 }, { "epoch": 1.22, "grad_norm": 0.29693150520324707, "learning_rate": 0.0003241446675617329, "loss": 0.0554, "step": 691 }, { "epoch": 1.22, "grad_norm": 0.7295424938201904, "learning_rate": 0.00032369939837859275, "loss": 0.1232, "step": 692 }, { "epoch": 1.23, "grad_norm": 0.43246909976005554, "learning_rate": 0.0003232538730045215, "loss": 0.0598, "step": 693 }, { "epoch": 1.23, "grad_norm": 0.18855467438697815, "learning_rate": 0.00032280809298823723, "loss": 0.0252, "step": 694 }, { "epoch": 1.23, "grad_norm": 0.26345816254615784, "learning_rate": 0.00032236205987934234, "loss": 0.0809, "step": 695 }, { "epoch": 1.23, "grad_norm": 0.497403085231781, "learning_rate": 0.00032191577522831984, "loss": 0.0482, "step": 696 }, { "epoch": 1.23, "grad_norm": 0.2640454173088074, "learning_rate": 0.0003214692405865264, "loss": 0.0538, "step": 697 }, { "epoch": 1.23, "grad_norm": 0.335443377494812, "learning_rate": 0.00032102245750618833, "loss": 0.1, "step": 698 }, { "epoch": 1.24, "grad_norm": 0.37383145093917847, "learning_rate": 0.00032057542754039526, "loss": 0.0767, "step": 699 }, { "epoch": 1.24, "grad_norm": 0.3101638853549957, "learning_rate": 0.00032012815224309496, "loss": 0.0499, "step": 700 }, { "epoch": 1.24, "grad_norm": 0.2282581478357315, "learning_rate": 0.00031968063316908815, "loss": 0.0424, "step": 701 }, { "epoch": 1.24, "grad_norm": 0.1553642451763153, "learning_rate": 0.00031923287187402287, "loss": 0.0446, "step": 702 }, { "epoch": 1.24, "grad_norm": 0.1549568623304367, "learning_rate": 0.0003187848699143894, "loss": 0.0252, "step": 703 }, { "epoch": 1.25, "grad_norm": 0.4515601694583893, "learning_rate": 0.00031833662884751416, "loss": 0.0852, "step": 704 }, { "epoch": 1.25, "grad_norm": 0.23497383296489716, "learning_rate": 0.0003178881502315552, "loss": 0.0291, "step": 705 }, { "epoch": 1.25, "grad_norm": 0.5249956846237183, "learning_rate": 0.000317439435625496, "loss": 0.0478, "step": 706 }, { "epoch": 1.25, "grad_norm": 0.7284122705459595, "learning_rate": 0.0003169904865891405, "loss": 0.0584, "step": 707 }, { "epoch": 1.25, "grad_norm": 0.34631770849227905, "learning_rate": 0.00031654130468310784, "loss": 0.092, "step": 708 }, { "epoch": 1.25, "grad_norm": 0.43634921312332153, "learning_rate": 0.000316091891468826, "loss": 0.0758, "step": 709 }, { "epoch": 1.26, "grad_norm": 0.2862977683544159, "learning_rate": 0.00031564224850852754, "loss": 0.057, "step": 710 }, { "epoch": 1.26, "eval_loss": 0.08585863560438156, "eval_runtime": 14.7073, "eval_samples_per_second": 32.433, "eval_steps_per_second": 8.159, "step": 710 }, { "epoch": 1.26, "grad_norm": 0.4469147324562073, "learning_rate": 0.0003151923773652436, "loss": 0.0807, "step": 711 }, { "epoch": 1.26, "grad_norm": 0.25653207302093506, "learning_rate": 0.00031474227960279834, "loss": 0.0618, "step": 712 }, { "epoch": 1.26, "grad_norm": 0.4219815731048584, "learning_rate": 0.0003142919567858039, "loss": 0.0592, "step": 713 }, { "epoch": 1.26, "grad_norm": 0.8190480470657349, "learning_rate": 0.0003138414104796545, "loss": 0.103, "step": 714 }, { "epoch": 1.26, "grad_norm": 0.3400764465332031, "learning_rate": 0.0003133906422505215, "loss": 0.0998, "step": 715 }, { "epoch": 1.27, "grad_norm": 0.21949267387390137, "learning_rate": 0.0003129396536653474, "loss": 0.0395, "step": 716 }, { "epoch": 1.27, "grad_norm": 0.30582305788993835, "learning_rate": 0.0003124884462918411, "loss": 0.0835, "step": 717 }, { "epoch": 1.27, "grad_norm": 0.11762493848800659, "learning_rate": 0.0003120370216984716, "loss": 0.026, "step": 718 }, { "epoch": 1.27, "grad_norm": 0.1867324858903885, "learning_rate": 0.00031158538145446314, "loss": 0.0544, "step": 719 }, { "epoch": 1.27, "grad_norm": 0.15806153416633606, "learning_rate": 0.00031113352712978996, "loss": 0.0406, "step": 720 }, { "epoch": 1.28, "grad_norm": 0.2605026662349701, "learning_rate": 0.00031068146029516997, "loss": 0.0431, "step": 721 }, { "epoch": 1.28, "grad_norm": 0.27978816628456116, "learning_rate": 0.00031022918252206005, "loss": 0.0948, "step": 722 }, { "epoch": 1.28, "grad_norm": 0.19412288069725037, "learning_rate": 0.00030977669538265017, "loss": 0.0305, "step": 723 }, { "epoch": 1.28, "grad_norm": 0.42198699712753296, "learning_rate": 0.0003093240004498585, "loss": 0.1205, "step": 724 }, { "epoch": 1.28, "grad_norm": 0.22501109540462494, "learning_rate": 0.0003088710992973249, "loss": 0.0368, "step": 725 }, { "epoch": 1.28, "grad_norm": 0.4651114046573639, "learning_rate": 0.00030841799349940667, "loss": 0.1044, "step": 726 }, { "epoch": 1.29, "grad_norm": 0.4760609567165375, "learning_rate": 0.00030796468463117216, "loss": 0.0829, "step": 727 }, { "epoch": 1.29, "grad_norm": 0.308748722076416, "learning_rate": 0.0003075111742683957, "loss": 0.0382, "step": 728 }, { "epoch": 1.29, "grad_norm": 0.21451212465763092, "learning_rate": 0.0003070574639875521, "loss": 0.0441, "step": 729 }, { "epoch": 1.29, "grad_norm": 0.1944577842950821, "learning_rate": 0.00030660355536581103, "loss": 0.0326, "step": 730 }, { "epoch": 1.29, "grad_norm": 0.5249868035316467, "learning_rate": 0.0003061494499810317, "loss": 0.0857, "step": 731 }, { "epoch": 1.3, "grad_norm": 0.2004554718732834, "learning_rate": 0.00030569514941175725, "loss": 0.0533, "step": 732 }, { "epoch": 1.3, "grad_norm": 0.21708889305591583, "learning_rate": 0.00030524065523720935, "loss": 0.0562, "step": 733 }, { "epoch": 1.3, "grad_norm": 0.36287248134613037, "learning_rate": 0.00030478596903728267, "loss": 0.1, "step": 734 }, { "epoch": 1.3, "grad_norm": 0.4405117928981781, "learning_rate": 0.0003043310923925394, "loss": 0.0929, "step": 735 }, { "epoch": 1.3, "grad_norm": 0.3343874514102936, "learning_rate": 0.0003038760268842036, "loss": 0.0549, "step": 736 }, { "epoch": 1.3, "grad_norm": 0.2254178822040558, "learning_rate": 0.00030342077409415606, "loss": 0.0495, "step": 737 }, { "epoch": 1.31, "grad_norm": 0.19972631335258484, "learning_rate": 0.00030296533560492854, "loss": 0.0301, "step": 738 }, { "epoch": 1.31, "grad_norm": 0.19470427930355072, "learning_rate": 0.0003025097129996983, "loss": 0.0485, "step": 739 }, { "epoch": 1.31, "grad_norm": 0.34024572372436523, "learning_rate": 0.0003020539078622824, "loss": 0.0509, "step": 740 }, { "epoch": 1.31, "grad_norm": 0.26424598693847656, "learning_rate": 0.00030159792177713294, "loss": 0.0293, "step": 741 }, { "epoch": 1.31, "grad_norm": 0.3307158946990967, "learning_rate": 0.00030114175632933043, "loss": 0.0302, "step": 742 }, { "epoch": 1.31, "grad_norm": 0.448448121547699, "learning_rate": 0.0003006854131045793, "loss": 0.0683, "step": 743 }, { "epoch": 1.32, "grad_norm": 0.4010024070739746, "learning_rate": 0.0003002288936892017, "loss": 0.078, "step": 744 }, { "epoch": 1.32, "grad_norm": 0.22475454211235046, "learning_rate": 0.0002997721996701324, "loss": 0.0303, "step": 745 }, { "epoch": 1.32, "grad_norm": 0.31867673993110657, "learning_rate": 0.000299315332634913, "loss": 0.0521, "step": 746 }, { "epoch": 1.32, "grad_norm": 0.2040076106786728, "learning_rate": 0.0002988582941716867, "loss": 0.0225, "step": 747 }, { "epoch": 1.32, "grad_norm": 1.1286782026290894, "learning_rate": 0.00029840108586919246, "loss": 0.0833, "step": 748 }, { "epoch": 1.33, "grad_norm": 0.6526787877082825, "learning_rate": 0.00029794370931675963, "loss": 0.1085, "step": 749 }, { "epoch": 1.33, "grad_norm": 0.3272201418876648, "learning_rate": 0.00029748616610430264, "loss": 0.0213, "step": 750 }, { "epoch": 1.33, "grad_norm": 0.5573351383209229, "learning_rate": 0.0002970284578223149, "loss": 0.0478, "step": 751 }, { "epoch": 1.33, "grad_norm": 0.31984782218933105, "learning_rate": 0.00029657058606186393, "loss": 0.0353, "step": 752 }, { "epoch": 1.33, "grad_norm": 1.712653398513794, "learning_rate": 0.00029611255241458533, "loss": 0.0787, "step": 753 }, { "epoch": 1.33, "grad_norm": 0.5753667950630188, "learning_rate": 0.00029565435847267766, "loss": 0.1024, "step": 754 }, { "epoch": 1.34, "grad_norm": 0.44150909781455994, "learning_rate": 0.00029519600582889657, "loss": 0.0261, "step": 755 }, { "epoch": 1.34, "grad_norm": 0.2625367343425751, "learning_rate": 0.00029473749607654914, "loss": 0.0685, "step": 756 }, { "epoch": 1.34, "grad_norm": 0.48232585191726685, "learning_rate": 0.00029427883080948905, "loss": 0.0299, "step": 757 }, { "epoch": 1.34, "grad_norm": 0.2749970853328705, "learning_rate": 0.00029382001162211026, "loss": 0.022, "step": 758 }, { "epoch": 1.34, "grad_norm": 0.2759183645248413, "learning_rate": 0.00029336104010934186, "loss": 0.0417, "step": 759 }, { "epoch": 1.34, "grad_norm": 0.11506503075361252, "learning_rate": 0.0002929019178666425, "loss": 0.0114, "step": 760 }, { "epoch": 1.35, "grad_norm": 0.10981517285108566, "learning_rate": 0.0002924426464899947, "loss": 0.0132, "step": 761 }, { "epoch": 1.35, "grad_norm": 0.2663099467754364, "learning_rate": 0.0002919832275758994, "loss": 0.0628, "step": 762 }, { "epoch": 1.35, "grad_norm": 0.6475871205329895, "learning_rate": 0.0002915236627213705, "loss": 0.0819, "step": 763 }, { "epoch": 1.35, "grad_norm": 0.3743927478790283, "learning_rate": 0.00029106395352392913, "loss": 0.0526, "step": 764 }, { "epoch": 1.35, "grad_norm": 0.2879592776298523, "learning_rate": 0.0002906041015815983, "loss": 0.0441, "step": 765 }, { "epoch": 1.36, "grad_norm": 0.5118088126182556, "learning_rate": 0.0002901441084928969, "loss": 0.0304, "step": 766 }, { "epoch": 1.36, "grad_norm": 0.8219065070152283, "learning_rate": 0.000289683975856835, "loss": 0.0682, "step": 767 }, { "epoch": 1.36, "grad_norm": 0.45978665351867676, "learning_rate": 0.00028922370527290715, "loss": 0.0385, "step": 768 }, { "epoch": 1.36, "grad_norm": 0.3295181095600128, "learning_rate": 0.000288763298341088, "loss": 0.0219, "step": 769 }, { "epoch": 1.36, "grad_norm": 0.9101231694221497, "learning_rate": 0.00028830275666182564, "loss": 0.1396, "step": 770 }, { "epoch": 1.36, "grad_norm": 0.4265996217727661, "learning_rate": 0.000287842081836037, "loss": 0.0514, "step": 771 }, { "epoch": 1.37, "grad_norm": 0.6179928183555603, "learning_rate": 0.00028738127546510165, "loss": 0.0615, "step": 772 }, { "epoch": 1.37, "grad_norm": 0.39278754591941833, "learning_rate": 0.00028692033915085635, "loss": 0.0422, "step": 773 }, { "epoch": 1.37, "grad_norm": 0.4660128653049469, "learning_rate": 0.00028645927449558986, "loss": 0.1055, "step": 774 }, { "epoch": 1.37, "grad_norm": 0.6555572152137756, "learning_rate": 0.0002859980831020366, "loss": 0.0934, "step": 775 }, { "epoch": 1.37, "grad_norm": 0.4676145613193512, "learning_rate": 0.0002855367665733722, "loss": 0.0624, "step": 776 }, { "epoch": 1.37, "grad_norm": 0.6434176564216614, "learning_rate": 0.0002850753265132066, "loss": 0.0415, "step": 777 }, { "epoch": 1.38, "grad_norm": 0.2195662558078766, "learning_rate": 0.0002846137645255796, "loss": 0.0502, "step": 778 }, { "epoch": 1.38, "grad_norm": 0.33307918906211853, "learning_rate": 0.00028415208221495465, "loss": 0.0692, "step": 779 }, { "epoch": 1.38, "grad_norm": 0.16945867240428925, "learning_rate": 0.0002836902811862136, "loss": 0.0296, "step": 780 }, { "epoch": 1.38, "grad_norm": 0.3181239366531372, "learning_rate": 0.00028322836304465093, "loss": 0.056, "step": 781 }, { "epoch": 1.38, "grad_norm": 0.2984309494495392, "learning_rate": 0.000282766329395968, "loss": 0.0603, "step": 782 }, { "epoch": 1.39, "grad_norm": 0.19178460538387299, "learning_rate": 0.0002823041818462681, "loss": 0.0315, "step": 783 }, { "epoch": 1.39, "grad_norm": 0.3132456839084625, "learning_rate": 0.0002818419220020502, "loss": 0.0421, "step": 784 }, { "epoch": 1.39, "grad_norm": 0.4850837290287018, "learning_rate": 0.00028137955147020355, "loss": 0.0835, "step": 785 }, { "epoch": 1.39, "grad_norm": 0.2325926274061203, "learning_rate": 0.00028091707185800245, "loss": 0.033, "step": 786 }, { "epoch": 1.39, "grad_norm": 0.5203066468238831, "learning_rate": 0.0002804544847731001, "loss": 0.0766, "step": 787 }, { "epoch": 1.39, "grad_norm": 0.2967028021812439, "learning_rate": 0.00027999179182352347, "loss": 0.0287, "step": 788 }, { "epoch": 1.4, "grad_norm": 0.42808797955513, "learning_rate": 0.0002795289946176674, "loss": 0.039, "step": 789 }, { "epoch": 1.4, "grad_norm": 0.36871954798698425, "learning_rate": 0.00027906609476428937, "loss": 0.0388, "step": 790 }, { "epoch": 1.4, "grad_norm": 0.31911173462867737, "learning_rate": 0.0002786030938725034, "loss": 0.0575, "step": 791 }, { "epoch": 1.4, "grad_norm": 0.2864239513874054, "learning_rate": 0.00027813999355177476, "loss": 0.0711, "step": 792 }, { "epoch": 1.4, "grad_norm": 0.11181977391242981, "learning_rate": 0.0002776767954119147, "loss": 0.0126, "step": 793 }, { "epoch": 1.4, "grad_norm": 0.32950931787490845, "learning_rate": 0.0002772135010630741, "loss": 0.025, "step": 794 }, { "epoch": 1.41, "grad_norm": 0.3144387900829315, "learning_rate": 0.0002767501121157386, "loss": 0.0244, "step": 795 }, { "epoch": 1.41, "grad_norm": 0.7986451387405396, "learning_rate": 0.0002762866301807222, "loss": 0.013, "step": 796 }, { "epoch": 1.41, "grad_norm": 0.11357055604457855, "learning_rate": 0.0002758230568691627, "loss": 0.0105, "step": 797 }, { "epoch": 1.41, "grad_norm": 0.8880018591880798, "learning_rate": 0.00027535939379251523, "loss": 0.1036, "step": 798 }, { "epoch": 1.41, "grad_norm": 0.5905174016952515, "learning_rate": 0.000274895642562547, "loss": 0.1287, "step": 799 }, { "epoch": 1.42, "grad_norm": 0.1973883956670761, "learning_rate": 0.0002744318047913318, "loss": 0.0161, "step": 800 }, { "epoch": 1.42, "grad_norm": 0.38918519020080566, "learning_rate": 0.00027396788209124387, "loss": 0.0428, "step": 801 }, { "epoch": 1.42, "grad_norm": 0.2719264328479767, "learning_rate": 0.0002735038760749531, "loss": 0.0496, "step": 802 }, { "epoch": 1.42, "grad_norm": 0.3014307916164398, "learning_rate": 0.0002730397883554189, "loss": 0.0241, "step": 803 }, { "epoch": 1.42, "grad_norm": 0.25686872005462646, "learning_rate": 0.00027257562054588453, "loss": 0.0672, "step": 804 }, { "epoch": 1.42, "grad_norm": 0.30520740151405334, "learning_rate": 0.00027211137425987175, "loss": 0.0376, "step": 805 }, { "epoch": 1.43, "grad_norm": 0.3014354407787323, "learning_rate": 0.00027164705111117516, "loss": 0.0201, "step": 806 }, { "epoch": 1.43, "grad_norm": 0.1687713861465454, "learning_rate": 0.0002711826527138565, "loss": 0.0328, "step": 807 }, { "epoch": 1.43, "grad_norm": 0.31807759404182434, "learning_rate": 0.00027071818068223906, "loss": 0.121, "step": 808 }, { "epoch": 1.43, "grad_norm": 0.6311066150665283, "learning_rate": 0.00027025363663090216, "loss": 0.0745, "step": 809 }, { "epoch": 1.43, "grad_norm": 0.12313732504844666, "learning_rate": 0.0002697890221746754, "loss": 0.0103, "step": 810 }, { "epoch": 1.43, "grad_norm": 0.7472290396690369, "learning_rate": 0.00026932433892863324, "loss": 0.0935, "step": 811 }, { "epoch": 1.44, "grad_norm": 0.3236583173274994, "learning_rate": 0.00026885958850808914, "loss": 0.0592, "step": 812 }, { "epoch": 1.44, "grad_norm": 0.6342505216598511, "learning_rate": 0.00026839477252859007, "loss": 0.0919, "step": 813 }, { "epoch": 1.44, "grad_norm": 0.37357425689697266, "learning_rate": 0.0002679298926059109, "loss": 0.0426, "step": 814 }, { "epoch": 1.44, "grad_norm": 0.6023468375205994, "learning_rate": 0.000267464950356049, "loss": 0.0756, "step": 815 }, { "epoch": 1.44, "grad_norm": 0.35651543736457825, "learning_rate": 0.0002669999473952181, "loss": 0.0323, "step": 816 }, { "epoch": 1.45, "grad_norm": 0.4318545162677765, "learning_rate": 0.00026653488533984307, "loss": 0.1178, "step": 817 }, { "epoch": 1.45, "grad_norm": 0.31720227003097534, "learning_rate": 0.00026606976580655415, "loss": 0.0974, "step": 818 }, { "epoch": 1.45, "grad_norm": 0.5617119669914246, "learning_rate": 0.00026560459041218156, "loss": 0.1098, "step": 819 }, { "epoch": 1.45, "grad_norm": 0.5732232332229614, "learning_rate": 0.00026513936077374954, "loss": 0.0949, "step": 820 }, { "epoch": 1.45, "grad_norm": 0.33141466975212097, "learning_rate": 0.00026467407850847105, "loss": 0.0417, "step": 821 }, { "epoch": 1.45, "grad_norm": 0.41724076867103577, "learning_rate": 0.00026420874523374173, "loss": 0.0466, "step": 822 }, { "epoch": 1.46, "grad_norm": 0.24849863350391388, "learning_rate": 0.0002637433625671347, "loss": 0.0536, "step": 823 }, { "epoch": 1.46, "grad_norm": 0.3150210380554199, "learning_rate": 0.00026327793212639486, "loss": 0.0806, "step": 824 }, { "epoch": 1.46, "grad_norm": 0.2865166664123535, "learning_rate": 0.00026281245552943293, "loss": 0.0533, "step": 825 }, { "epoch": 1.46, "grad_norm": 0.2756612300872803, "learning_rate": 0.00026234693439432043, "loss": 0.0504, "step": 826 }, { "epoch": 1.46, "grad_norm": 0.9047459363937378, "learning_rate": 0.0002618813703392833, "loss": 0.109, "step": 827 }, { "epoch": 1.46, "grad_norm": 0.10576523840427399, "learning_rate": 0.00026141576498269706, "loss": 0.0231, "step": 828 }, { "epoch": 1.47, "grad_norm": 0.4203824996948242, "learning_rate": 0.00026095011994308056, "loss": 0.0727, "step": 829 }, { "epoch": 1.47, "grad_norm": 0.4648183584213257, "learning_rate": 0.0002604844368390905, "loss": 0.1066, "step": 830 }, { "epoch": 1.47, "grad_norm": 0.2482835054397583, "learning_rate": 0.00026001871728951624, "loss": 0.0237, "step": 831 }, { "epoch": 1.47, "grad_norm": 0.4374096393585205, "learning_rate": 0.00025955296291327356, "loss": 0.0934, "step": 832 }, { "epoch": 1.47, "grad_norm": 0.3488870859146118, "learning_rate": 0.00025908717532939946, "loss": 0.0638, "step": 833 }, { "epoch": 1.48, "grad_norm": 0.5614967942237854, "learning_rate": 0.00025862135615704613, "loss": 0.0827, "step": 834 }, { "epoch": 1.48, "grad_norm": 0.30991917848587036, "learning_rate": 0.0002581555070154759, "loss": 0.0438, "step": 835 }, { "epoch": 1.48, "grad_norm": 0.44601985812187195, "learning_rate": 0.00025768962952405503, "loss": 0.0797, "step": 836 }, { "epoch": 1.48, "grad_norm": 0.3628086745738983, "learning_rate": 0.00025722372530224844, "loss": 0.0366, "step": 837 }, { "epoch": 1.48, "grad_norm": 0.2644861936569214, "learning_rate": 0.000256757795969614, "loss": 0.0331, "step": 838 }, { "epoch": 1.48, "grad_norm": 0.4585146903991699, "learning_rate": 0.0002562918431457967, "loss": 0.0635, "step": 839 }, { "epoch": 1.49, "grad_norm": 0.4738370478153229, "learning_rate": 0.0002558258684505233, "loss": 0.0599, "step": 840 }, { "epoch": 1.49, "grad_norm": 0.6536511182785034, "learning_rate": 0.00025535987350359664, "loss": 0.077, "step": 841 }, { "epoch": 1.49, "grad_norm": 0.43449538946151733, "learning_rate": 0.00025489385992489, "loss": 0.0432, "step": 842 }, { "epoch": 1.49, "grad_norm": 0.5419031977653503, "learning_rate": 0.0002544278293343411, "loss": 0.093, "step": 843 }, { "epoch": 1.49, "grad_norm": 0.30555063486099243, "learning_rate": 0.0002539617833519472, "loss": 0.0572, "step": 844 }, { "epoch": 1.49, "grad_norm": 0.18094651401042938, "learning_rate": 0.0002534957235977589, "loss": 0.0353, "step": 845 }, { "epoch": 1.5, "grad_norm": 0.34586626291275024, "learning_rate": 0.00025302965169187467, "loss": 0.0554, "step": 846 }, { "epoch": 1.5, "grad_norm": 0.38212889432907104, "learning_rate": 0.00025256356925443507, "loss": 0.0624, "step": 847 }, { "epoch": 1.5, "grad_norm": 0.5566253066062927, "learning_rate": 0.00025209747790561754, "loss": 0.0603, "step": 848 }, { "epoch": 1.5, "grad_norm": 0.2991026043891907, "learning_rate": 0.0002516313792656304, "loss": 0.0374, "step": 849 }, { "epoch": 1.5, "grad_norm": 0.6026126146316528, "learning_rate": 0.0002511652749547072, "loss": 0.1283, "step": 850 }, { "epoch": 1.51, "grad_norm": 0.28952938318252563, "learning_rate": 0.0002506991665931013, "loss": 0.0708, "step": 851 }, { "epoch": 1.51, "grad_norm": 0.3491526246070862, "learning_rate": 0.00025023305580108027, "loss": 0.0536, "step": 852 }, { "epoch": 1.51, "eval_loss": 0.08201431483030319, "eval_runtime": 14.6754, "eval_samples_per_second": 32.503, "eval_steps_per_second": 8.177, "step": 852 }, { "epoch": 1.51, "grad_norm": 0.7052826881408691, "learning_rate": 0.00024976694419891974, "loss": 0.059, "step": 853 }, { "epoch": 1.51, "grad_norm": 0.43753165006637573, "learning_rate": 0.0002493008334068987, "loss": 0.0751, "step": 854 }, { "epoch": 1.51, "grad_norm": 0.1675650030374527, "learning_rate": 0.00024883472504529287, "loss": 0.0224, "step": 855 }, { "epoch": 1.51, "grad_norm": 0.3790821433067322, "learning_rate": 0.00024836862073436967, "loss": 0.0707, "step": 856 }, { "epoch": 1.52, "grad_norm": 0.3162197470664978, "learning_rate": 0.0002479025220943825, "loss": 0.0375, "step": 857 }, { "epoch": 1.52, "grad_norm": 0.5708606243133545, "learning_rate": 0.00024743643074556494, "loss": 0.0632, "step": 858 }, { "epoch": 1.52, "grad_norm": 0.24075205624103546, "learning_rate": 0.00024697034830812535, "loss": 0.0452, "step": 859 }, { "epoch": 1.52, "grad_norm": 0.371979683637619, "learning_rate": 0.00024650427640224114, "loss": 0.0676, "step": 860 }, { "epoch": 1.52, "grad_norm": 0.2892770767211914, "learning_rate": 0.00024603821664805276, "loss": 0.0592, "step": 861 }, { "epoch": 1.52, "grad_norm": 0.14655162394046783, "learning_rate": 0.00024557217066565896, "loss": 0.0161, "step": 862 }, { "epoch": 1.53, "grad_norm": 0.22819265723228455, "learning_rate": 0.0002451061400751101, "loss": 0.0418, "step": 863 }, { "epoch": 1.53, "grad_norm": 0.4637010097503662, "learning_rate": 0.0002446401264964034, "loss": 0.0555, "step": 864 }, { "epoch": 1.53, "grad_norm": 0.30727583169937134, "learning_rate": 0.00024417413154947677, "loss": 0.0258, "step": 865 }, { "epoch": 1.53, "grad_norm": 0.26563793420791626, "learning_rate": 0.00024370815685420338, "loss": 0.0528, "step": 866 }, { "epoch": 1.53, "grad_norm": 0.18691249191761017, "learning_rate": 0.00024324220403038613, "loss": 0.0432, "step": 867 }, { "epoch": 1.54, "grad_norm": 0.3845004439353943, "learning_rate": 0.00024277627469775163, "loss": 0.08, "step": 868 }, { "epoch": 1.54, "grad_norm": 0.36308273673057556, "learning_rate": 0.00024231037047594495, "loss": 0.0569, "step": 869 }, { "epoch": 1.54, "grad_norm": 0.43758854269981384, "learning_rate": 0.00024184449298452414, "loss": 0.0451, "step": 870 }, { "epoch": 1.54, "grad_norm": 0.30163124203681946, "learning_rate": 0.00024137864384295388, "loss": 0.0219, "step": 871 }, { "epoch": 1.54, "grad_norm": 0.20218737423419952, "learning_rate": 0.00024091282467060055, "loss": 0.0277, "step": 872 }, { "epoch": 1.54, "grad_norm": 0.2397354692220688, "learning_rate": 0.00024044703708672648, "loss": 0.0307, "step": 873 }, { "epoch": 1.55, "grad_norm": 0.31938987970352173, "learning_rate": 0.00023998128271048374, "loss": 0.0453, "step": 874 }, { "epoch": 1.55, "grad_norm": 0.5195713043212891, "learning_rate": 0.00023951556316090952, "loss": 0.0852, "step": 875 }, { "epoch": 1.55, "grad_norm": 0.5428398251533508, "learning_rate": 0.00023904988005691953, "loss": 0.0941, "step": 876 }, { "epoch": 1.55, "grad_norm": 0.9265273213386536, "learning_rate": 0.00023858423501730295, "loss": 0.0489, "step": 877 }, { "epoch": 1.55, "grad_norm": 0.5896188020706177, "learning_rate": 0.00023811862966071674, "loss": 0.0601, "step": 878 }, { "epoch": 1.56, "grad_norm": 0.6030554175376892, "learning_rate": 0.0002376530656056796, "loss": 0.0958, "step": 879 }, { "epoch": 1.56, "grad_norm": 0.3220241665840149, "learning_rate": 0.00023718754447056708, "loss": 0.0487, "step": 880 }, { "epoch": 1.56, "grad_norm": 0.49902886152267456, "learning_rate": 0.00023672206787360523, "loss": 0.0457, "step": 881 }, { "epoch": 1.56, "grad_norm": 0.3744886517524719, "learning_rate": 0.00023625663743286534, "loss": 0.0771, "step": 882 }, { "epoch": 1.56, "grad_norm": 0.39943140745162964, "learning_rate": 0.0002357912547662584, "loss": 0.0389, "step": 883 }, { "epoch": 1.56, "grad_norm": 0.34382057189941406, "learning_rate": 0.00023532592149152898, "loss": 0.0405, "step": 884 }, { "epoch": 1.57, "grad_norm": 0.22058314085006714, "learning_rate": 0.00023486063922625042, "loss": 0.032, "step": 885 }, { "epoch": 1.57, "grad_norm": 0.32711130380630493, "learning_rate": 0.00023439540958781848, "loss": 0.0589, "step": 886 }, { "epoch": 1.57, "grad_norm": 0.42972657084465027, "learning_rate": 0.0002339302341934459, "loss": 0.0601, "step": 887 }, { "epoch": 1.57, "grad_norm": 0.31368395686149597, "learning_rate": 0.00023346511466015708, "loss": 0.0345, "step": 888 }, { "epoch": 1.57, "grad_norm": 0.28611400723457336, "learning_rate": 0.00023300005260478194, "loss": 0.0432, "step": 889 }, { "epoch": 1.57, "grad_norm": 0.5313751101493835, "learning_rate": 0.00023253504964395097, "loss": 0.0427, "step": 890 }, { "epoch": 1.58, "grad_norm": 0.2192300707101822, "learning_rate": 0.00023207010739408908, "loss": 0.0392, "step": 891 }, { "epoch": 1.58, "grad_norm": 0.7893845438957214, "learning_rate": 0.00023160522747141, "loss": 0.1338, "step": 892 }, { "epoch": 1.58, "grad_norm": 0.5475191473960876, "learning_rate": 0.00023114041149191098, "loss": 0.1458, "step": 893 }, { "epoch": 1.58, "grad_norm": 0.4575919806957245, "learning_rate": 0.00023067566107136685, "loss": 0.0593, "step": 894 }, { "epoch": 1.58, "grad_norm": 0.47522222995758057, "learning_rate": 0.00023021097782532457, "loss": 0.0744, "step": 895 }, { "epoch": 1.59, "grad_norm": 0.3471393883228302, "learning_rate": 0.0002297463633690979, "loss": 0.0795, "step": 896 }, { "epoch": 1.59, "grad_norm": 0.6821273565292358, "learning_rate": 0.00022928181931776098, "loss": 0.0692, "step": 897 }, { "epoch": 1.59, "grad_norm": 0.4375980496406555, "learning_rate": 0.00022881734728614347, "loss": 0.0704, "step": 898 }, { "epoch": 1.59, "grad_norm": 0.596495509147644, "learning_rate": 0.0002283529488888249, "loss": 0.0744, "step": 899 }, { "epoch": 1.59, "grad_norm": 0.36588358879089355, "learning_rate": 0.00022788862574012824, "loss": 0.0576, "step": 900 }, { "epoch": 1.59, "grad_norm": 0.49629420042037964, "learning_rate": 0.0002274243794541155, "loss": 0.0847, "step": 901 }, { "epoch": 1.6, "grad_norm": 0.8968174457550049, "learning_rate": 0.0002269602116445811, "loss": 0.0521, "step": 902 }, { "epoch": 1.6, "grad_norm": 0.260775089263916, "learning_rate": 0.00022649612392504687, "loss": 0.0345, "step": 903 }, { "epoch": 1.6, "grad_norm": 0.33473265171051025, "learning_rate": 0.00022603211790875622, "loss": 0.0483, "step": 904 }, { "epoch": 1.6, "grad_norm": 0.451668918132782, "learning_rate": 0.0002255681952086683, "loss": 0.0862, "step": 905 }, { "epoch": 1.6, "grad_norm": 0.6166467070579529, "learning_rate": 0.00022510435743745304, "loss": 0.1038, "step": 906 }, { "epoch": 1.6, "grad_norm": 0.361858069896698, "learning_rate": 0.0002246406062074848, "loss": 0.073, "step": 907 }, { "epoch": 1.61, "grad_norm": 0.42583227157592773, "learning_rate": 0.00022417694313083735, "loss": 0.0923, "step": 908 }, { "epoch": 1.61, "grad_norm": 0.2566489279270172, "learning_rate": 0.00022371336981927788, "loss": 0.0358, "step": 909 }, { "epoch": 1.61, "grad_norm": 0.3075582683086395, "learning_rate": 0.0002232498878842615, "loss": 0.0912, "step": 910 }, { "epoch": 1.61, "grad_norm": 0.1581839770078659, "learning_rate": 0.00022278649893692584, "loss": 0.0309, "step": 911 }, { "epoch": 1.61, "grad_norm": 0.37088707089424133, "learning_rate": 0.00022232320458808532, "loss": 0.074, "step": 912 }, { "epoch": 1.62, "grad_norm": 0.40888333320617676, "learning_rate": 0.00022186000644822522, "loss": 0.0618, "step": 913 }, { "epoch": 1.62, "grad_norm": 0.2161816507577896, "learning_rate": 0.00022139690612749672, "loss": 0.0355, "step": 914 }, { "epoch": 1.62, "grad_norm": 0.3577941954135895, "learning_rate": 0.00022093390523571067, "loss": 0.0499, "step": 915 }, { "epoch": 1.62, "grad_norm": 0.3455984592437744, "learning_rate": 0.0002204710053823326, "loss": 0.0712, "step": 916 }, { "epoch": 1.62, "grad_norm": 0.2223758101463318, "learning_rate": 0.0002200082081764766, "loss": 0.0362, "step": 917 }, { "epoch": 1.62, "grad_norm": 0.3027271032333374, "learning_rate": 0.00021954551522689993, "loss": 0.0579, "step": 918 }, { "epoch": 1.63, "grad_norm": 0.12046008557081223, "learning_rate": 0.00021908292814199764, "loss": 0.0227, "step": 919 }, { "epoch": 1.63, "grad_norm": 0.3685935437679291, "learning_rate": 0.00021862044852979652, "loss": 0.0797, "step": 920 }, { "epoch": 1.63, "grad_norm": 0.26438644528388977, "learning_rate": 0.00021815807799794982, "loss": 0.0567, "step": 921 }, { "epoch": 1.63, "grad_norm": 0.44811537861824036, "learning_rate": 0.00021769581815373192, "loss": 0.0623, "step": 922 }, { "epoch": 1.63, "grad_norm": 0.36297371983528137, "learning_rate": 0.000217233670604032, "loss": 0.0512, "step": 923 }, { "epoch": 1.63, "grad_norm": 0.30395954847335815, "learning_rate": 0.00021677163695534913, "loss": 0.0423, "step": 924 }, { "epoch": 1.64, "grad_norm": 0.26092368364334106, "learning_rate": 0.00021630971881378644, "loss": 0.0463, "step": 925 }, { "epoch": 1.64, "grad_norm": 0.639478325843811, "learning_rate": 0.0002158479177850453, "loss": 0.0564, "step": 926 }, { "epoch": 1.64, "grad_norm": 0.25447505712509155, "learning_rate": 0.00021538623547442045, "loss": 0.0317, "step": 927 }, { "epoch": 1.64, "grad_norm": 0.24460607767105103, "learning_rate": 0.00021492467348679345, "loss": 0.0375, "step": 928 }, { "epoch": 1.64, "grad_norm": 0.43972596526145935, "learning_rate": 0.00021446323342662785, "loss": 0.0587, "step": 929 }, { "epoch": 1.65, "grad_norm": 0.08744898438453674, "learning_rate": 0.00021400191689796338, "loss": 0.0074, "step": 930 }, { "epoch": 1.65, "grad_norm": 0.29619458317756653, "learning_rate": 0.00021354072550441018, "loss": 0.0422, "step": 931 }, { "epoch": 1.65, "grad_norm": 0.3273064196109772, "learning_rate": 0.00021307966084914372, "loss": 0.0362, "step": 932 }, { "epoch": 1.65, "grad_norm": 0.2981872260570526, "learning_rate": 0.00021261872453489842, "loss": 0.0246, "step": 933 }, { "epoch": 1.65, "grad_norm": 0.7154226899147034, "learning_rate": 0.00021215791816396303, "loss": 0.0856, "step": 934 }, { "epoch": 1.65, "grad_norm": 0.4782339930534363, "learning_rate": 0.00021169724333817443, "loss": 0.0482, "step": 935 }, { "epoch": 1.66, "grad_norm": 0.5048168897628784, "learning_rate": 0.00021123670165891208, "loss": 0.0405, "step": 936 }, { "epoch": 1.66, "grad_norm": 0.22893092036247253, "learning_rate": 0.0002107762947270928, "loss": 0.0181, "step": 937 }, { "epoch": 1.66, "grad_norm": 0.6863519549369812, "learning_rate": 0.00021031602414316506, "loss": 0.0643, "step": 938 }, { "epoch": 1.66, "grad_norm": 0.5500178337097168, "learning_rate": 0.0002098558915071031, "loss": 0.0599, "step": 939 }, { "epoch": 1.66, "grad_norm": 0.7170897126197815, "learning_rate": 0.0002093958984184018, "loss": 0.1167, "step": 940 }, { "epoch": 1.66, "grad_norm": 0.48540323972702026, "learning_rate": 0.00020893604647607088, "loss": 0.0443, "step": 941 }, { "epoch": 1.67, "grad_norm": 0.09318219870328903, "learning_rate": 0.0002084763372786295, "loss": 0.0092, "step": 942 }, { "epoch": 1.67, "grad_norm": 0.7748222351074219, "learning_rate": 0.00020801677242410067, "loss": 0.0912, "step": 943 }, { "epoch": 1.67, "grad_norm": 0.2857852280139923, "learning_rate": 0.00020755735351000537, "loss": 0.0313, "step": 944 }, { "epoch": 1.67, "grad_norm": 0.483059287071228, "learning_rate": 0.00020709808213335758, "loss": 0.0768, "step": 945 }, { "epoch": 1.67, "grad_norm": 0.2753046751022339, "learning_rate": 0.0002066389598906582, "loss": 0.0339, "step": 946 }, { "epoch": 1.68, "grad_norm": 0.29203662276268005, "learning_rate": 0.00020617998837788975, "loss": 0.0185, "step": 947 }, { "epoch": 1.68, "grad_norm": 0.3008381426334381, "learning_rate": 0.00020572116919051098, "loss": 0.0574, "step": 948 }, { "epoch": 1.68, "grad_norm": 0.6928682923316956, "learning_rate": 0.0002052625039234509, "loss": 0.051, "step": 949 }, { "epoch": 1.68, "grad_norm": 0.27521032094955444, "learning_rate": 0.00020480399417110352, "loss": 0.05, "step": 950 }, { "epoch": 1.68, "grad_norm": 0.1467350274324417, "learning_rate": 0.00020434564152732238, "loss": 0.0254, "step": 951 }, { "epoch": 1.68, "grad_norm": 0.2940123379230499, "learning_rate": 0.00020388744758541462, "loss": 0.0521, "step": 952 }, { "epoch": 1.69, "grad_norm": 0.2931191325187683, "learning_rate": 0.00020342941393813613, "loss": 0.0394, "step": 953 }, { "epoch": 1.69, "grad_norm": 0.2518831491470337, "learning_rate": 0.00020297154217768513, "loss": 0.041, "step": 954 }, { "epoch": 1.69, "grad_norm": 0.48200809955596924, "learning_rate": 0.00020251383389569743, "loss": 0.0693, "step": 955 }, { "epoch": 1.69, "grad_norm": 0.4188168942928314, "learning_rate": 0.0002020562906832404, "loss": 0.0594, "step": 956 }, { "epoch": 1.69, "grad_norm": 0.4201320707798004, "learning_rate": 0.00020159891413080755, "loss": 0.0737, "step": 957 }, { "epoch": 1.69, "grad_norm": 0.4236721396446228, "learning_rate": 0.00020114170582831342, "loss": 0.0443, "step": 958 }, { "epoch": 1.7, "grad_norm": 0.39196375012397766, "learning_rate": 0.00020068466736508704, "loss": 0.0728, "step": 959 }, { "epoch": 1.7, "grad_norm": 0.5320992469787598, "learning_rate": 0.00020022780032986765, "loss": 0.0416, "step": 960 }, { "epoch": 1.7, "grad_norm": 0.12672173976898193, "learning_rate": 0.00019977110631079836, "loss": 0.0191, "step": 961 }, { "epoch": 1.7, "grad_norm": 0.6431661248207092, "learning_rate": 0.0001993145868954207, "loss": 0.038, "step": 962 }, { "epoch": 1.7, "grad_norm": 0.29868853092193604, "learning_rate": 0.00019885824367066955, "loss": 0.0245, "step": 963 }, { "epoch": 1.71, "grad_norm": 0.5283737778663635, "learning_rate": 0.0001984020782228671, "loss": 0.0722, "step": 964 }, { "epoch": 1.71, "grad_norm": 0.5567461252212524, "learning_rate": 0.00019794609213771755, "loss": 0.1026, "step": 965 }, { "epoch": 1.71, "grad_norm": 0.9617827534675598, "learning_rate": 0.00019749028700030181, "loss": 0.078, "step": 966 }, { "epoch": 1.71, "grad_norm": 0.5466052889823914, "learning_rate": 0.0001970346643950715, "loss": 0.075, "step": 967 }, { "epoch": 1.71, "grad_norm": 0.18108782172203064, "learning_rate": 0.00019657922590584392, "loss": 0.024, "step": 968 }, { "epoch": 1.71, "grad_norm": 0.4150354862213135, "learning_rate": 0.00019612397311579647, "loss": 0.0656, "step": 969 }, { "epoch": 1.72, "grad_norm": 0.21237897872924805, "learning_rate": 0.0001956689076074607, "loss": 0.0378, "step": 970 }, { "epoch": 1.72, "grad_norm": 0.21254923939704895, "learning_rate": 0.0001952140309627174, "loss": 0.0173, "step": 971 }, { "epoch": 1.72, "grad_norm": 0.2641647756099701, "learning_rate": 0.0001947593447627907, "loss": 0.049, "step": 972 }, { "epoch": 1.72, "grad_norm": 0.3682314455509186, "learning_rate": 0.00019430485058824276, "loss": 0.0485, "step": 973 }, { "epoch": 1.72, "grad_norm": 0.2566399574279785, "learning_rate": 0.00019385055001896835, "loss": 0.0388, "step": 974 }, { "epoch": 1.72, "grad_norm": 0.20328454673290253, "learning_rate": 0.000193396444634189, "loss": 0.0259, "step": 975 }, { "epoch": 1.73, "grad_norm": 0.5327407717704773, "learning_rate": 0.00019294253601244792, "loss": 0.085, "step": 976 }, { "epoch": 1.73, "grad_norm": 0.6960484385490417, "learning_rate": 0.00019248882573160437, "loss": 0.1077, "step": 977 }, { "epoch": 1.73, "grad_norm": 0.5338547229766846, "learning_rate": 0.00019203531536882785, "loss": 0.0421, "step": 978 }, { "epoch": 1.73, "grad_norm": 0.1970924586057663, "learning_rate": 0.00019158200650059337, "loss": 0.024, "step": 979 }, { "epoch": 1.73, "grad_norm": 0.4665428698062897, "learning_rate": 0.0001911289007026751, "loss": 0.0549, "step": 980 }, { "epoch": 1.74, "grad_norm": 0.4011171758174896, "learning_rate": 0.00019067599955014156, "loss": 0.0482, "step": 981 }, { "epoch": 1.74, "grad_norm": 0.25179675221443176, "learning_rate": 0.00019022330461734982, "loss": 0.0327, "step": 982 }, { "epoch": 1.74, "grad_norm": 0.2733090817928314, "learning_rate": 0.00018977081747794, "loss": 0.0271, "step": 983 }, { "epoch": 1.74, "grad_norm": 0.5745018124580383, "learning_rate": 0.00018931853970483012, "loss": 0.0344, "step": 984 }, { "epoch": 1.74, "grad_norm": 0.12801390886306763, "learning_rate": 0.00018886647287021007, "loss": 0.0144, "step": 985 }, { "epoch": 1.74, "grad_norm": 0.1290263682603836, "learning_rate": 0.00018841461854553681, "loss": 0.0132, "step": 986 }, { "epoch": 1.75, "grad_norm": 0.863158643245697, "learning_rate": 0.00018796297830152853, "loss": 0.1274, "step": 987 }, { "epoch": 1.75, "grad_norm": 0.3602030277252197, "learning_rate": 0.00018751155370815895, "loss": 0.0549, "step": 988 }, { "epoch": 1.75, "grad_norm": 0.379295289516449, "learning_rate": 0.00018706034633465257, "loss": 0.0266, "step": 989 }, { "epoch": 1.75, "grad_norm": 0.43907806277275085, "learning_rate": 0.00018660935774947858, "loss": 0.0499, "step": 990 }, { "epoch": 1.75, "grad_norm": 0.822163462638855, "learning_rate": 0.00018615858952034548, "loss": 0.1464, "step": 991 }, { "epoch": 1.75, "grad_norm": 0.3563006520271301, "learning_rate": 0.00018570804321419614, "loss": 0.0499, "step": 992 }, { "epoch": 1.76, "grad_norm": 0.082757368683815, "learning_rate": 0.00018525772039720167, "loss": 0.0088, "step": 993 }, { "epoch": 1.76, "grad_norm": 0.349202424287796, "learning_rate": 0.00018480762263475638, "loss": 0.0325, "step": 994 }, { "epoch": 1.76, "eval_loss": 0.0834248885512352, "eval_runtime": 14.6855, "eval_samples_per_second": 32.481, "eval_steps_per_second": 8.171, "step": 994 }, { "epoch": 1.76, "grad_norm": 0.2513701319694519, "learning_rate": 0.0001843577514914725, "loss": 0.0186, "step": 995 }, { "epoch": 1.76, "grad_norm": 0.4057576656341553, "learning_rate": 0.00018390810853117408, "loss": 0.0348, "step": 996 }, { "epoch": 1.76, "grad_norm": 0.46003130078315735, "learning_rate": 0.0001834586953168923, "loss": 0.0689, "step": 997 }, { "epoch": 1.77, "grad_norm": 0.41909146308898926, "learning_rate": 0.00018300951341085946, "loss": 0.0298, "step": 998 }, { "epoch": 1.77, "grad_norm": 0.9829010367393494, "learning_rate": 0.00018256056437450399, "loss": 0.2026, "step": 999 }, { "epoch": 1.77, "grad_norm": 0.31356698274612427, "learning_rate": 0.00018211184976844487, "loss": 0.0263, "step": 1000 }, { "epoch": 1.77, "grad_norm": 0.4269973337650299, "learning_rate": 0.00018166337115248585, "loss": 0.1063, "step": 1001 }, { "epoch": 1.77, "grad_norm": 0.3558803200721741, "learning_rate": 0.00018121513008561064, "loss": 0.0389, "step": 1002 }, { "epoch": 1.77, "grad_norm": 0.5086562633514404, "learning_rate": 0.0001807671281259771, "loss": 0.0593, "step": 1003 }, { "epoch": 1.78, "grad_norm": 0.1954115778207779, "learning_rate": 0.00018031936683091186, "loss": 0.0327, "step": 1004 }, { "epoch": 1.78, "grad_norm": 0.1789095103740692, "learning_rate": 0.0001798718477569051, "loss": 0.024, "step": 1005 }, { "epoch": 1.78, "grad_norm": 0.38798651099205017, "learning_rate": 0.0001794245724596048, "loss": 0.0791, "step": 1006 }, { "epoch": 1.78, "grad_norm": 0.34710198640823364, "learning_rate": 0.00017897754249381165, "loss": 0.0571, "step": 1007 }, { "epoch": 1.78, "grad_norm": 0.2781204283237457, "learning_rate": 0.00017853075941347363, "loss": 0.0521, "step": 1008 }, { "epoch": 1.79, "grad_norm": 0.2825307548046112, "learning_rate": 0.00017808422477168023, "loss": 0.0474, "step": 1009 }, { "epoch": 1.79, "grad_norm": 0.23594889044761658, "learning_rate": 0.0001776379401206577, "loss": 0.0295, "step": 1010 }, { "epoch": 1.79, "grad_norm": 0.37222880125045776, "learning_rate": 0.00017719190701176286, "loss": 0.0411, "step": 1011 }, { "epoch": 1.79, "grad_norm": 0.25766775012016296, "learning_rate": 0.00017674612699547846, "loss": 0.0419, "step": 1012 }, { "epoch": 1.79, "grad_norm": 0.27667155861854553, "learning_rate": 0.00017630060162140737, "loss": 0.0325, "step": 1013 }, { "epoch": 1.79, "grad_norm": 0.49651435017585754, "learning_rate": 0.00017585533243826712, "loss": 0.0435, "step": 1014 }, { "epoch": 1.8, "grad_norm": 0.7008858323097229, "learning_rate": 0.00017541032099388499, "loss": 0.1405, "step": 1015 }, { "epoch": 1.8, "grad_norm": 0.17448720335960388, "learning_rate": 0.0001749655688351921, "loss": 0.0269, "step": 1016 }, { "epoch": 1.8, "grad_norm": 0.2893378734588623, "learning_rate": 0.0001745210775082182, "loss": 0.0417, "step": 1017 }, { "epoch": 1.8, "grad_norm": 0.18504270911216736, "learning_rate": 0.0001740768485580866, "loss": 0.0302, "step": 1018 }, { "epoch": 1.8, "grad_norm": 0.2060771882534027, "learning_rate": 0.00017363288352900818, "loss": 0.047, "step": 1019 }, { "epoch": 1.8, "grad_norm": 0.8185610771179199, "learning_rate": 0.00017318918396427675, "loss": 0.1398, "step": 1020 }, { "epoch": 1.81, "grad_norm": 0.46132713556289673, "learning_rate": 0.00017274575140626317, "loss": 0.0776, "step": 1021 }, { "epoch": 1.81, "grad_norm": 0.16016420722007751, "learning_rate": 0.0001723025873964101, "loss": 0.0161, "step": 1022 }, { "epoch": 1.81, "grad_norm": 0.6459915041923523, "learning_rate": 0.00017185969347522674, "loss": 0.0711, "step": 1023 }, { "epoch": 1.81, "grad_norm": 0.40434324741363525, "learning_rate": 0.0001714170711822834, "loss": 0.0571, "step": 1024 }, { "epoch": 1.81, "grad_norm": 0.5824777483940125, "learning_rate": 0.00017097472205620607, "loss": 0.1141, "step": 1025 }, { "epoch": 1.82, "grad_norm": 0.4143454134464264, "learning_rate": 0.00017053264763467152, "loss": 0.0558, "step": 1026 }, { "epoch": 1.82, "grad_norm": 0.36720553040504456, "learning_rate": 0.00017009084945440113, "loss": 0.0376, "step": 1027 }, { "epoch": 1.82, "grad_norm": 0.27180641889572144, "learning_rate": 0.00016964932905115632, "loss": 0.054, "step": 1028 }, { "epoch": 1.82, "grad_norm": 0.43961653113365173, "learning_rate": 0.0001692080879597329, "loss": 0.0773, "step": 1029 }, { "epoch": 1.82, "grad_norm": 0.2728005647659302, "learning_rate": 0.00016876712771395552, "loss": 0.0142, "step": 1030 }, { "epoch": 1.82, "grad_norm": 0.5099291205406189, "learning_rate": 0.0001683264498466729, "loss": 0.0404, "step": 1031 }, { "epoch": 1.83, "grad_norm": 0.3162379562854767, "learning_rate": 0.00016788605588975193, "loss": 0.0332, "step": 1032 }, { "epoch": 1.83, "grad_norm": 0.4152194857597351, "learning_rate": 0.0001674459473740726, "loss": 0.0352, "step": 1033 }, { "epoch": 1.83, "grad_norm": 0.3174980878829956, "learning_rate": 0.00016700612582952278, "loss": 0.0777, "step": 1034 }, { "epoch": 1.83, "grad_norm": 0.6996863484382629, "learning_rate": 0.0001665665927849926, "loss": 0.1145, "step": 1035 }, { "epoch": 1.83, "grad_norm": 0.2766638398170471, "learning_rate": 0.0001661273497683697, "loss": 0.0179, "step": 1036 }, { "epoch": 1.83, "grad_norm": 0.45079368352890015, "learning_rate": 0.00016568839830653287, "loss": 0.1081, "step": 1037 }, { "epoch": 1.84, "grad_norm": 0.44944706559181213, "learning_rate": 0.0001652497399253481, "loss": 0.0964, "step": 1038 }, { "epoch": 1.84, "grad_norm": 0.5892651081085205, "learning_rate": 0.00016481137614966223, "loss": 0.1138, "step": 1039 }, { "epoch": 1.84, "grad_norm": 0.29900479316711426, "learning_rate": 0.00016437330850329793, "loss": 0.0429, "step": 1040 }, { "epoch": 1.84, "grad_norm": 0.3094378411769867, "learning_rate": 0.00016393553850904878, "loss": 0.0577, "step": 1041 }, { "epoch": 1.84, "grad_norm": 0.23039738833904266, "learning_rate": 0.00016349806768867345, "loss": 0.026, "step": 1042 }, { "epoch": 1.85, "grad_norm": 0.3328697979450226, "learning_rate": 0.00016306089756289063, "loss": 0.0542, "step": 1043 }, { "epoch": 1.85, "grad_norm": 0.3017619252204895, "learning_rate": 0.0001626240296513739, "loss": 0.0363, "step": 1044 }, { "epoch": 1.85, "grad_norm": 0.15930373966693878, "learning_rate": 0.0001621874654727461, "loss": 0.02, "step": 1045 }, { "epoch": 1.85, "grad_norm": 0.40952980518341064, "learning_rate": 0.00016175120654457432, "loss": 0.0523, "step": 1046 }, { "epoch": 1.85, "grad_norm": 0.6540464162826538, "learning_rate": 0.00016131525438336475, "loss": 0.0744, "step": 1047 }, { "epoch": 1.85, "grad_norm": 0.3518769443035126, "learning_rate": 0.00016087961050455685, "loss": 0.05, "step": 1048 }, { "epoch": 1.86, "grad_norm": 0.4166756570339203, "learning_rate": 0.0001604442764225188, "loss": 0.0681, "step": 1049 }, { "epoch": 1.86, "grad_norm": 0.39616283774375916, "learning_rate": 0.00016000925365054154, "loss": 0.0416, "step": 1050 }, { "epoch": 1.86, "grad_norm": 0.20040427148342133, "learning_rate": 0.00015957454370083398, "loss": 0.0284, "step": 1051 }, { "epoch": 1.86, "grad_norm": 0.7230433821678162, "learning_rate": 0.00015914014808451784, "loss": 0.1035, "step": 1052 }, { "epoch": 1.86, "grad_norm": 0.1968737691640854, "learning_rate": 0.00015870606831162182, "loss": 0.0281, "step": 1053 }, { "epoch": 1.86, "grad_norm": 0.2677242159843445, "learning_rate": 0.0001582723058910769, "loss": 0.0566, "step": 1054 }, { "epoch": 1.87, "grad_norm": 0.12256369739770889, "learning_rate": 0.00015783886233071076, "loss": 0.0192, "step": 1055 }, { "epoch": 1.87, "grad_norm": 0.311192125082016, "learning_rate": 0.00015740573913724276, "loss": 0.035, "step": 1056 }, { "epoch": 1.87, "grad_norm": 0.36169809103012085, "learning_rate": 0.00015697293781627878, "loss": 0.0755, "step": 1057 }, { "epoch": 1.87, "grad_norm": 0.8104953765869141, "learning_rate": 0.00015654045987230532, "loss": 0.0418, "step": 1058 }, { "epoch": 1.87, "grad_norm": 0.35273879766464233, "learning_rate": 0.00015610830680868533, "loss": 0.0266, "step": 1059 }, { "epoch": 1.88, "grad_norm": 0.18313364684581757, "learning_rate": 0.00015567648012765212, "loss": 0.0538, "step": 1060 }, { "epoch": 1.88, "grad_norm": 0.40563294291496277, "learning_rate": 0.0001552449813303044, "loss": 0.046, "step": 1061 }, { "epoch": 1.88, "grad_norm": 0.44426023960113525, "learning_rate": 0.00015481381191660143, "loss": 0.0938, "step": 1062 }, { "epoch": 1.88, "grad_norm": 0.4189196228981018, "learning_rate": 0.00015438297338535702, "loss": 0.0344, "step": 1063 }, { "epoch": 1.88, "grad_norm": 0.6641749143600464, "learning_rate": 0.0001539524672342351, "loss": 0.0729, "step": 1064 }, { "epoch": 1.88, "grad_norm": 0.2397107034921646, "learning_rate": 0.00015352229495974422, "loss": 0.0493, "step": 1065 }, { "epoch": 1.89, "grad_norm": 0.17326873540878296, "learning_rate": 0.00015309245805723205, "loss": 0.0131, "step": 1066 }, { "epoch": 1.89, "grad_norm": 0.69275963306427, "learning_rate": 0.00015266295802088064, "loss": 0.1512, "step": 1067 }, { "epoch": 1.89, "grad_norm": 0.3260841369628906, "learning_rate": 0.00015223379634370115, "loss": 0.0602, "step": 1068 }, { "epoch": 1.89, "grad_norm": 0.45368266105651855, "learning_rate": 0.00015180497451752826, "loss": 0.0593, "step": 1069 }, { "epoch": 1.89, "grad_norm": 0.5664640069007874, "learning_rate": 0.0001513764940330155, "loss": 0.0651, "step": 1070 }, { "epoch": 1.89, "grad_norm": 0.21212846040725708, "learning_rate": 0.00015094835637962975, "loss": 0.0232, "step": 1071 }, { "epoch": 1.9, "grad_norm": 0.364945650100708, "learning_rate": 0.0001505205630456461, "loss": 0.0436, "step": 1072 }, { "epoch": 1.9, "grad_norm": 0.48835766315460205, "learning_rate": 0.00015009311551814297, "loss": 0.0885, "step": 1073 }, { "epoch": 1.9, "grad_norm": 0.22198058664798737, "learning_rate": 0.00014966601528299637, "loss": 0.026, "step": 1074 }, { "epoch": 1.9, "grad_norm": 0.2598209083080292, "learning_rate": 0.00014923926382487534, "loss": 0.0306, "step": 1075 }, { "epoch": 1.9, "grad_norm": 0.22863651812076569, "learning_rate": 0.0001488128626272363, "loss": 0.0476, "step": 1076 }, { "epoch": 1.91, "grad_norm": 0.4222748875617981, "learning_rate": 0.00014838681317231822, "loss": 0.0837, "step": 1077 }, { "epoch": 1.91, "grad_norm": 0.5555634498596191, "learning_rate": 0.00014796111694113752, "loss": 0.0747, "step": 1078 }, { "epoch": 1.91, "grad_norm": 0.28704702854156494, "learning_rate": 0.0001475357754134824, "loss": 0.0388, "step": 1079 }, { "epoch": 1.91, "grad_norm": 0.3526531457901001, "learning_rate": 0.00014711079006790828, "loss": 0.0396, "step": 1080 }, { "epoch": 1.91, "grad_norm": 0.41639888286590576, "learning_rate": 0.0001466861623817325, "loss": 0.0954, "step": 1081 }, { "epoch": 1.91, "grad_norm": 0.288824200630188, "learning_rate": 0.0001462618938310288, "loss": 0.0355, "step": 1082 }, { "epoch": 1.92, "grad_norm": 0.257003515958786, "learning_rate": 0.00014583798589062292, "loss": 0.0257, "step": 1083 }, { "epoch": 1.92, "grad_norm": 0.23509138822555542, "learning_rate": 0.00014541444003408682, "loss": 0.0548, "step": 1084 }, { "epoch": 1.92, "grad_norm": 0.3425995707511902, "learning_rate": 0.0001449912577337337, "loss": 0.0691, "step": 1085 }, { "epoch": 1.92, "grad_norm": 0.2606826722621918, "learning_rate": 0.00014456844046061332, "loss": 0.029, "step": 1086 }, { "epoch": 1.92, "grad_norm": 0.10555114597082138, "learning_rate": 0.00014414598968450615, "loss": 0.0166, "step": 1087 }, { "epoch": 1.92, "grad_norm": 0.47334909439086914, "learning_rate": 0.00014372390687391906, "loss": 0.0438, "step": 1088 }, { "epoch": 1.93, "grad_norm": 0.36925116181373596, "learning_rate": 0.00014330219349607947, "loss": 0.0163, "step": 1089 }, { "epoch": 1.93, "grad_norm": 0.2707056999206543, "learning_rate": 0.0001428808510169307, "loss": 0.0709, "step": 1090 }, { "epoch": 1.93, "grad_norm": 0.20645679533481598, "learning_rate": 0.00014245988090112694, "loss": 0.0351, "step": 1091 }, { "epoch": 1.93, "grad_norm": 0.1839297115802765, "learning_rate": 0.00014203928461202763, "loss": 0.025, "step": 1092 }, { "epoch": 1.93, "grad_norm": 0.6433751583099365, "learning_rate": 0.0001416190636116932, "loss": 0.0693, "step": 1093 }, { "epoch": 1.94, "grad_norm": 0.44755876064300537, "learning_rate": 0.00014119921936087907, "loss": 0.0788, "step": 1094 }, { "epoch": 1.94, "grad_norm": 0.3716716766357422, "learning_rate": 0.00014077975331903118, "loss": 0.0429, "step": 1095 }, { "epoch": 1.94, "grad_norm": 0.7285858392715454, "learning_rate": 0.00014036066694428096, "loss": 0.035, "step": 1096 }, { "epoch": 1.94, "grad_norm": 0.22279293835163116, "learning_rate": 0.00013994196169343963, "loss": 0.012, "step": 1097 }, { "epoch": 1.94, "grad_norm": 0.17774836719036102, "learning_rate": 0.00013952363902199405, "loss": 0.0238, "step": 1098 }, { "epoch": 1.94, "grad_norm": 0.2312237024307251, "learning_rate": 0.0001391057003841008, "loss": 0.0546, "step": 1099 }, { "epoch": 1.95, "grad_norm": 0.6004791855812073, "learning_rate": 0.0001386881472325816, "loss": 0.0625, "step": 1100 }, { "epoch": 1.95, "grad_norm": 0.21409562230110168, "learning_rate": 0.0001382709810189183, "loss": 0.034, "step": 1101 }, { "epoch": 1.95, "grad_norm": 0.295493483543396, "learning_rate": 0.00013785420319324744, "loss": 0.0332, "step": 1102 }, { "epoch": 1.95, "grad_norm": 0.5428887009620667, "learning_rate": 0.00013743781520435573, "loss": 0.0649, "step": 1103 }, { "epoch": 1.95, "grad_norm": 0.331990122795105, "learning_rate": 0.00013702181849967453, "loss": 0.046, "step": 1104 }, { "epoch": 1.95, "grad_norm": 0.4544171392917633, "learning_rate": 0.00013660621452527504, "loss": 0.0563, "step": 1105 }, { "epoch": 1.96, "grad_norm": 0.35486406087875366, "learning_rate": 0.0001361910047258635, "loss": 0.0583, "step": 1106 }, { "epoch": 1.96, "grad_norm": 0.24665361642837524, "learning_rate": 0.00013577619054477575, "loss": 0.0267, "step": 1107 }, { "epoch": 1.96, "grad_norm": 0.07276459783315659, "learning_rate": 0.00013536177342397243, "loss": 0.0064, "step": 1108 }, { "epoch": 1.96, "grad_norm": 0.4690609872341156, "learning_rate": 0.00013494775480403384, "loss": 0.0553, "step": 1109 }, { "epoch": 1.96, "grad_norm": 0.4010032117366791, "learning_rate": 0.00013453413612415512, "loss": 0.0514, "step": 1110 }, { "epoch": 1.97, "grad_norm": 0.3563205301761627, "learning_rate": 0.00013412091882214112, "loss": 0.0553, "step": 1111 }, { "epoch": 1.97, "grad_norm": 0.6027369499206543, "learning_rate": 0.00013370810433440167, "loss": 0.0677, "step": 1112 }, { "epoch": 1.97, "grad_norm": 0.5082702040672302, "learning_rate": 0.00013329569409594605, "loss": 0.1265, "step": 1113 }, { "epoch": 1.97, "grad_norm": 0.5313373804092407, "learning_rate": 0.00013288368954037834, "loss": 0.0234, "step": 1114 }, { "epoch": 1.97, "grad_norm": 0.33385488390922546, "learning_rate": 0.00013247209209989242, "loss": 0.0252, "step": 1115 }, { "epoch": 1.97, "grad_norm": 0.36459195613861084, "learning_rate": 0.00013206090320526704, "loss": 0.0211, "step": 1116 }, { "epoch": 1.98, "grad_norm": 0.5477709770202637, "learning_rate": 0.00013165012428586096, "loss": 0.0416, "step": 1117 }, { "epoch": 1.98, "grad_norm": 0.3133089542388916, "learning_rate": 0.0001312397567696074, "loss": 0.036, "step": 1118 }, { "epoch": 1.98, "grad_norm": 0.284045934677124, "learning_rate": 0.00013082980208300971, "loss": 0.0249, "step": 1119 }, { "epoch": 1.98, "grad_norm": 0.3401576578617096, "learning_rate": 0.00013042026165113618, "loss": 0.0281, "step": 1120 }, { "epoch": 1.98, "grad_norm": 0.21280981600284576, "learning_rate": 0.00013001113689761496, "loss": 0.0186, "step": 1121 }, { "epoch": 1.98, "grad_norm": 0.1294207125902176, "learning_rate": 0.00012960242924462957, "loss": 0.0156, "step": 1122 }, { "epoch": 1.99, "grad_norm": 0.11151756346225739, "learning_rate": 0.00012919414011291298, "loss": 0.0111, "step": 1123 }, { "epoch": 1.99, "grad_norm": 0.7448397874832153, "learning_rate": 0.0001287862709217439, "loss": 0.0898, "step": 1124 }, { "epoch": 1.99, "grad_norm": 0.4080904424190521, "learning_rate": 0.00012837882308894117, "loss": 0.0323, "step": 1125 }, { "epoch": 1.99, "grad_norm": 0.33506283164024353, "learning_rate": 0.00012797179803085862, "loss": 0.0309, "step": 1126 }, { "epoch": 1.99, "grad_norm": 0.32063427567481995, "learning_rate": 0.00012756519716238096, "loss": 0.0978, "step": 1127 }, { "epoch": 2.0, "grad_norm": 0.7773024439811707, "learning_rate": 0.0001271590218969176, "loss": 0.1017, "step": 1128 }, { "epoch": 2.0, "grad_norm": 0.2934909164905548, "learning_rate": 0.00012675327364639917, "loss": 0.0192, "step": 1129 }, { "epoch": 2.0, "grad_norm": 0.6137431859970093, "learning_rate": 0.0001263479538212717, "loss": 0.1013, "step": 1130 }, { "epoch": 2.0, "grad_norm": 0.2201017141342163, "learning_rate": 0.00012594306383049186, "loss": 0.0129, "step": 1131 }, { "epoch": 2.0, "grad_norm": 0.04556626081466675, "learning_rate": 0.00012553860508152212, "loss": 0.0066, "step": 1132 }, { "epoch": 2.0, "grad_norm": 0.17257572710514069, "learning_rate": 0.00012513457898032616, "loss": 0.0133, "step": 1133 }, { "epoch": 2.01, "grad_norm": 0.29365074634552, "learning_rate": 0.0001247309869313633, "loss": 0.0194, "step": 1134 }, { "epoch": 2.01, "grad_norm": 0.032978832721710205, "learning_rate": 0.00012432783033758447, "loss": 0.0047, "step": 1135 }, { "epoch": 2.01, "grad_norm": 0.05853278562426567, "learning_rate": 0.0001239251106004265, "loss": 0.0062, "step": 1136 }, { "epoch": 2.01, "eval_loss": 0.08742678165435791, "eval_runtime": 14.7019, "eval_samples_per_second": 32.445, "eval_steps_per_second": 8.162, "step": 1136 }, { "epoch": 2.01, "grad_norm": 0.1357499063014984, "learning_rate": 0.00012352282911980782, "loss": 0.0081, "step": 1137 }, { "epoch": 2.01, "grad_norm": 0.17858386039733887, "learning_rate": 0.00012312098729412346, "loss": 0.0148, "step": 1138 }, { "epoch": 2.02, "grad_norm": 0.12391608208417892, "learning_rate": 0.00012271958652023993, "loss": 0.0082, "step": 1139 }, { "epoch": 2.02, "grad_norm": 0.10268434137105942, "learning_rate": 0.0001223186281934909, "loss": 0.0068, "step": 1140 }, { "epoch": 2.02, "grad_norm": 0.26439937949180603, "learning_rate": 0.00012191811370767172, "loss": 0.0149, "step": 1141 }, { "epoch": 2.02, "grad_norm": 0.051827941089868546, "learning_rate": 0.00012151804445503492, "loss": 0.0066, "step": 1142 }, { "epoch": 2.02, "grad_norm": 0.354404091835022, "learning_rate": 0.00012111842182628555, "loss": 0.0539, "step": 1143 }, { "epoch": 2.02, "grad_norm": 0.036191366612911224, "learning_rate": 0.00012071924721057579, "loss": 0.0034, "step": 1144 }, { "epoch": 2.03, "grad_norm": 0.10083664208650589, "learning_rate": 0.00012032052199550083, "loss": 0.0096, "step": 1145 }, { "epoch": 2.03, "grad_norm": 0.1553259789943695, "learning_rate": 0.00011992224756709343, "loss": 0.0077, "step": 1146 }, { "epoch": 2.03, "grad_norm": 0.3661736845970154, "learning_rate": 0.00011952442530981921, "loss": 0.027, "step": 1147 }, { "epoch": 2.03, "grad_norm": 0.25327032804489136, "learning_rate": 0.00011912705660657244, "loss": 0.0212, "step": 1148 }, { "epoch": 2.03, "grad_norm": 0.30773070454597473, "learning_rate": 0.0001187301428386702, "loss": 0.0175, "step": 1149 }, { "epoch": 2.03, "grad_norm": 0.056706108152866364, "learning_rate": 0.00011833368538584863, "loss": 0.0056, "step": 1150 }, { "epoch": 2.04, "grad_norm": 0.24290138483047485, "learning_rate": 0.00011793768562625734, "loss": 0.0165, "step": 1151 }, { "epoch": 2.04, "grad_norm": 0.10750260949134827, "learning_rate": 0.00011754214493645493, "loss": 0.0075, "step": 1152 }, { "epoch": 2.04, "grad_norm": 0.07075980305671692, "learning_rate": 0.00011714706469140449, "loss": 0.0052, "step": 1153 }, { "epoch": 2.04, "grad_norm": 0.03554920107126236, "learning_rate": 0.0001167524462644681, "loss": 0.0043, "step": 1154 }, { "epoch": 2.04, "grad_norm": 0.10947668552398682, "learning_rate": 0.00011635829102740294, "loss": 0.0077, "step": 1155 }, { "epoch": 2.05, "grad_norm": 0.036122776567935944, "learning_rate": 0.00011596460035035572, "loss": 0.0018, "step": 1156 }, { "epoch": 2.05, "grad_norm": 0.036581866443157196, "learning_rate": 0.00011557137560185829, "loss": 0.0033, "step": 1157 }, { "epoch": 2.05, "grad_norm": 0.3237001597881317, "learning_rate": 0.00011517861814882308, "loss": 0.0203, "step": 1158 }, { "epoch": 2.05, "grad_norm": 0.3374278247356415, "learning_rate": 0.00011478632935653805, "loss": 0.0076, "step": 1159 }, { "epoch": 2.05, "grad_norm": 0.3109148144721985, "learning_rate": 0.0001143945105886619, "loss": 0.0252, "step": 1160 }, { "epoch": 2.05, "grad_norm": 0.1825418621301651, "learning_rate": 0.00011400316320721951, "loss": 0.0095, "step": 1161 }, { "epoch": 2.06, "grad_norm": 0.056046485900878906, "learning_rate": 0.00011361228857259709, "loss": 0.0036, "step": 1162 }, { "epoch": 2.06, "grad_norm": 0.10362865775823593, "learning_rate": 0.00011322188804353761, "loss": 0.0048, "step": 1163 }, { "epoch": 2.06, "grad_norm": 0.09634903073310852, "learning_rate": 0.00011283196297713608, "loss": 0.0056, "step": 1164 }, { "epoch": 2.06, "grad_norm": 0.0892496109008789, "learning_rate": 0.00011244251472883446, "loss": 0.0051, "step": 1165 }, { "epoch": 2.06, "grad_norm": 0.9342542886734009, "learning_rate": 0.00011205354465241732, "loss": 0.0462, "step": 1166 }, { "epoch": 2.06, "grad_norm": 0.08937977254390717, "learning_rate": 0.00011166505410000697, "loss": 0.0055, "step": 1167 }, { "epoch": 2.07, "grad_norm": 0.032398249953985214, "learning_rate": 0.00011127704442205897, "loss": 0.0027, "step": 1168 }, { "epoch": 2.07, "grad_norm": 0.20088808238506317, "learning_rate": 0.0001108895169673573, "loss": 0.0328, "step": 1169 }, { "epoch": 2.07, "grad_norm": 0.0885910764336586, "learning_rate": 0.00011050247308300945, "loss": 0.005, "step": 1170 }, { "epoch": 2.07, "grad_norm": 0.009513720870018005, "learning_rate": 0.00011011591411444199, "loss": 0.0009, "step": 1171 }, { "epoch": 2.07, "grad_norm": 0.04859737306833267, "learning_rate": 0.00010972984140539605, "loss": 0.0024, "step": 1172 }, { "epoch": 2.08, "grad_norm": 0.08774285018444061, "learning_rate": 0.00010934425629792214, "loss": 0.0048, "step": 1173 }, { "epoch": 2.08, "grad_norm": 0.09334032982587814, "learning_rate": 0.00010895916013237619, "loss": 0.0022, "step": 1174 }, { "epoch": 2.08, "grad_norm": 0.016344185918569565, "learning_rate": 0.00010857455424741388, "loss": 0.0014, "step": 1175 }, { "epoch": 2.08, "grad_norm": 0.06272843480110168, "learning_rate": 0.00010819043997998721, "loss": 0.0018, "step": 1176 }, { "epoch": 2.08, "grad_norm": 0.42132094502449036, "learning_rate": 0.00010780681866533897, "loss": 0.0221, "step": 1177 }, { "epoch": 2.08, "grad_norm": 0.2525283992290497, "learning_rate": 0.00010742369163699841, "loss": 0.0114, "step": 1178 }, { "epoch": 2.09, "grad_norm": 0.9688707590103149, "learning_rate": 0.00010704106022677645, "loss": 0.0166, "step": 1179 }, { "epoch": 2.09, "grad_norm": 0.14904755353927612, "learning_rate": 0.00010665892576476122, "loss": 0.0044, "step": 1180 }, { "epoch": 2.09, "grad_norm": 0.020122570917010307, "learning_rate": 0.00010627728957931346, "loss": 0.0017, "step": 1181 }, { "epoch": 2.09, "grad_norm": 0.44014814496040344, "learning_rate": 0.00010589615299706187, "loss": 0.0078, "step": 1182 }, { "epoch": 2.09, "grad_norm": 0.6460086107254028, "learning_rate": 0.00010551551734289827, "loss": 0.0398, "step": 1183 }, { "epoch": 2.09, "grad_norm": 0.1972631961107254, "learning_rate": 0.00010513538393997316, "loss": 0.0286, "step": 1184 }, { "epoch": 2.1, "grad_norm": 0.04835314676165581, "learning_rate": 0.00010475575410969138, "loss": 0.0026, "step": 1185 }, { "epoch": 2.1, "grad_norm": 0.4275042712688446, "learning_rate": 0.00010437662917170695, "loss": 0.0136, "step": 1186 }, { "epoch": 2.1, "grad_norm": 0.010206771083176136, "learning_rate": 0.00010399801044391918, "loss": 0.0007, "step": 1187 }, { "epoch": 2.1, "grad_norm": 0.01756274327635765, "learning_rate": 0.00010361989924246737, "loss": 0.0011, "step": 1188 }, { "epoch": 2.1, "grad_norm": 0.6749821901321411, "learning_rate": 0.00010324229688172665, "loss": 0.0332, "step": 1189 }, { "epoch": 2.11, "grad_norm": 0.1459697037935257, "learning_rate": 0.00010286520467430357, "loss": 0.0052, "step": 1190 }, { "epoch": 2.11, "grad_norm": 0.0659116879105568, "learning_rate": 0.00010248862393103092, "loss": 0.0037, "step": 1191 }, { "epoch": 2.11, "grad_norm": 0.11409203708171844, "learning_rate": 0.000102112555960964, "loss": 0.0046, "step": 1192 }, { "epoch": 2.11, "grad_norm": 0.19234444200992584, "learning_rate": 0.00010173700207137529, "loss": 0.0084, "step": 1193 }, { "epoch": 2.11, "grad_norm": 0.01539614424109459, "learning_rate": 0.00010136196356775024, "loss": 0.001, "step": 1194 }, { "epoch": 2.11, "grad_norm": 0.006582081783562899, "learning_rate": 0.00010098744175378308, "loss": 0.0005, "step": 1195 }, { "epoch": 2.12, "grad_norm": 0.5718114972114563, "learning_rate": 0.00010061343793137149, "loss": 0.0232, "step": 1196 }, { "epoch": 2.12, "grad_norm": 0.4466152787208557, "learning_rate": 0.00010023995340061292, "loss": 0.0192, "step": 1197 }, { "epoch": 2.12, "grad_norm": 0.3126049041748047, "learning_rate": 9.986698945979946e-05, "loss": 0.0057, "step": 1198 }, { "epoch": 2.12, "grad_norm": 0.7232750058174133, "learning_rate": 9.94945474054135e-05, "loss": 0.0372, "step": 1199 }, { "epoch": 2.12, "grad_norm": 0.00877452827990055, "learning_rate": 9.91226285321235e-05, "loss": 0.0005, "step": 1200 }, { "epoch": 2.12, "grad_norm": 0.023644492030143738, "learning_rate": 9.8751234132779e-05, "loss": 0.0012, "step": 1201 }, { "epoch": 2.13, "grad_norm": 0.008336883969604969, "learning_rate": 9.838036549840668e-05, "loss": 0.0005, "step": 1202 }, { "epoch": 2.13, "grad_norm": 0.1168123185634613, "learning_rate": 9.801002391820527e-05, "loss": 0.0036, "step": 1203 }, { "epoch": 2.13, "grad_norm": 0.4848583936691284, "learning_rate": 9.764021067954146e-05, "loss": 0.0177, "step": 1204 }, { "epoch": 2.13, "grad_norm": 0.32454514503479004, "learning_rate": 9.727092706794555e-05, "loss": 0.0056, "step": 1205 }, { "epoch": 2.13, "grad_norm": 0.1333390176296234, "learning_rate": 9.690217436710646e-05, "loss": 0.0026, "step": 1206 }, { "epoch": 2.14, "grad_norm": 0.09174130856990814, "learning_rate": 9.653395385886787e-05, "loss": 0.0062, "step": 1207 }, { "epoch": 2.14, "grad_norm": 0.01976648159325123, "learning_rate": 9.616626682322327e-05, "loss": 0.0008, "step": 1208 }, { "epoch": 2.14, "grad_norm": 0.2334078848361969, "learning_rate": 9.579911453831166e-05, "loss": 0.0069, "step": 1209 }, { "epoch": 2.14, "grad_norm": 0.010424863547086716, "learning_rate": 9.543249828041342e-05, "loss": 0.0007, "step": 1210 }, { "epoch": 2.14, "grad_norm": 0.06431838870048523, "learning_rate": 9.506641932394552e-05, "loss": 0.0021, "step": 1211 }, { "epoch": 2.14, "grad_norm": 0.2083590179681778, "learning_rate": 9.470087894145704e-05, "loss": 0.0092, "step": 1212 }, { "epoch": 2.15, "grad_norm": 0.03698310628533363, "learning_rate": 9.433587840362501e-05, "loss": 0.0018, "step": 1213 }, { "epoch": 2.15, "grad_norm": 0.836845874786377, "learning_rate": 9.397141897924974e-05, "loss": 0.0274, "step": 1214 }, { "epoch": 2.15, "grad_norm": 0.007941615767776966, "learning_rate": 9.360750193525076e-05, "loss": 0.0004, "step": 1215 }, { "epoch": 2.15, "grad_norm": 0.03961505368351936, "learning_rate": 9.324412853666217e-05, "loss": 0.0018, "step": 1216 }, { "epoch": 2.15, "grad_norm": 0.040892381221055984, "learning_rate": 9.28813000466281e-05, "loss": 0.0015, "step": 1217 }, { "epoch": 2.15, "grad_norm": 0.05060145631432533, "learning_rate": 9.25190177263986e-05, "loss": 0.0025, "step": 1218 }, { "epoch": 2.16, "grad_norm": 0.5681605935096741, "learning_rate": 9.215728283532502e-05, "loss": 0.0695, "step": 1219 }, { "epoch": 2.16, "grad_norm": 0.012955489568412304, "learning_rate": 9.179609663085595e-05, "loss": 0.0006, "step": 1220 }, { "epoch": 2.16, "grad_norm": 0.6021496653556824, "learning_rate": 9.143546036853279e-05, "loss": 0.0213, "step": 1221 }, { "epoch": 2.16, "grad_norm": 0.011097903363406658, "learning_rate": 9.107537530198464e-05, "loss": 0.0006, "step": 1222 }, { "epoch": 2.16, "grad_norm": 0.009882250800728798, "learning_rate": 9.071584268292515e-05, "loss": 0.0006, "step": 1223 }, { "epoch": 2.17, "grad_norm": 0.020904725417494774, "learning_rate": 9.035686376114749e-05, "loss": 0.0009, "step": 1224 }, { "epoch": 2.17, "grad_norm": 1.1304357051849365, "learning_rate": 8.999843978451977e-05, "loss": 0.0159, "step": 1225 }, { "epoch": 2.17, "grad_norm": 0.019047705456614494, "learning_rate": 8.964057199898148e-05, "loss": 0.0008, "step": 1226 }, { "epoch": 2.17, "grad_norm": 0.05119523033499718, "learning_rate": 8.928326164853811e-05, "loss": 0.0018, "step": 1227 }, { "epoch": 2.17, "grad_norm": 0.01547545101493597, "learning_rate": 8.892650997525794e-05, "loss": 0.0008, "step": 1228 }, { "epoch": 2.17, "grad_norm": 0.7677561044692993, "learning_rate": 8.857031821926711e-05, "loss": 0.0276, "step": 1229 }, { "epoch": 2.18, "grad_norm": 0.07431495934724808, "learning_rate": 8.821468761874518e-05, "loss": 0.0029, "step": 1230 }, { "epoch": 2.18, "grad_norm": 0.018387913703918457, "learning_rate": 8.785961940992118e-05, "loss": 0.0008, "step": 1231 }, { "epoch": 2.18, "grad_norm": 0.34750181436538696, "learning_rate": 8.75051148270691e-05, "loss": 0.009, "step": 1232 }, { "epoch": 2.18, "grad_norm": 0.36946621537208557, "learning_rate": 8.715117510250378e-05, "loss": 0.0281, "step": 1233 }, { "epoch": 2.18, "grad_norm": 0.03397729992866516, "learning_rate": 8.67978014665766e-05, "loss": 0.0012, "step": 1234 }, { "epoch": 2.18, "grad_norm": 0.12867552042007446, "learning_rate": 8.644499514767088e-05, "loss": 0.0062, "step": 1235 }, { "epoch": 2.19, "grad_norm": 0.010095584206283092, "learning_rate": 8.609275737219793e-05, "loss": 0.0006, "step": 1236 }, { "epoch": 2.19, "grad_norm": 0.028368016704916954, "learning_rate": 8.57410893645929e-05, "loss": 0.0014, "step": 1237 }, { "epoch": 2.19, "grad_norm": 0.12146025896072388, "learning_rate": 8.538999234731004e-05, "loss": 0.005, "step": 1238 }, { "epoch": 2.19, "grad_norm": 0.0617150254547596, "learning_rate": 8.50394675408191e-05, "loss": 0.0031, "step": 1239 }, { "epoch": 2.19, "grad_norm": 0.005746606737375259, "learning_rate": 8.468951616360038e-05, "loss": 0.0003, "step": 1240 }, { "epoch": 2.2, "grad_norm": 0.4718758761882782, "learning_rate": 8.434013943214097e-05, "loss": 0.0496, "step": 1241 }, { "epoch": 2.2, "grad_norm": 0.008204291574656963, "learning_rate": 8.399133856093061e-05, "loss": 0.0004, "step": 1242 }, { "epoch": 2.2, "grad_norm": 0.5281336307525635, "learning_rate": 8.36431147624569e-05, "loss": 0.0375, "step": 1243 }, { "epoch": 2.2, "grad_norm": 0.006096679251641035, "learning_rate": 8.329546924720177e-05, "loss": 0.0004, "step": 1244 }, { "epoch": 2.2, "grad_norm": 0.040774155408144, "learning_rate": 8.294840322363672e-05, "loss": 0.0015, "step": 1245 }, { "epoch": 2.2, "grad_norm": 0.060770515352487564, "learning_rate": 8.260191789821884e-05, "loss": 0.0016, "step": 1246 }, { "epoch": 2.21, "grad_norm": 0.2820521295070648, "learning_rate": 8.225601447538689e-05, "loss": 0.0085, "step": 1247 }, { "epoch": 2.21, "grad_norm": 0.04571341350674629, "learning_rate": 8.191069415755645e-05, "loss": 0.0018, "step": 1248 }, { "epoch": 2.21, "grad_norm": 0.015350689180195332, "learning_rate": 8.156595814511655e-05, "loss": 0.0011, "step": 1249 }, { "epoch": 2.21, "grad_norm": 0.40809088945388794, "learning_rate": 8.122180763642475e-05, "loss": 0.0129, "step": 1250 }, { "epoch": 2.21, "grad_norm": 0.030470022931694984, "learning_rate": 8.087824382780335e-05, "loss": 0.0014, "step": 1251 }, { "epoch": 2.21, "grad_norm": 0.11672550439834595, "learning_rate": 8.05352679135354e-05, "loss": 0.0031, "step": 1252 }, { "epoch": 2.22, "grad_norm": 0.009694907814264297, "learning_rate": 8.01928810858601e-05, "loss": 0.0006, "step": 1253 }, { "epoch": 2.22, "grad_norm": 0.03160417824983597, "learning_rate": 7.985108453496909e-05, "loss": 0.0015, "step": 1254 }, { "epoch": 2.22, "grad_norm": 0.3607199490070343, "learning_rate": 7.950987944900193e-05, "loss": 0.0141, "step": 1255 }, { "epoch": 2.22, "grad_norm": 0.12325584143400192, "learning_rate": 7.916926701404217e-05, "loss": 0.0052, "step": 1256 }, { "epoch": 2.22, "grad_norm": 0.03867779299616814, "learning_rate": 7.882924841411343e-05, "loss": 0.0016, "step": 1257 }, { "epoch": 2.23, "grad_norm": 0.008120791055262089, "learning_rate": 7.848982483117473e-05, "loss": 0.0004, "step": 1258 }, { "epoch": 2.23, "grad_norm": 0.12468260526657104, "learning_rate": 7.815099744511708e-05, "loss": 0.0053, "step": 1259 }, { "epoch": 2.23, "grad_norm": 0.006734863854944706, "learning_rate": 7.78127674337587e-05, "loss": 0.0005, "step": 1260 }, { "epoch": 2.23, "grad_norm": 0.11184979975223541, "learning_rate": 7.747513597284134e-05, "loss": 0.0023, "step": 1261 }, { "epoch": 2.23, "grad_norm": 0.030304264277219772, "learning_rate": 7.713810423602619e-05, "loss": 0.0018, "step": 1262 }, { "epoch": 2.23, "grad_norm": 0.4860706925392151, "learning_rate": 7.680167339488967e-05, "loss": 0.0358, "step": 1263 }, { "epoch": 2.24, "grad_norm": 0.08765775710344315, "learning_rate": 7.646584461891929e-05, "loss": 0.0031, "step": 1264 }, { "epoch": 2.24, "grad_norm": 0.06726156920194626, "learning_rate": 7.613061907550975e-05, "loss": 0.003, "step": 1265 }, { "epoch": 2.24, "grad_norm": 0.04926925525069237, "learning_rate": 7.579599792995872e-05, "loss": 0.0024, "step": 1266 }, { "epoch": 2.24, "grad_norm": 0.31866374611854553, "learning_rate": 7.546198234546309e-05, "loss": 0.0263, "step": 1267 }, { "epoch": 2.24, "grad_norm": 0.016148075461387634, "learning_rate": 7.512857348311466e-05, "loss": 0.0006, "step": 1268 }, { "epoch": 2.25, "grad_norm": 0.013526340946555138, "learning_rate": 7.479577250189606e-05, "loss": 0.0005, "step": 1269 }, { "epoch": 2.25, "grad_norm": 0.28299248218536377, "learning_rate": 7.446358055867688e-05, "loss": 0.0102, "step": 1270 }, { "epoch": 2.25, "grad_norm": 0.34623104333877563, "learning_rate": 7.413199880820953e-05, "loss": 0.0105, "step": 1271 }, { "epoch": 2.25, "grad_norm": 0.471078485250473, "learning_rate": 7.380102840312541e-05, "loss": 0.0406, "step": 1272 }, { "epoch": 2.25, "grad_norm": 0.015806253999471664, "learning_rate": 7.347067049393091e-05, "loss": 0.0008, "step": 1273 }, { "epoch": 2.25, "grad_norm": 0.2348398119211197, "learning_rate": 7.314092622900285e-05, "loss": 0.0119, "step": 1274 }, { "epoch": 2.26, "grad_norm": 0.018902868032455444, "learning_rate": 7.281179675458527e-05, "loss": 0.0008, "step": 1275 }, { "epoch": 2.26, "grad_norm": 0.07969076931476593, "learning_rate": 7.248328321478512e-05, "loss": 0.0026, "step": 1276 }, { "epoch": 2.26, "grad_norm": 0.09565503150224686, "learning_rate": 7.215538675156804e-05, "loss": 0.0018, "step": 1277 }, { "epoch": 2.26, "grad_norm": 0.10772550851106644, "learning_rate": 7.182810850475494e-05, "loss": 0.0024, "step": 1278 }, { "epoch": 2.26, "eval_loss": 0.10833004117012024, "eval_runtime": 14.7071, "eval_samples_per_second": 32.433, "eval_steps_per_second": 8.159, "step": 1278 }, { "epoch": 2.26, "grad_norm": 0.16737312078475952, "learning_rate": 7.15014496120172e-05, "loss": 0.005, "step": 1279 }, { "epoch": 2.26, "grad_norm": 0.013761342503130436, "learning_rate": 7.11754112088737e-05, "loss": 0.0006, "step": 1280 }, { "epoch": 2.27, "grad_norm": 0.32918259501457214, "learning_rate": 7.084999442868629e-05, "loss": 0.007, "step": 1281 }, { "epoch": 2.27, "grad_norm": 0.009015708230435848, "learning_rate": 7.052520040265581e-05, "loss": 0.0005, "step": 1282 }, { "epoch": 2.27, "grad_norm": 0.03732029348611832, "learning_rate": 7.020103025981839e-05, "loss": 0.0011, "step": 1283 }, { "epoch": 2.27, "grad_norm": 0.010399947874248028, "learning_rate": 6.987748512704143e-05, "loss": 0.0005, "step": 1284 }, { "epoch": 2.27, "grad_norm": 0.34805890917778015, "learning_rate": 6.955456612901973e-05, "loss": 0.014, "step": 1285 }, { "epoch": 2.28, "grad_norm": 0.03678420931100845, "learning_rate": 6.923227438827159e-05, "loss": 0.0011, "step": 1286 }, { "epoch": 2.28, "grad_norm": 0.09479320049285889, "learning_rate": 6.891061102513479e-05, "loss": 0.002, "step": 1287 }, { "epoch": 2.28, "grad_norm": 0.09461376816034317, "learning_rate": 6.858957715776265e-05, "loss": 0.0032, "step": 1288 }, { "epoch": 2.28, "grad_norm": 0.042890515178442, "learning_rate": 6.826917390212056e-05, "loss": 0.0014, "step": 1289 }, { "epoch": 2.28, "grad_norm": 0.5706468224525452, "learning_rate": 6.79494023719815e-05, "loss": 0.0081, "step": 1290 }, { "epoch": 2.28, "grad_norm": 0.01644587516784668, "learning_rate": 6.763026367892269e-05, "loss": 0.0009, "step": 1291 }, { "epoch": 2.29, "grad_norm": 0.020033186301589012, "learning_rate": 6.731175893232141e-05, "loss": 0.001, "step": 1292 }, { "epoch": 2.29, "grad_norm": 0.13267932832241058, "learning_rate": 6.699388923935118e-05, "loss": 0.0016, "step": 1293 }, { "epoch": 2.29, "grad_norm": 0.01488049328327179, "learning_rate": 6.667665570497813e-05, "loss": 0.0004, "step": 1294 }, { "epoch": 2.29, "grad_norm": 0.07691626995801926, "learning_rate": 6.636005943195683e-05, "loss": 0.0023, "step": 1295 }, { "epoch": 2.29, "grad_norm": 0.36601608991622925, "learning_rate": 6.604410152082683e-05, "loss": 0.0058, "step": 1296 }, { "epoch": 2.29, "grad_norm": 0.053158730268478394, "learning_rate": 6.57287830699084e-05, "loss": 0.0013, "step": 1297 }, { "epoch": 2.3, "grad_norm": 0.0060308403335511684, "learning_rate": 6.541410517529906e-05, "loss": 0.0002, "step": 1298 }, { "epoch": 2.3, "grad_norm": 0.06450387835502625, "learning_rate": 6.510006893086973e-05, "loss": 0.0016, "step": 1299 }, { "epoch": 2.3, "grad_norm": 0.0036570043303072453, "learning_rate": 6.478667542826064e-05, "loss": 0.0002, "step": 1300 }, { "epoch": 2.3, "grad_norm": 0.010067092254757881, "learning_rate": 6.447392575687805e-05, "loss": 0.0003, "step": 1301 }, { "epoch": 2.3, "grad_norm": 0.0020321488846093416, "learning_rate": 6.41618210038899e-05, "loss": 0.0001, "step": 1302 }, { "epoch": 2.31, "grad_norm": 0.07873855531215668, "learning_rate": 6.38503622542223e-05, "loss": 0.0023, "step": 1303 }, { "epoch": 2.31, "grad_norm": 0.021666008979082108, "learning_rate": 6.353955059055597e-05, "loss": 0.0006, "step": 1304 }, { "epoch": 2.31, "grad_norm": 0.1410478949546814, "learning_rate": 6.322938709332196e-05, "loss": 0.0031, "step": 1305 }, { "epoch": 2.31, "grad_norm": 0.01548179890960455, "learning_rate": 6.291987284069849e-05, "loss": 0.0006, "step": 1306 }, { "epoch": 2.31, "grad_norm": 0.10515931993722916, "learning_rate": 6.261100890860668e-05, "loss": 0.0024, "step": 1307 }, { "epoch": 2.31, "grad_norm": 0.05864081159234047, "learning_rate": 6.230279637070704e-05, "loss": 0.0017, "step": 1308 }, { "epoch": 2.32, "grad_norm": 0.007448627147823572, "learning_rate": 6.199523629839591e-05, "loss": 0.0003, "step": 1309 }, { "epoch": 2.32, "grad_norm": 0.1981428563594818, "learning_rate": 6.168832976080133e-05, "loss": 0.0027, "step": 1310 }, { "epoch": 2.32, "grad_norm": 0.002334183780476451, "learning_rate": 6.138207782477976e-05, "loss": 0.0001, "step": 1311 }, { "epoch": 2.32, "grad_norm": 0.12165654450654984, "learning_rate": 6.107648155491202e-05, "loss": 0.0029, "step": 1312 }, { "epoch": 2.32, "grad_norm": 0.038640353828668594, "learning_rate": 6.077154201349966e-05, "loss": 0.0008, "step": 1313 }, { "epoch": 2.32, "grad_norm": 0.004505124408751726, "learning_rate": 6.046726026056154e-05, "loss": 0.0002, "step": 1314 }, { "epoch": 2.33, "grad_norm": 0.3108009696006775, "learning_rate": 6.01636373538299e-05, "loss": 0.0064, "step": 1315 }, { "epoch": 2.33, "grad_norm": 0.003968897275626659, "learning_rate": 5.986067434874662e-05, "loss": 0.0002, "step": 1316 }, { "epoch": 2.33, "grad_norm": 0.026421288028359413, "learning_rate": 5.955837229845965e-05, "loss": 0.0007, "step": 1317 }, { "epoch": 2.33, "grad_norm": 0.007577298209071159, "learning_rate": 5.925673225381939e-05, "loss": 0.0003, "step": 1318 }, { "epoch": 2.33, "grad_norm": 0.009191282093524933, "learning_rate": 5.89557552633751e-05, "loss": 0.0003, "step": 1319 }, { "epoch": 2.34, "grad_norm": 0.0959949716925621, "learning_rate": 5.865544237337117e-05, "loss": 0.0015, "step": 1320 }, { "epoch": 2.34, "grad_norm": 0.014931642450392246, "learning_rate": 5.835579462774312e-05, "loss": 0.0005, "step": 1321 }, { "epoch": 2.34, "grad_norm": 0.023637857288122177, "learning_rate": 5.80568130681148e-05, "loss": 0.0003, "step": 1322 }, { "epoch": 2.34, "grad_norm": 0.07706478983163834, "learning_rate": 5.775849873379393e-05, "loss": 0.0022, "step": 1323 }, { "epoch": 2.34, "grad_norm": 0.3625439703464508, "learning_rate": 5.746085266176907e-05, "loss": 0.0146, "step": 1324 }, { "epoch": 2.34, "grad_norm": 0.12187057733535767, "learning_rate": 5.7163875886705824e-05, "loss": 0.0036, "step": 1325 }, { "epoch": 2.35, "grad_norm": 0.0629158765077591, "learning_rate": 5.686756944094282e-05, "loss": 0.0018, "step": 1326 }, { "epoch": 2.35, "grad_norm": 0.0620671845972538, "learning_rate": 5.657193435448896e-05, "loss": 0.0012, "step": 1327 }, { "epoch": 2.35, "grad_norm": 0.00476891128346324, "learning_rate": 5.627697165501927e-05, "loss": 0.0002, "step": 1328 }, { "epoch": 2.35, "grad_norm": 0.019357487559318542, "learning_rate": 5.598268236787138e-05, "loss": 0.0005, "step": 1329 }, { "epoch": 2.35, "grad_norm": 0.21635344624519348, "learning_rate": 5.5689067516041994e-05, "loss": 0.007, "step": 1330 }, { "epoch": 2.35, "grad_norm": 0.6375563740730286, "learning_rate": 5.539612812018344e-05, "loss": 0.0377, "step": 1331 }, { "epoch": 2.36, "grad_norm": 0.0075180609710514545, "learning_rate": 5.5103865198600085e-05, "loss": 0.0003, "step": 1332 }, { "epoch": 2.36, "grad_norm": 0.030566420406103134, "learning_rate": 5.481227976724476e-05, "loss": 0.0006, "step": 1333 }, { "epoch": 2.36, "grad_norm": 0.3805221617221832, "learning_rate": 5.45213728397152e-05, "loss": 0.0245, "step": 1334 }, { "epoch": 2.36, "grad_norm": 0.033048298209905624, "learning_rate": 5.423114542725049e-05, "loss": 0.0006, "step": 1335 }, { "epoch": 2.36, "grad_norm": 0.18197427690029144, "learning_rate": 5.3941598538727625e-05, "loss": 0.0034, "step": 1336 }, { "epoch": 2.37, "grad_norm": 0.009043999947607517, "learning_rate": 5.365273318065811e-05, "loss": 0.0004, "step": 1337 }, { "epoch": 2.37, "grad_norm": 0.036380622535943985, "learning_rate": 5.3364550357184325e-05, "loss": 0.0012, "step": 1338 }, { "epoch": 2.37, "grad_norm": 0.02875285968184471, "learning_rate": 5.307705107007593e-05, "loss": 0.0007, "step": 1339 }, { "epoch": 2.37, "grad_norm": 0.006231918465346098, "learning_rate": 5.2790236318726484e-05, "loss": 0.0003, "step": 1340 }, { "epoch": 2.37, "grad_norm": 0.012466797605156898, "learning_rate": 5.2504107100150245e-05, "loss": 0.0006, "step": 1341 }, { "epoch": 2.37, "grad_norm": 0.013808784075081348, "learning_rate": 5.221866440897807e-05, "loss": 0.0004, "step": 1342 }, { "epoch": 2.38, "grad_norm": 0.5687355995178223, "learning_rate": 5.193390923745475e-05, "loss": 0.0199, "step": 1343 }, { "epoch": 2.38, "grad_norm": 0.0935133546590805, "learning_rate": 5.1649842575434844e-05, "loss": 0.0036, "step": 1344 }, { "epoch": 2.38, "grad_norm": 0.6903636455535889, "learning_rate": 5.136646541037956e-05, "loss": 0.0154, "step": 1345 }, { "epoch": 2.38, "grad_norm": 0.07615023851394653, "learning_rate": 5.108377872735351e-05, "loss": 0.002, "step": 1346 }, { "epoch": 2.38, "grad_norm": 0.017100483179092407, "learning_rate": 5.0801783509020844e-05, "loss": 0.0005, "step": 1347 }, { "epoch": 2.38, "grad_norm": 0.01030554249882698, "learning_rate": 5.052048073564228e-05, "loss": 0.0004, "step": 1348 }, { "epoch": 2.39, "grad_norm": 0.33771464228630066, "learning_rate": 5.023987138507133e-05, "loss": 0.0053, "step": 1349 }, { "epoch": 2.39, "grad_norm": 0.03562993183732033, "learning_rate": 4.995995643275103e-05, "loss": 0.0012, "step": 1350 }, { "epoch": 2.39, "grad_norm": 0.037580545991659164, "learning_rate": 4.968073685171082e-05, "loss": 0.0009, "step": 1351 }, { "epoch": 2.39, "grad_norm": 0.018147172406315804, "learning_rate": 4.940221361256259e-05, "loss": 0.0004, "step": 1352 }, { "epoch": 2.39, "grad_norm": 0.1366257667541504, "learning_rate": 4.912438768349792e-05, "loss": 0.0039, "step": 1353 }, { "epoch": 2.4, "grad_norm": 0.020096778869628906, "learning_rate": 4.884726003028428e-05, "loss": 0.0004, "step": 1354 }, { "epoch": 2.4, "grad_norm": 0.005000903271138668, "learning_rate": 4.8570831616261745e-05, "loss": 0.0002, "step": 1355 }, { "epoch": 2.4, "grad_norm": 0.02229972742497921, "learning_rate": 4.829510340234e-05, "loss": 0.0008, "step": 1356 }, { "epoch": 2.4, "grad_norm": 0.04677216708660126, "learning_rate": 4.802007634699437e-05, "loss": 0.0008, "step": 1357 }, { "epoch": 2.4, "grad_norm": 0.008060449734330177, "learning_rate": 4.7745751406263163e-05, "loss": 0.0002, "step": 1358 }, { "epoch": 2.4, "grad_norm": 0.004165045917034149, "learning_rate": 4.74721295337438e-05, "loss": 0.0003, "step": 1359 }, { "epoch": 2.41, "grad_norm": 0.2668968439102173, "learning_rate": 4.719921168058977e-05, "loss": 0.0055, "step": 1360 }, { "epoch": 2.41, "grad_norm": 0.33916279673576355, "learning_rate": 4.6926998795507406e-05, "loss": 0.0061, "step": 1361 }, { "epoch": 2.41, "grad_norm": 0.023179393261671066, "learning_rate": 4.6655491824752263e-05, "loss": 0.0006, "step": 1362 }, { "epoch": 2.41, "grad_norm": 0.01882367953658104, "learning_rate": 4.6384691712126225e-05, "loss": 0.0007, "step": 1363 }, { "epoch": 2.41, "grad_norm": 0.013556770980358124, "learning_rate": 4.611459939897386e-05, "loss": 0.0003, "step": 1364 }, { "epoch": 2.41, "grad_norm": 0.215042382478714, "learning_rate": 4.5845215824179335e-05, "loss": 0.0051, "step": 1365 }, { "epoch": 2.42, "grad_norm": 0.02435019426047802, "learning_rate": 4.557654192416319e-05, "loss": 0.0004, "step": 1366 }, { "epoch": 2.42, "grad_norm": 0.017334023490548134, "learning_rate": 4.530857863287913e-05, "loss": 0.0005, "step": 1367 }, { "epoch": 2.42, "grad_norm": 0.20091189444065094, "learning_rate": 4.5041326881810395e-05, "loss": 0.0069, "step": 1368 }, { "epoch": 2.42, "grad_norm": 0.05958046764135361, "learning_rate": 4.4774787599967004e-05, "loss": 0.0009, "step": 1369 }, { "epoch": 2.42, "grad_norm": 0.08757390081882477, "learning_rate": 4.450896171388219e-05, "loss": 0.0025, "step": 1370 }, { "epoch": 2.43, "grad_norm": 0.028023116290569305, "learning_rate": 4.424385014760937e-05, "loss": 0.0008, "step": 1371 }, { "epoch": 2.43, "grad_norm": 0.022646216675639153, "learning_rate": 4.397945382271909e-05, "loss": 0.0004, "step": 1372 }, { "epoch": 2.43, "grad_norm": 0.04051864892244339, "learning_rate": 4.37157736582951e-05, "loss": 0.0009, "step": 1373 }, { "epoch": 2.43, "grad_norm": 0.7693873643875122, "learning_rate": 4.3452810570932115e-05, "loss": 0.0696, "step": 1374 }, { "epoch": 2.43, "grad_norm": 0.0007687499164603651, "learning_rate": 4.3190565474731904e-05, "loss": 0.0, "step": 1375 }, { "epoch": 2.43, "grad_norm": 0.00293734110891819, "learning_rate": 4.292903928130054e-05, "loss": 0.0001, "step": 1376 }, { "epoch": 2.44, "grad_norm": 0.6718714237213135, "learning_rate": 4.266823289974517e-05, "loss": 0.0105, "step": 1377 }, { "epoch": 2.44, "grad_norm": 0.003744264366105199, "learning_rate": 4.240814723667033e-05, "loss": 0.0001, "step": 1378 }, { "epoch": 2.44, "grad_norm": 0.002130451612174511, "learning_rate": 4.214878319617568e-05, "loss": 0.0001, "step": 1379 }, { "epoch": 2.44, "grad_norm": 0.02480950951576233, "learning_rate": 4.189014167985225e-05, "loss": 0.0004, "step": 1380 }, { "epoch": 2.44, "grad_norm": 0.00624463614076376, "learning_rate": 4.163222358677937e-05, "loss": 0.0002, "step": 1381 }, { "epoch": 2.44, "grad_norm": 0.0020641738083213568, "learning_rate": 4.137502981352173e-05, "loss": 0.0001, "step": 1382 }, { "epoch": 2.45, "grad_norm": 0.14240607619285583, "learning_rate": 4.111856125412608e-05, "loss": 0.0036, "step": 1383 }, { "epoch": 2.45, "grad_norm": 0.006745543330907822, "learning_rate": 4.086281880011833e-05, "loss": 0.0002, "step": 1384 }, { "epoch": 2.45, "grad_norm": 0.015076296404004097, "learning_rate": 4.060780334050032e-05, "loss": 0.0005, "step": 1385 }, { "epoch": 2.45, "grad_norm": 0.008114277385175228, "learning_rate": 4.035351576174667e-05, "loss": 0.0003, "step": 1386 }, { "epoch": 2.45, "grad_norm": 0.014272456988692284, "learning_rate": 4.0099956947801745e-05, "loss": 0.0003, "step": 1387 }, { "epoch": 2.46, "grad_norm": 0.2593598961830139, "learning_rate": 3.9847127780076626e-05, "loss": 0.0062, "step": 1388 }, { "epoch": 2.46, "grad_norm": 0.011931393295526505, "learning_rate": 3.959502913744614e-05, "loss": 0.0004, "step": 1389 }, { "epoch": 2.46, "grad_norm": 0.37758180499076843, "learning_rate": 3.934366189624561e-05, "loss": 0.0124, "step": 1390 }, { "epoch": 2.46, "grad_norm": 0.7305591702461243, "learning_rate": 3.9093026930267864e-05, "loss": 0.0191, "step": 1391 }, { "epoch": 2.46, "grad_norm": 0.3506815433502197, "learning_rate": 3.8843125110760186e-05, "loss": 0.004, "step": 1392 }, { "epoch": 2.46, "grad_norm": 1.1179872751235962, "learning_rate": 3.859395730642151e-05, "loss": 0.1287, "step": 1393 }, { "epoch": 2.47, "grad_norm": 0.07131043821573257, "learning_rate": 3.8345524383398974e-05, "loss": 0.0015, "step": 1394 }, { "epoch": 2.47, "grad_norm": 0.10362354665994644, "learning_rate": 3.8097827205285374e-05, "loss": 0.002, "step": 1395 }, { "epoch": 2.47, "grad_norm": 0.035806868225336075, "learning_rate": 3.7850866633115767e-05, "loss": 0.0006, "step": 1396 }, { "epoch": 2.47, "grad_norm": 0.09917337447404861, "learning_rate": 3.760464352536469e-05, "loss": 0.0013, "step": 1397 }, { "epoch": 2.47, "grad_norm": 0.013792157173156738, "learning_rate": 3.735915873794327e-05, "loss": 0.0005, "step": 1398 }, { "epoch": 2.48, "grad_norm": 0.11405621469020844, "learning_rate": 3.711441312419589e-05, "loss": 0.0017, "step": 1399 }, { "epoch": 2.48, "grad_norm": 0.37025147676467896, "learning_rate": 3.687040753489765e-05, "loss": 0.0066, "step": 1400 }, { "epoch": 2.48, "grad_norm": 0.005007702391594648, "learning_rate": 3.662714281825111e-05, "loss": 0.0001, "step": 1401 }, { "epoch": 2.48, "grad_norm": 0.035408273339271545, "learning_rate": 3.6384619819883335e-05, "loss": 0.0005, "step": 1402 }, { "epoch": 2.48, "grad_norm": 1.066094160079956, "learning_rate": 3.614283938284332e-05, "loss": 0.0229, "step": 1403 }, { "epoch": 2.48, "grad_norm": 0.06345271319150925, "learning_rate": 3.590180234759857e-05, "loss": 0.0012, "step": 1404 }, { "epoch": 2.49, "grad_norm": 0.13740839064121246, "learning_rate": 3.566150955203251e-05, "loss": 0.003, "step": 1405 }, { "epoch": 2.49, "grad_norm": 0.18154090642929077, "learning_rate": 3.542196183144148e-05, "loss": 0.0045, "step": 1406 }, { "epoch": 2.49, "grad_norm": 1.66633141040802, "learning_rate": 3.518316001853164e-05, "loss": 0.0242, "step": 1407 }, { "epoch": 2.49, "grad_norm": 0.054164353758096695, "learning_rate": 3.494510494341657e-05, "loss": 0.0022, "step": 1408 }, { "epoch": 2.49, "grad_norm": 0.011136863380670547, "learning_rate": 3.470779743361374e-05, "loss": 0.0005, "step": 1409 }, { "epoch": 2.49, "grad_norm": 0.011947316117584705, "learning_rate": 3.447123831404228e-05, "loss": 0.0005, "step": 1410 }, { "epoch": 2.5, "grad_norm": 0.17515721917152405, "learning_rate": 3.423542840701957e-05, "loss": 0.0035, "step": 1411 }, { "epoch": 2.5, "grad_norm": 0.014279712922871113, "learning_rate": 3.4000368532258604e-05, "loss": 0.0006, "step": 1412 }, { "epoch": 2.5, "grad_norm": 0.004492651205509901, "learning_rate": 3.376605950686532e-05, "loss": 0.0003, "step": 1413 }, { "epoch": 2.5, "grad_norm": 0.012061775662004948, "learning_rate": 3.3532502145335373e-05, "loss": 0.0004, "step": 1414 }, { "epoch": 2.5, "grad_norm": 0.019419865682721138, "learning_rate": 3.3299697259551706e-05, "loss": 0.0007, "step": 1415 }, { "epoch": 2.51, "grad_norm": 0.18258291482925415, "learning_rate": 3.3067645658781425e-05, "loss": 0.004, "step": 1416 }, { "epoch": 2.51, "grad_norm": 0.2847168445587158, "learning_rate": 3.283634814967307e-05, "loss": 0.0092, "step": 1417 }, { "epoch": 2.51, "grad_norm": 0.4876096248626709, "learning_rate": 3.260580553625389e-05, "loss": 0.0131, "step": 1418 }, { "epoch": 2.51, "grad_norm": 0.6045449376106262, "learning_rate": 3.237601861992709e-05, "loss": 0.0193, "step": 1419 }, { "epoch": 2.51, "grad_norm": 0.1586480587720871, "learning_rate": 3.214698819946879e-05, "loss": 0.0026, "step": 1420 }, { "epoch": 2.51, "eval_loss": 0.12486789375543594, "eval_runtime": 14.7046, "eval_samples_per_second": 32.439, "eval_steps_per_second": 8.161, "step": 1420 }, { "epoch": 2.51, "grad_norm": 0.12747687101364136, "learning_rate": 3.191871507102545e-05, "loss": 0.0017, "step": 1421 }, { "epoch": 2.52, "grad_norm": 0.04522531479597092, "learning_rate": 3.1691200028111044e-05, "loss": 0.0009, "step": 1422 }, { "epoch": 2.52, "grad_norm": 0.2810252904891968, "learning_rate": 3.146444386160441e-05, "loss": 0.0041, "step": 1423 }, { "epoch": 2.52, "grad_norm": 0.06912285834550858, "learning_rate": 3.123844735974646e-05, "loss": 0.0008, "step": 1424 }, { "epoch": 2.52, "grad_norm": 0.3842872679233551, "learning_rate": 3.1013211308137054e-05, "loss": 0.0075, "step": 1425 }, { "epoch": 2.52, "grad_norm": 0.018251223489642143, "learning_rate": 3.078873648973304e-05, "loss": 0.0003, "step": 1426 }, { "epoch": 2.52, "grad_norm": 0.005700815003365278, "learning_rate": 3.0565023684844765e-05, "loss": 0.0002, "step": 1427 }, { "epoch": 2.53, "grad_norm": 1.1778111457824707, "learning_rate": 3.034207367113387e-05, "loss": 0.0241, "step": 1428 }, { "epoch": 2.53, "grad_norm": 0.5692623853683472, "learning_rate": 3.0119887223610475e-05, "loss": 0.0074, "step": 1429 }, { "epoch": 2.53, "grad_norm": 0.007092227227985859, "learning_rate": 2.9898465114630123e-05, "loss": 0.0003, "step": 1430 }, { "epoch": 2.53, "grad_norm": 0.006958132144063711, "learning_rate": 2.9677808113891675e-05, "loss": 0.0003, "step": 1431 }, { "epoch": 2.53, "grad_norm": 0.2849915027618408, "learning_rate": 2.945791698843431e-05, "loss": 0.0114, "step": 1432 }, { "epoch": 2.54, "grad_norm": 0.3223225176334381, "learning_rate": 2.9238792502634782e-05, "loss": 0.0105, "step": 1433 }, { "epoch": 2.54, "grad_norm": 0.3097008466720581, "learning_rate": 2.902043541820501e-05, "loss": 0.0067, "step": 1434 }, { "epoch": 2.54, "grad_norm": 0.08397488296031952, "learning_rate": 2.880284649418913e-05, "loss": 0.0029, "step": 1435 }, { "epoch": 2.54, "grad_norm": 0.029154475778341293, "learning_rate": 2.8586026486961235e-05, "loss": 0.0008, "step": 1436 }, { "epoch": 2.54, "grad_norm": 0.02804793231189251, "learning_rate": 2.836997615022249e-05, "loss": 0.0007, "step": 1437 }, { "epoch": 2.54, "grad_norm": 0.008161618374288082, "learning_rate": 2.8154696234998472e-05, "loss": 0.0003, "step": 1438 }, { "epoch": 2.55, "grad_norm": 0.055503591895103455, "learning_rate": 2.7940187489636697e-05, "loss": 0.0015, "step": 1439 }, { "epoch": 2.55, "grad_norm": 0.02855825237929821, "learning_rate": 2.7726450659803947e-05, "loss": 0.0008, "step": 1440 }, { "epoch": 2.55, "grad_norm": 0.01375576015561819, "learning_rate": 2.751348648848373e-05, "loss": 0.0004, "step": 1441 }, { "epoch": 2.55, "grad_norm": 0.0038817201275378466, "learning_rate": 2.7301295715973757e-05, "loss": 0.0001, "step": 1442 }, { "epoch": 2.55, "grad_norm": 0.023767409846186638, "learning_rate": 2.708987907988314e-05, "loss": 0.0007, "step": 1443 }, { "epoch": 2.55, "grad_norm": 0.06856090575456619, "learning_rate": 2.687923731512995e-05, "loss": 0.0016, "step": 1444 }, { "epoch": 2.56, "grad_norm": 0.001726289396174252, "learning_rate": 2.666937115393886e-05, "loss": 0.0001, "step": 1445 }, { "epoch": 2.56, "grad_norm": 0.011897512711584568, "learning_rate": 2.646028132583822e-05, "loss": 0.0003, "step": 1446 }, { "epoch": 2.56, "grad_norm": 0.011256224475800991, "learning_rate": 2.6251968557657908e-05, "loss": 0.0004, "step": 1447 }, { "epoch": 2.56, "grad_norm": 0.09208813309669495, "learning_rate": 2.6044433573526454e-05, "loss": 0.0021, "step": 1448 }, { "epoch": 2.56, "grad_norm": 0.028126435354351997, "learning_rate": 2.58376770948687e-05, "loss": 0.0003, "step": 1449 }, { "epoch": 2.57, "grad_norm": 0.02156115137040615, "learning_rate": 2.5631699840403476e-05, "loss": 0.0007, "step": 1450 }, { "epoch": 2.57, "grad_norm": 0.039257489144802094, "learning_rate": 2.542650252614062e-05, "loss": 0.0013, "step": 1451 }, { "epoch": 2.57, "grad_norm": 1.2106196880340576, "learning_rate": 2.5222085865379023e-05, "loss": 0.0606, "step": 1452 }, { "epoch": 2.57, "grad_norm": 0.011918625794351101, "learning_rate": 2.501845056870375e-05, "loss": 0.0002, "step": 1453 }, { "epoch": 2.57, "grad_norm": 0.35332852602005005, "learning_rate": 2.4815597343983697e-05, "loss": 0.0813, "step": 1454 }, { "epoch": 2.57, "grad_norm": 0.0004311532247811556, "learning_rate": 2.4613526896369308e-05, "loss": 0.0, "step": 1455 }, { "epoch": 2.58, "grad_norm": 0.20872049033641815, "learning_rate": 2.441223992828978e-05, "loss": 0.0047, "step": 1456 }, { "epoch": 2.58, "grad_norm": 1.3243696689605713, "learning_rate": 2.421173713945099e-05, "loss": 0.024, "step": 1457 }, { "epoch": 2.58, "grad_norm": 0.005421569105237722, "learning_rate": 2.4012019226832772e-05, "loss": 0.0002, "step": 1458 }, { "epoch": 2.58, "grad_norm": 0.3085567355155945, "learning_rate": 2.381308688468656e-05, "loss": 0.0047, "step": 1459 }, { "epoch": 2.58, "grad_norm": 0.002236233791336417, "learning_rate": 2.361494080453319e-05, "loss": 0.0001, "step": 1460 }, { "epoch": 2.58, "grad_norm": 0.5045457482337952, "learning_rate": 2.3417581675160088e-05, "loss": 0.0054, "step": 1461 }, { "epoch": 2.59, "grad_norm": 0.0775829628109932, "learning_rate": 2.3221010182619406e-05, "loss": 0.0014, "step": 1462 }, { "epoch": 2.59, "grad_norm": 0.47293341159820557, "learning_rate": 2.302522701022511e-05, "loss": 0.0563, "step": 1463 }, { "epoch": 2.59, "grad_norm": 0.30130696296691895, "learning_rate": 2.2830232838550845e-05, "loss": 0.0057, "step": 1464 }, { "epoch": 2.59, "grad_norm": 0.0052946461364626884, "learning_rate": 2.2636028345427745e-05, "loss": 0.0002, "step": 1465 }, { "epoch": 2.59, "grad_norm": 0.017164453864097595, "learning_rate": 2.244261420594168e-05, "loss": 0.0005, "step": 1466 }, { "epoch": 2.6, "grad_norm": 0.8137333393096924, "learning_rate": 2.224999109243131e-05, "loss": 0.039, "step": 1467 }, { "epoch": 2.6, "grad_norm": 0.19935303926467896, "learning_rate": 2.205815967448546e-05, "loss": 0.0056, "step": 1468 }, { "epoch": 2.6, "grad_norm": 0.3961120545864105, "learning_rate": 2.1867120618940833e-05, "loss": 0.0056, "step": 1469 }, { "epoch": 2.6, "grad_norm": 0.019902199506759644, "learning_rate": 2.167687458987991e-05, "loss": 0.0007, "step": 1470 }, { "epoch": 2.6, "grad_norm": 0.009017308242619038, "learning_rate": 2.1487422248628457e-05, "loss": 0.0004, "step": 1471 }, { "epoch": 2.6, "grad_norm": 0.001593849272467196, "learning_rate": 2.1298764253753044e-05, "loss": 0.0001, "step": 1472 }, { "epoch": 2.61, "grad_norm": 0.023191401734948158, "learning_rate": 2.1110901261059245e-05, "loss": 0.0006, "step": 1473 }, { "epoch": 2.61, "grad_norm": 0.03758575767278671, "learning_rate": 2.0923833923588874e-05, "loss": 0.0011, "step": 1474 }, { "epoch": 2.61, "grad_norm": 0.3719123303890228, "learning_rate": 2.0737562891618074e-05, "loss": 0.0061, "step": 1475 }, { "epoch": 2.61, "grad_norm": 0.004338722676038742, "learning_rate": 2.0552088812654885e-05, "loss": 0.0002, "step": 1476 }, { "epoch": 2.61, "grad_norm": 0.004211151506751776, "learning_rate": 2.0367412331436796e-05, "loss": 0.0002, "step": 1477 }, { "epoch": 2.61, "grad_norm": 0.008191877976059914, "learning_rate": 2.018353408992901e-05, "loss": 0.0003, "step": 1478 }, { "epoch": 2.62, "grad_norm": 0.4456917941570282, "learning_rate": 2.0000454727321703e-05, "loss": 0.0153, "step": 1479 }, { "epoch": 2.62, "grad_norm": 0.03597655147314072, "learning_rate": 1.9818174880028213e-05, "loss": 0.0013, "step": 1480 }, { "epoch": 2.62, "grad_norm": 0.0015153115382418036, "learning_rate": 1.963669518168243e-05, "loss": 0.0001, "step": 1481 }, { "epoch": 2.62, "grad_norm": 0.008952159434556961, "learning_rate": 1.9456016263136923e-05, "loss": 0.0003, "step": 1482 }, { "epoch": 2.62, "grad_norm": 0.00541423074901104, "learning_rate": 1.927613875246059e-05, "loss": 0.0002, "step": 1483 }, { "epoch": 2.63, "grad_norm": 0.09652198851108551, "learning_rate": 1.909706327493657e-05, "loss": 0.0016, "step": 1484 }, { "epoch": 2.63, "grad_norm": 0.22741255164146423, "learning_rate": 1.8918790453059914e-05, "loss": 0.0031, "step": 1485 }, { "epoch": 2.63, "grad_norm": 0.040858399122953415, "learning_rate": 1.8741320906535546e-05, "loss": 0.0007, "step": 1486 }, { "epoch": 2.63, "grad_norm": 0.053088657557964325, "learning_rate": 1.856465525227602e-05, "loss": 0.0019, "step": 1487 }, { "epoch": 2.63, "grad_norm": 0.008057578466832638, "learning_rate": 1.8388794104399558e-05, "loss": 0.0003, "step": 1488 }, { "epoch": 2.63, "grad_norm": 0.009569887071847916, "learning_rate": 1.8213738074227742e-05, "loss": 0.0004, "step": 1489 }, { "epoch": 2.64, "grad_norm": 0.00848764181137085, "learning_rate": 1.803948777028336e-05, "loss": 0.0003, "step": 1490 }, { "epoch": 2.64, "grad_norm": 0.6634844541549683, "learning_rate": 1.7866043798288433e-05, "loss": 0.0438, "step": 1491 }, { "epoch": 2.64, "grad_norm": 0.15894575417041779, "learning_rate": 1.7693406761162016e-05, "loss": 0.0055, "step": 1492 }, { "epoch": 2.64, "grad_norm": 0.013144123367965221, "learning_rate": 1.752157725901815e-05, "loss": 0.0005, "step": 1493 }, { "epoch": 2.64, "grad_norm": 0.018960585817694664, "learning_rate": 1.735055588916379e-05, "loss": 0.0003, "step": 1494 }, { "epoch": 2.64, "grad_norm": 0.016689004376530647, "learning_rate": 1.718034324609663e-05, "loss": 0.0006, "step": 1495 }, { "epoch": 2.65, "grad_norm": 0.006429940462112427, "learning_rate": 1.701093992150307e-05, "loss": 0.0003, "step": 1496 }, { "epoch": 2.65, "grad_norm": 0.11746788769960403, "learning_rate": 1.684234650425631e-05, "loss": 0.0032, "step": 1497 }, { "epoch": 2.65, "grad_norm": 0.004738051909953356, "learning_rate": 1.6674563580414053e-05, "loss": 0.0002, "step": 1498 }, { "epoch": 2.65, "grad_norm": 0.004201785195618868, "learning_rate": 1.65075917332167e-05, "loss": 0.0002, "step": 1499 }, { "epoch": 2.65, "grad_norm": 0.07351760566234589, "learning_rate": 1.6341431543085207e-05, "loss": 0.0017, "step": 1500 }, { "epoch": 2.66, "grad_norm": 0.03148205578327179, "learning_rate": 1.6176083587618935e-05, "loss": 0.0011, "step": 1501 }, { "epoch": 2.66, "grad_norm": 0.07184291630983353, "learning_rate": 1.6011548441594e-05, "loss": 0.0018, "step": 1502 }, { "epoch": 2.66, "grad_norm": 0.01941969059407711, "learning_rate": 1.5847826676960914e-05, "loss": 0.0007, "step": 1503 }, { "epoch": 2.66, "grad_norm": 0.004026324022561312, "learning_rate": 1.5684918862842846e-05, "loss": 0.0001, "step": 1504 }, { "epoch": 2.66, "grad_norm": 0.002735046437010169, "learning_rate": 1.5522825565533445e-05, "loss": 0.0002, "step": 1505 }, { "epoch": 2.66, "grad_norm": 0.09817846864461899, "learning_rate": 1.5361547348495013e-05, "loss": 0.0016, "step": 1506 }, { "epoch": 2.67, "grad_norm": 0.022421833127737045, "learning_rate": 1.5201084772356544e-05, "loss": 0.0004, "step": 1507 }, { "epoch": 2.67, "grad_norm": 0.1519351303577423, "learning_rate": 1.5041438394911622e-05, "loss": 0.0019, "step": 1508 }, { "epoch": 2.67, "grad_norm": 0.1200493648648262, "learning_rate": 1.4882608771116719e-05, "loss": 0.002, "step": 1509 }, { "epoch": 2.67, "grad_norm": 0.002202227944508195, "learning_rate": 1.4724596453089101e-05, "loss": 0.0001, "step": 1510 }, { "epoch": 2.67, "grad_norm": 0.030175110325217247, "learning_rate": 1.4567401990104867e-05, "loss": 0.0009, "step": 1511 }, { "epoch": 2.67, "grad_norm": 0.03393395245075226, "learning_rate": 1.441102592859725e-05, "loss": 0.0014, "step": 1512 }, { "epoch": 2.68, "grad_norm": 0.7548046112060547, "learning_rate": 1.4255468812154477e-05, "loss": 0.0468, "step": 1513 }, { "epoch": 2.68, "grad_norm": 0.015761546790599823, "learning_rate": 1.4100731181518056e-05, "loss": 0.0006, "step": 1514 }, { "epoch": 2.68, "grad_norm": 0.014691539108753204, "learning_rate": 1.3946813574580858e-05, "loss": 0.0006, "step": 1515 }, { "epoch": 2.68, "grad_norm": 0.06156891956925392, "learning_rate": 1.3793716526385058e-05, "loss": 0.0015, "step": 1516 }, { "epoch": 2.68, "grad_norm": 0.011981514282524586, "learning_rate": 1.36414405691207e-05, "loss": 0.0002, "step": 1517 }, { "epoch": 2.69, "grad_norm": 0.04529397189617157, "learning_rate": 1.3489986232123302e-05, "loss": 0.0017, "step": 1518 }, { "epoch": 2.69, "grad_norm": 0.0034320768900215626, "learning_rate": 1.333935404187253e-05, "loss": 0.0002, "step": 1519 }, { "epoch": 2.69, "grad_norm": 0.5339131951332092, "learning_rate": 1.3189544521990032e-05, "loss": 0.031, "step": 1520 }, { "epoch": 2.69, "grad_norm": 0.21039195358753204, "learning_rate": 1.3040558193237657e-05, "loss": 0.002, "step": 1521 }, { "epoch": 2.69, "grad_norm": 0.0334392786026001, "learning_rate": 1.2892395573515819e-05, "loss": 0.0014, "step": 1522 }, { "epoch": 2.69, "grad_norm": 0.009169626981019974, "learning_rate": 1.2745057177861647e-05, "loss": 0.0003, "step": 1523 }, { "epoch": 2.7, "grad_norm": 0.006436166353523731, "learning_rate": 1.2598543518446887e-05, "loss": 0.0004, "step": 1524 }, { "epoch": 2.7, "grad_norm": 0.004530397243797779, "learning_rate": 1.245285510457661e-05, "loss": 0.0002, "step": 1525 }, { "epoch": 2.7, "grad_norm": 0.013779646717011929, "learning_rate": 1.2307992442687072e-05, "loss": 0.0005, "step": 1526 }, { "epoch": 2.7, "grad_norm": 0.0018188911490142345, "learning_rate": 1.2163956036344153e-05, "loss": 0.0001, "step": 1527 }, { "epoch": 2.7, "grad_norm": 0.15664638578891754, "learning_rate": 1.2020746386241565e-05, "loss": 0.0029, "step": 1528 }, { "epoch": 2.7, "grad_norm": 0.017927415668964386, "learning_rate": 1.1878363990198871e-05, "loss": 0.0004, "step": 1529 }, { "epoch": 2.71, "grad_norm": 0.17458246648311615, "learning_rate": 1.1736809343160237e-05, "loss": 0.0035, "step": 1530 }, { "epoch": 2.71, "grad_norm": 0.016326846554875374, "learning_rate": 1.1596082937192276e-05, "loss": 0.0006, "step": 1531 }, { "epoch": 2.71, "grad_norm": 0.75627201795578, "learning_rate": 1.1456185261482565e-05, "loss": 0.0268, "step": 1532 }, { "epoch": 2.71, "grad_norm": 0.003338505746796727, "learning_rate": 1.1317116802337906e-05, "loss": 0.0002, "step": 1533 }, { "epoch": 2.71, "grad_norm": 0.003949652425944805, "learning_rate": 1.1178878043182462e-05, "loss": 0.0002, "step": 1534 }, { "epoch": 2.72, "grad_norm": 0.005014631897211075, "learning_rate": 1.1041469464556419e-05, "loss": 0.0003, "step": 1535 }, { "epoch": 2.72, "grad_norm": 0.09127765148878098, "learning_rate": 1.090489154411406e-05, "loss": 0.0022, "step": 1536 }, { "epoch": 2.72, "grad_norm": 0.002308650640770793, "learning_rate": 1.0769144756622106e-05, "loss": 0.0001, "step": 1537 }, { "epoch": 2.72, "grad_norm": 0.01227644830942154, "learning_rate": 1.0634229573958155e-05, "loss": 0.0005, "step": 1538 }, { "epoch": 2.72, "grad_norm": 0.07127789407968521, "learning_rate": 1.0500146465108995e-05, "loss": 0.0016, "step": 1539 }, { "epoch": 2.72, "grad_norm": 0.3965778648853302, "learning_rate": 1.0366895896169098e-05, "loss": 0.0119, "step": 1540 }, { "epoch": 2.73, "grad_norm": 0.006936135236173868, "learning_rate": 1.0234478330338775e-05, "loss": 0.0003, "step": 1541 }, { "epoch": 2.73, "grad_norm": 0.1017901748418808, "learning_rate": 1.0102894227922737e-05, "loss": 0.0019, "step": 1542 }, { "epoch": 2.73, "grad_norm": 0.004074485041201115, "learning_rate": 9.972144046328429e-06, "loss": 0.0002, "step": 1543 }, { "epoch": 2.73, "grad_norm": 0.5157943367958069, "learning_rate": 9.842228240064421e-06, "loss": 0.0767, "step": 1544 }, { "epoch": 2.73, "grad_norm": 0.35279086232185364, "learning_rate": 9.713147260738936e-06, "loss": 0.0063, "step": 1545 }, { "epoch": 2.74, "grad_norm": 0.005003898870199919, "learning_rate": 9.584901557058156e-06, "loss": 0.0002, "step": 1546 }, { "epoch": 2.74, "grad_norm": 0.001333480584435165, "learning_rate": 9.457491574824757e-06, "loss": 0.0001, "step": 1547 }, { "epoch": 2.74, "grad_norm": 0.02091939002275467, "learning_rate": 9.330917756936174e-06, "loss": 0.0009, "step": 1548 }, { "epoch": 2.74, "grad_norm": 0.30123287439346313, "learning_rate": 9.2051805433834e-06, "loss": 0.0046, "step": 1549 }, { "epoch": 2.74, "grad_norm": 0.17032013833522797, "learning_rate": 9.080280371249112e-06, "loss": 0.0036, "step": 1550 }, { "epoch": 2.74, "grad_norm": 0.4087766706943512, "learning_rate": 8.956217674706363e-06, "loss": 0.0049, "step": 1551 }, { "epoch": 2.75, "grad_norm": 0.09622213989496231, "learning_rate": 8.832992885016988e-06, "loss": 0.0024, "step": 1552 }, { "epoch": 2.75, "grad_norm": 0.016199596226215363, "learning_rate": 8.710606430530066e-06, "loss": 0.0005, "step": 1553 }, { "epoch": 2.75, "grad_norm": 0.0021451227366924286, "learning_rate": 8.589058736680643e-06, "loss": 0.0001, "step": 1554 }, { "epoch": 2.75, "grad_norm": 0.005553426221013069, "learning_rate": 8.46835022598791e-06, "loss": 0.0002, "step": 1555 }, { "epoch": 2.75, "grad_norm": 0.5673149824142456, "learning_rate": 8.348481318054075e-06, "loss": 0.0614, "step": 1556 }, { "epoch": 2.75, "grad_norm": 0.014100473374128342, "learning_rate": 8.229452429562661e-06, "loss": 0.0004, "step": 1557 }, { "epoch": 2.76, "grad_norm": 0.026370083913207054, "learning_rate": 8.111263974277166e-06, "loss": 0.0006, "step": 1558 }, { "epoch": 2.76, "grad_norm": 0.00458921492099762, "learning_rate": 7.993916363039672e-06, "loss": 0.0002, "step": 1559 }, { "epoch": 2.76, "grad_norm": 0.0019055847078561783, "learning_rate": 7.877410003769236e-06, "loss": 0.0001, "step": 1560 }, { "epoch": 2.76, "grad_norm": 0.010362006723880768, "learning_rate": 7.761745301460676e-06, "loss": 0.0004, "step": 1561 }, { "epoch": 2.76, "grad_norm": 0.06622084230184555, "learning_rate": 7.646922658183092e-06, "loss": 0.0016, "step": 1562 }, { "epoch": 2.76, "eval_loss": 0.12071493268013, "eval_runtime": 14.7006, "eval_samples_per_second": 32.448, "eval_steps_per_second": 8.163, "step": 1562 }, { "epoch": 2.77, "grad_norm": 0.21637581288814545, "learning_rate": 7.532942473078341e-06, "loss": 0.0077, "step": 1563 }, { "epoch": 2.77, "grad_norm": 0.2466088980436325, "learning_rate": 7.419805142359875e-06, "loss": 0.005, "step": 1564 }, { "epoch": 2.77, "grad_norm": 0.1368948519229889, "learning_rate": 7.307511059311184e-06, "loss": 0.0024, "step": 1565 }, { "epoch": 2.77, "grad_norm": 0.4194111227989197, "learning_rate": 7.196060614284544e-06, "loss": 0.0097, "step": 1566 }, { "epoch": 2.77, "grad_norm": 0.0024943724274635315, "learning_rate": 7.085454194699553e-06, "loss": 0.0001, "step": 1567 }, { "epoch": 2.77, "grad_norm": 0.5519493222236633, "learning_rate": 6.975692185041848e-06, "loss": 0.0128, "step": 1568 }, { "epoch": 2.78, "grad_norm": 0.17635726928710938, "learning_rate": 6.866774966861833e-06, "loss": 0.0047, "step": 1569 }, { "epoch": 2.78, "grad_norm": 0.17873595654964447, "learning_rate": 6.758702918773202e-06, "loss": 0.0041, "step": 1570 }, { "epoch": 2.78, "grad_norm": 0.008024133741855621, "learning_rate": 6.651476416451696e-06, "loss": 0.0004, "step": 1571 }, { "epoch": 2.78, "grad_norm": 0.018708229064941406, "learning_rate": 6.545095832633907e-06, "loss": 0.0008, "step": 1572 }, { "epoch": 2.78, "grad_norm": 0.3967719078063965, "learning_rate": 6.439561537115751e-06, "loss": 0.0771, "step": 1573 }, { "epoch": 2.78, "grad_norm": 0.039222270250320435, "learning_rate": 6.334873896751414e-06, "loss": 0.001, "step": 1574 }, { "epoch": 2.79, "grad_norm": 0.013372889719903469, "learning_rate": 6.231033275451908e-06, "loss": 0.0007, "step": 1575 }, { "epoch": 2.79, "grad_norm": 0.008132797665894032, "learning_rate": 6.12804003418388e-06, "loss": 0.0003, "step": 1576 }, { "epoch": 2.79, "grad_norm": 0.15415289998054504, "learning_rate": 6.0258945309683565e-06, "loss": 0.0044, "step": 1577 }, { "epoch": 2.79, "grad_norm": 0.5614458918571472, "learning_rate": 5.9245971208795045e-06, "loss": 0.0128, "step": 1578 }, { "epoch": 2.79, "grad_norm": 0.0058565386570990086, "learning_rate": 5.824148156043374e-06, "loss": 0.0002, "step": 1579 }, { "epoch": 2.8, "grad_norm": 0.6066858172416687, "learning_rate": 5.724547985636652e-06, "loss": 0.0365, "step": 1580 }, { "epoch": 2.8, "grad_norm": 0.39940959215164185, "learning_rate": 5.625796955885526e-06, "loss": 0.0168, "step": 1581 }, { "epoch": 2.8, "grad_norm": 0.10061442852020264, "learning_rate": 5.527895410064459e-06, "loss": 0.0013, "step": 1582 }, { "epoch": 2.8, "grad_norm": 0.003030581632629037, "learning_rate": 5.430843688494836e-06, "loss": 0.0001, "step": 1583 }, { "epoch": 2.8, "grad_norm": 0.11822108179330826, "learning_rate": 5.3346421285440925e-06, "loss": 0.0033, "step": 1584 }, { "epoch": 2.8, "grad_norm": 0.004489241633564234, "learning_rate": 5.239291064624258e-06, "loss": 0.0003, "step": 1585 }, { "epoch": 2.81, "grad_norm": 1.0766096115112305, "learning_rate": 5.144790828190887e-06, "loss": 0.036, "step": 1586 }, { "epoch": 2.81, "grad_norm": 0.09894857555627823, "learning_rate": 5.051141747741989e-06, "loss": 0.0018, "step": 1587 }, { "epoch": 2.81, "grad_norm": 0.49432384967803955, "learning_rate": 4.958344148816824e-06, "loss": 0.0061, "step": 1588 }, { "epoch": 2.81, "grad_norm": 0.07527617365121841, "learning_rate": 4.8663983539946885e-06, "loss": 0.0017, "step": 1589 }, { "epoch": 2.81, "grad_norm": 0.0023846172261983156, "learning_rate": 4.775304682893944e-06, "loss": 0.0001, "step": 1590 }, { "epoch": 2.81, "grad_norm": 0.01218903437256813, "learning_rate": 4.685063452170735e-06, "loss": 0.0006, "step": 1591 }, { "epoch": 2.82, "grad_norm": 0.029260369017720222, "learning_rate": 4.595674975518133e-06, "loss": 0.0013, "step": 1592 }, { "epoch": 2.82, "grad_norm": 0.00425742520019412, "learning_rate": 4.507139563664802e-06, "loss": 0.0002, "step": 1593 }, { "epoch": 2.82, "grad_norm": 0.003876454196870327, "learning_rate": 4.419457524374032e-06, "loss": 0.0002, "step": 1594 }, { "epoch": 2.82, "grad_norm": 0.1555083692073822, "learning_rate": 4.332629162442675e-06, "loss": 0.0028, "step": 1595 }, { "epoch": 2.82, "grad_norm": 0.0016881643095985055, "learning_rate": 4.246654779699988e-06, "loss": 0.0001, "step": 1596 }, { "epoch": 2.83, "grad_norm": 0.00919495802372694, "learning_rate": 4.161534675006739e-06, "loss": 0.0005, "step": 1597 }, { "epoch": 2.83, "grad_norm": 0.008377696387469769, "learning_rate": 4.077269144254103e-06, "loss": 0.0003, "step": 1598 }, { "epoch": 2.83, "grad_norm": 1.2848001718521118, "learning_rate": 3.993858480362572e-06, "loss": 0.0519, "step": 1599 }, { "epoch": 2.83, "grad_norm": 0.07129717618227005, "learning_rate": 3.9113029732809615e-06, "loss": 0.0011, "step": 1600 }, { "epoch": 2.83, "grad_norm": 0.001333805383183062, "learning_rate": 3.8296029099854635e-06, "loss": 0.0001, "step": 1601 }, { "epoch": 2.83, "grad_norm": 0.017828090116381645, "learning_rate": 3.748758574478622e-06, "loss": 0.0008, "step": 1602 }, { "epoch": 2.84, "grad_norm": 0.010987207293510437, "learning_rate": 3.6687702477883332e-06, "loss": 0.0004, "step": 1603 }, { "epoch": 2.84, "grad_norm": 0.006206910125911236, "learning_rate": 3.5896382079668166e-06, "loss": 0.0003, "step": 1604 }, { "epoch": 2.84, "grad_norm": 0.001939537120051682, "learning_rate": 3.511362730089729e-06, "loss": 0.0001, "step": 1605 }, { "epoch": 2.84, "grad_norm": 0.2372162640094757, "learning_rate": 3.4339440862552194e-06, "loss": 0.0031, "step": 1606 }, { "epoch": 2.84, "grad_norm": 0.0658475011587143, "learning_rate": 3.3573825455829043e-06, "loss": 0.002, "step": 1607 }, { "epoch": 2.84, "grad_norm": 0.012416253797709942, "learning_rate": 3.2816783742129762e-06, "loss": 0.0005, "step": 1608 }, { "epoch": 2.85, "grad_norm": 0.001760126673616469, "learning_rate": 3.206831835305263e-06, "loss": 0.0001, "step": 1609 }, { "epoch": 2.85, "grad_norm": 0.1820244938135147, "learning_rate": 3.132843189038365e-06, "loss": 0.0035, "step": 1610 }, { "epoch": 2.85, "grad_norm": 0.2944984436035156, "learning_rate": 3.059712692608657e-06, "loss": 0.0086, "step": 1611 }, { "epoch": 2.85, "grad_norm": 0.013544095680117607, "learning_rate": 2.9874406002295128e-06, "loss": 0.0006, "step": 1612 }, { "epoch": 2.85, "grad_norm": 0.008425934240221977, "learning_rate": 2.9160271631303025e-06, "loss": 0.0003, "step": 1613 }, { "epoch": 2.86, "grad_norm": 0.0034127351827919483, "learning_rate": 2.84547262955559e-06, "loss": 0.0001, "step": 1614 }, { "epoch": 2.86, "grad_norm": 0.03862582519650459, "learning_rate": 2.775777244764216e-06, "loss": 0.0007, "step": 1615 }, { "epoch": 2.86, "grad_norm": 0.9643748998641968, "learning_rate": 2.7069412510285773e-06, "loss": 0.0381, "step": 1616 }, { "epoch": 2.86, "grad_norm": 0.027254153043031693, "learning_rate": 2.6389648876335716e-06, "loss": 0.0007, "step": 1617 }, { "epoch": 2.86, "grad_norm": 0.14486004412174225, "learning_rate": 2.571848390875986e-06, "loss": 0.0039, "step": 1618 }, { "epoch": 2.86, "grad_norm": 0.002894668374210596, "learning_rate": 2.5055919940635276e-06, "loss": 0.0001, "step": 1619 }, { "epoch": 2.87, "grad_norm": 0.8268805742263794, "learning_rate": 2.440195927514044e-06, "loss": 0.0447, "step": 1620 }, { "epoch": 2.87, "grad_norm": 0.17663922905921936, "learning_rate": 2.375660418554776e-06, "loss": 0.0076, "step": 1621 }, { "epoch": 2.87, "grad_norm": 0.12479265034198761, "learning_rate": 2.3119856915214677e-06, "loss": 0.002, "step": 1622 }, { "epoch": 2.87, "grad_norm": 0.14911742508411407, "learning_rate": 2.249171967757674e-06, "loss": 0.0046, "step": 1623 }, { "epoch": 2.87, "grad_norm": 1.0442122220993042, "learning_rate": 2.1872194656140377e-06, "loss": 0.0154, "step": 1624 }, { "epoch": 2.87, "grad_norm": 0.018565386533737183, "learning_rate": 2.126128400447347e-06, "loss": 0.0007, "step": 1625 }, { "epoch": 2.88, "grad_norm": 0.7466828227043152, "learning_rate": 2.0658989846199516e-06, "loss": 0.0113, "step": 1626 }, { "epoch": 2.88, "grad_norm": 0.034939322620630264, "learning_rate": 2.006531427499014e-06, "loss": 0.0008, "step": 1627 }, { "epoch": 2.88, "grad_norm": 0.028358131647109985, "learning_rate": 1.948025935455594e-06, "loss": 0.001, "step": 1628 }, { "epoch": 2.88, "grad_norm": 0.043177518993616104, "learning_rate": 1.8903827118642303e-06, "loss": 0.0008, "step": 1629 }, { "epoch": 2.88, "grad_norm": 0.03869365155696869, "learning_rate": 1.8336019571019712e-06, "loss": 0.0022, "step": 1630 }, { "epoch": 2.89, "grad_norm": 0.0050878459587693214, "learning_rate": 1.7776838685478748e-06, "loss": 0.0002, "step": 1631 }, { "epoch": 2.89, "grad_norm": 0.11395423114299774, "learning_rate": 1.7226286405821746e-06, "loss": 0.0024, "step": 1632 }, { "epoch": 2.89, "grad_norm": 0.5369199514389038, "learning_rate": 1.6684364645856431e-06, "loss": 0.0145, "step": 1633 }, { "epoch": 2.89, "grad_norm": 0.012085442431271076, "learning_rate": 1.6151075289390082e-06, "loss": 0.0003, "step": 1634 }, { "epoch": 2.89, "grad_norm": 0.15687036514282227, "learning_rate": 1.5626420190222034e-06, "loss": 0.0059, "step": 1635 }, { "epoch": 2.89, "grad_norm": 0.02339506521821022, "learning_rate": 1.5110401172137578e-06, "loss": 0.0004, "step": 1636 }, { "epoch": 2.9, "grad_norm": 0.15162977576255798, "learning_rate": 1.4603020028901292e-06, "loss": 0.0026, "step": 1637 }, { "epoch": 2.9, "grad_norm": 0.0018832300556823611, "learning_rate": 1.4104278524251778e-06, "loss": 0.0001, "step": 1638 }, { "epoch": 2.9, "grad_norm": 0.030508514493703842, "learning_rate": 1.3614178391894438e-06, "loss": 0.0013, "step": 1639 }, { "epoch": 2.9, "grad_norm": 0.00550305750221014, "learning_rate": 1.3132721335495645e-06, "loss": 0.0002, "step": 1640 }, { "epoch": 2.9, "grad_norm": 0.01754370890557766, "learning_rate": 1.265990902867803e-06, "loss": 0.0005, "step": 1641 }, { "epoch": 2.9, "grad_norm": 0.00891774520277977, "learning_rate": 1.2195743115012148e-06, "loss": 0.0002, "step": 1642 }, { "epoch": 2.91, "grad_norm": 0.02182842791080475, "learning_rate": 1.1740225208013712e-06, "loss": 0.0007, "step": 1643 }, { "epoch": 2.91, "grad_norm": 0.0056608193553984165, "learning_rate": 1.1293356891136086e-06, "loss": 0.0002, "step": 1644 }, { "epoch": 2.91, "grad_norm": 0.004720357712358236, "learning_rate": 1.0855139717765028e-06, "loss": 0.0003, "step": 1645 }, { "epoch": 2.91, "grad_norm": 0.005244838539510965, "learning_rate": 1.0425575211213956e-06, "loss": 0.0003, "step": 1646 }, { "epoch": 2.91, "grad_norm": 0.0327480174601078, "learning_rate": 1.0004664864717573e-06, "loss": 0.001, "step": 1647 }, { "epoch": 2.92, "grad_norm": 0.8325297832489014, "learning_rate": 9.592410141427977e-07, "loss": 0.0526, "step": 1648 }, { "epoch": 2.92, "grad_norm": 0.0062986682169139385, "learning_rate": 9.188812474408837e-07, "loss": 0.0003, "step": 1649 }, { "epoch": 2.92, "grad_norm": 0.01729346066713333, "learning_rate": 8.793873266630393e-07, "loss": 0.0006, "step": 1650 }, { "epoch": 2.92, "grad_norm": 0.00606880709528923, "learning_rate": 8.407593890964461e-07, "loss": 0.0004, "step": 1651 }, { "epoch": 2.92, "grad_norm": 0.009202031418681145, "learning_rate": 8.029975690179992e-07, "loss": 0.0005, "step": 1652 }, { "epoch": 2.92, "grad_norm": 0.14149095118045807, "learning_rate": 7.661019976939187e-07, "loss": 0.0014, "step": 1653 }, { "epoch": 2.93, "grad_norm": 0.06853083521127701, "learning_rate": 7.300728033790282e-07, "loss": 0.0026, "step": 1654 }, { "epoch": 2.93, "grad_norm": 0.5628806352615356, "learning_rate": 6.949101113166712e-07, "loss": 0.0322, "step": 1655 }, { "epoch": 2.93, "grad_norm": 0.004697425756603479, "learning_rate": 6.606140437379616e-07, "loss": 0.0002, "step": 1656 }, { "epoch": 2.93, "grad_norm": 0.026971176266670227, "learning_rate": 6.271847198615343e-07, "loss": 0.0008, "step": 1657 }, { "epoch": 2.93, "grad_norm": 0.416694313287735, "learning_rate": 5.946222558931014e-07, "loss": 0.0072, "step": 1658 }, { "epoch": 2.93, "grad_norm": 0.16609083116054535, "learning_rate": 5.629267650249792e-07, "loss": 0.0056, "step": 1659 }, { "epoch": 2.94, "grad_norm": 0.020159004256129265, "learning_rate": 5.320983574358118e-07, "loss": 0.0006, "step": 1660 }, { "epoch": 2.94, "grad_norm": 0.020069165155291557, "learning_rate": 5.02137140290071e-07, "loss": 0.0007, "step": 1661 }, { "epoch": 2.94, "grad_norm": 0.04708680510520935, "learning_rate": 4.7304321773775085e-07, "loss": 0.0024, "step": 1662 }, { "epoch": 2.94, "grad_norm": 0.03632630407810211, "learning_rate": 4.4481669091400725e-07, "loss": 0.0015, "step": 1663 }, { "epoch": 2.94, "grad_norm": 0.006391770206391811, "learning_rate": 4.1745765793874126e-07, "loss": 0.0004, "step": 1664 }, { "epoch": 2.95, "grad_norm": 0.8153448700904846, "learning_rate": 3.90966213916405e-07, "loss": 0.0167, "step": 1665 }, { "epoch": 2.95, "grad_norm": 0.08031131327152252, "learning_rate": 3.6534245093544636e-07, "loss": 0.0036, "step": 1666 }, { "epoch": 2.95, "grad_norm": 0.015597201883792877, "learning_rate": 3.405864580681983e-07, "loss": 0.0006, "step": 1667 }, { "epoch": 2.95, "grad_norm": 0.024990934878587723, "learning_rate": 3.1669832137046216e-07, "loss": 0.0008, "step": 1668 }, { "epoch": 2.95, "grad_norm": 0.005498305428773165, "learning_rate": 2.9367812388123025e-07, "loss": 0.0002, "step": 1669 }, { "epoch": 2.95, "grad_norm": 0.19190239906311035, "learning_rate": 2.715259456224084e-07, "loss": 0.0051, "step": 1670 }, { "epoch": 2.96, "grad_norm": 0.008078343234956264, "learning_rate": 2.502418635985382e-07, "loss": 0.0004, "step": 1671 }, { "epoch": 2.96, "grad_norm": 0.02244127169251442, "learning_rate": 2.2982595179646404e-07, "loss": 0.0009, "step": 1672 }, { "epoch": 2.96, "grad_norm": 0.027221323922276497, "learning_rate": 2.1027828118519442e-07, "loss": 0.001, "step": 1673 }, { "epoch": 2.96, "grad_norm": 0.05436151847243309, "learning_rate": 1.91598919715541e-07, "loss": 0.0012, "step": 1674 }, { "epoch": 2.96, "grad_norm": 0.751646876335144, "learning_rate": 1.7378793232000755e-07, "loss": 0.0248, "step": 1675 }, { "epoch": 2.97, "grad_norm": 0.04363579303026199, "learning_rate": 1.5684538091240153e-07, "loss": 0.0015, "step": 1676 }, { "epoch": 2.97, "grad_norm": 0.02322092093527317, "learning_rate": 1.407713243877784e-07, "loss": 0.001, "step": 1677 }, { "epoch": 2.97, "grad_norm": 0.023351455107331276, "learning_rate": 1.2556581862213646e-07, "loss": 0.001, "step": 1678 }, { "epoch": 2.97, "grad_norm": 0.02784132957458496, "learning_rate": 1.1122891647222244e-07, "loss": 0.0007, "step": 1679 }, { "epoch": 2.97, "grad_norm": 0.018551329150795937, "learning_rate": 9.776066777542058e-08, "loss": 0.0008, "step": 1680 }, { "epoch": 2.97, "grad_norm": 0.0073452303186059, "learning_rate": 8.516111934955828e-08, "loss": 0.0004, "step": 1681 }, { "epoch": 2.98, "grad_norm": 0.022210106253623962, "learning_rate": 7.343031499262853e-08, "loss": 0.0005, "step": 1682 }, { "epoch": 2.98, "grad_norm": 0.36818450689315796, "learning_rate": 6.256829548284549e-08, "loss": 0.0045, "step": 1683 }, { "epoch": 2.98, "grad_norm": 0.012365012429654598, "learning_rate": 5.257509857828358e-08, "loss": 0.0005, "step": 1684 }, { "epoch": 2.98, "grad_norm": 0.007756817154586315, "learning_rate": 4.345075901693307e-08, "loss": 0.0003, "step": 1685 }, { "epoch": 2.98, "grad_norm": 0.31708094477653503, "learning_rate": 3.51953085164225e-08, "loss": 0.01, "step": 1686 }, { "epoch": 2.98, "grad_norm": 0.026957141235470772, "learning_rate": 2.7808775774074147e-08, "loss": 0.0005, "step": 1687 }, { "epoch": 2.99, "grad_norm": 0.7157958149909973, "learning_rate": 2.1291186466626535e-08, "loss": 0.0264, "step": 1688 }, { "epoch": 2.99, "grad_norm": 0.003837962169200182, "learning_rate": 1.5642563250289897e-08, "loss": 0.0002, "step": 1689 }, { "epoch": 2.99, "grad_norm": 0.22489942610263824, "learning_rate": 1.0862925760551923e-08, "loss": 0.0026, "step": 1690 }, { "epoch": 2.99, "grad_norm": 0.4433555603027344, "learning_rate": 6.952290612205481e-09, "loss": 0.037, "step": 1691 }, { "epoch": 2.99, "grad_norm": 0.0032361983321607113, "learning_rate": 3.910671399265376e-09, "loss": 0.0002, "step": 1692 }, { "epoch": 3.0, "grad_norm": 0.13408610224723816, "learning_rate": 1.7380786948850702e-09, "loss": 0.0032, "step": 1693 }, { "epoch": 3.0, "grad_norm": 0.017384354025125504, "learning_rate": 4.345200513289349e-10, "loss": 0.0004, "step": 1694 }, { "epoch": 3.0, "grad_norm": 0.0010622803820297122, "learning_rate": 0.0, "loss": 0.0001, "step": 1695 } ], "logging_steps": 1, "max_steps": 1695, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 565, "total_flos": 1.549439947809751e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }