[ { "loss": 29.9707, "grad_norm": 0.4777052700519562, "learning_rate": 0.0009991248796709547, "epoch": 0.0 }, { "loss": 22.6857, "grad_norm": 0.7528864741325378, "learning_rate": 0.0009982497593419095, "epoch": 0.01 }, { "loss": 23.3032, "grad_norm": 0.2558889389038086, "learning_rate": 0.0009973746390128642, "epoch": 0.01 }, { "loss": 22.7608, "grad_norm": 0.16549238562583923, "learning_rate": 0.000996499518683819, "epoch": 0.01 }, { "loss": 21.8524, "grad_norm": 0.969261109828949, "learning_rate": 0.0009956243983547737, "epoch": 0.01 }, { "loss": 20.1216, "grad_norm": 1.4401915073394775, "learning_rate": 0.0009947492780257286, "epoch": 0.02 }, { "loss": 18.8698, "grad_norm": 1.2519457340240479, "learning_rate": 0.0009938741576966832, "epoch": 0.02 }, { "loss": 17.8051, "grad_norm": 0.6829971075057983, "learning_rate": 0.0009929990373676381, "epoch": 0.02 }, { "loss": 16.7436, "grad_norm": 0.8918408155441284, "learning_rate": 0.0009921239170385928, "epoch": 0.02 }, { "loss": 16.3997, "grad_norm": 1.1997641324996948, "learning_rate": 0.0009912487967095476, "epoch": 0.03 }, { "loss": 16.6555, "grad_norm": 1.0908863544464111, "learning_rate": 0.0009903736763805023, "epoch": 0.03 }, { "loss": 16.168, "grad_norm": 0.8117638230323792, "learning_rate": 0.0009894985560514572, "epoch": 0.03 }, { "loss": 16.4056, "grad_norm": 0.7367461919784546, "learning_rate": 0.0009886234357224118, "epoch": 0.03 }, { "loss": 16.0576, "grad_norm": 0.827192485332489, "learning_rate": 0.0009877483153933667, "epoch": 0.04 }, { "loss": 14.8864, "grad_norm": 0.6644204258918762, "learning_rate": 0.0009868731950643213, "epoch": 0.04 }, { "loss": 15.1702, "grad_norm": 0.6314308047294617, "learning_rate": 0.0009859980747352762, "epoch": 0.04 }, { "loss": 14.8873, "grad_norm": 0.4996398091316223, "learning_rate": 0.0009851229544062309, "epoch": 0.04 }, { "loss": 14.704, "grad_norm": 0.6396967768669128, "learning_rate": 0.0009842478340771857, "epoch": 0.05 }, { "loss": 14.8636, "grad_norm": 0.5319499373435974, "learning_rate": 0.0009833727137481404, "epoch": 0.05 }, { "loss": 14.7236, "grad_norm": 1.1328645944595337, "learning_rate": 0.0009824975934190953, "epoch": 0.05 }, { "loss": 14.2966, "grad_norm": 0.5435690879821777, "learning_rate": 0.00098162247309005, "epoch": 0.06 }, { "loss": 14.8865, "grad_norm": 0.5260070562362671, "learning_rate": 0.0009807473527610048, "epoch": 0.06 }, { "loss": 14.1635, "grad_norm": 0.5490550994873047, "learning_rate": 0.0009798722324319594, "epoch": 0.06 }, { "loss": 14.1756, "grad_norm": 0.5580148696899414, "learning_rate": 0.0009789971121029143, "epoch": 0.06 }, { "loss": 13.3026, "grad_norm": 0.4862927198410034, "learning_rate": 0.000978121991773869, "epoch": 0.07 }, { "loss": 13.9938, "grad_norm": 0.4365651607513428, "learning_rate": 0.0009772468714448236, "epoch": 0.07 }, { "loss": 13.7628, "grad_norm": 0.5206578373908997, "learning_rate": 0.0009763717511157785, "epoch": 0.07 }, { "loss": 13.2932, "grad_norm": 0.4493275582790375, "learning_rate": 0.0009754966307867332, "epoch": 0.07 }, { "loss": 13.4192, "grad_norm": 0.5717960596084595, "learning_rate": 0.000974621510457688, "epoch": 0.08 }, { "loss": 13.2883, "grad_norm": 0.48513928055763245, "learning_rate": 0.0009737463901286428, "epoch": 0.08 }, { "loss": 13.8283, "grad_norm": 0.7734763622283936, "learning_rate": 0.0009728712697995975, "epoch": 0.08 }, { "loss": 12.4766, "grad_norm": 0.45278435945510864, "learning_rate": 0.0009719961494705523, "epoch": 0.08 }, { "loss": 13.41, "grad_norm": 0.5911663174629211, "learning_rate": 0.000971121029141507, "epoch": 0.09 }, { "loss": 12.4475, "grad_norm": 0.5840547680854797, "learning_rate": 0.0009702459088124618, "epoch": 0.09 }, { "loss": 13.9417, "grad_norm": 0.7008219957351685, "learning_rate": 0.0009693707884834166, "epoch": 0.09 }, { "loss": 12.9164, "grad_norm": 0.7021568417549133, "learning_rate": 0.0009684956681543713, "epoch": 0.09 }, { "loss": 12.9824, "grad_norm": 0.5466001629829407, "learning_rate": 0.0009676205478253261, "epoch": 0.1 }, { "loss": 12.5413, "grad_norm": 0.6215840578079224, "learning_rate": 0.0009667454274962808, "epoch": 0.1 }, { "loss": 12.7753, "grad_norm": 1.5948784351348877, "learning_rate": 0.0009658703071672355, "epoch": 0.1 }, { "loss": 12.5837, "grad_norm": 1.1063404083251953, "learning_rate": 0.0009649951868381903, "epoch": 0.1 }, { "loss": 12.7079, "grad_norm": 0.7521733045578003, "learning_rate": 0.000964120066509145, "epoch": 0.11 }, { "loss": 12.1584, "grad_norm": 0.7596040964126587, "learning_rate": 0.0009632449461800998, "epoch": 0.11 }, { "loss": 12.6058, "grad_norm": 1.1221098899841309, "learning_rate": 0.0009623698258510546, "epoch": 0.11 }, { "loss": 12.7003, "grad_norm": 0.795098602771759, "learning_rate": 0.0009614947055220093, "epoch": 0.12 }, { "loss": 12.4519, "grad_norm": 0.481406569480896, "learning_rate": 0.0009606195851929641, "epoch": 0.12 }, { "loss": 12.8483, "grad_norm": 0.6707068681716919, "learning_rate": 0.0009597444648639187, "epoch": 0.12 }, { "loss": 12.593, "grad_norm": 0.6381434798240662, "learning_rate": 0.0009588693445348735, "epoch": 0.12 }, { "loss": 12.1654, "grad_norm": 0.7791229486465454, "learning_rate": 0.0009579942242058283, "epoch": 0.13 }, { "loss": 11.8089, "grad_norm": 0.8445360660552979, "learning_rate": 0.000957119103876783, "epoch": 0.13 }, { "loss": 12.232, "grad_norm": 0.6427455544471741, "learning_rate": 0.0009562439835477378, "epoch": 0.13 }, { "loss": 12.9625, "grad_norm": 0.5700855255126953, "learning_rate": 0.0009553688632186925, "epoch": 0.13 }, { "loss": 12.2628, "grad_norm": 0.8731588125228882, "learning_rate": 0.0009544937428896473, "epoch": 0.14 }, { "loss": 12.0435, "grad_norm": 0.869883120059967, "learning_rate": 0.0009536186225606021, "epoch": 0.14 }, { "loss": 12.2867, "grad_norm": 0.8802808523178101, "learning_rate": 0.0009527435022315568, "epoch": 0.14 }, { "loss": 11.9397, "grad_norm": 1.0076773166656494, "learning_rate": 0.0009518683819025116, "epoch": 0.14 }, { "loss": 11.8392, "grad_norm": 0.5855250954627991, "learning_rate": 0.0009509932615734664, "epoch": 0.15 }, { "loss": 11.3847, "grad_norm": 0.5606763958930969, "learning_rate": 0.0009501181412444211, "epoch": 0.15 }, { "loss": 12.2154, "grad_norm": 1.1014057397842407, "learning_rate": 0.0009492430209153759, "epoch": 0.15 }, { "loss": 11.6247, "grad_norm": 0.6524838805198669, "learning_rate": 0.0009483679005863306, "epoch": 0.15 }, { "loss": 11.5115, "grad_norm": 1.0140221118927002, "learning_rate": 0.0009474927802572854, "epoch": 0.16 }, { "loss": 12.1707, "grad_norm": 1.4689868688583374, "learning_rate": 0.0009466176599282402, "epoch": 0.16 }, { "loss": 11.6165, "grad_norm": 0.8136260509490967, "learning_rate": 0.0009457425395991948, "epoch": 0.16 }, { "loss": 11.8841, "grad_norm": 2.0376949310302734, "learning_rate": 0.0009448674192701496, "epoch": 0.17 }, { "loss": 11.2108, "grad_norm": 1.1647133827209473, "learning_rate": 0.0009439922989411043, "epoch": 0.17 }, { "loss": 11.9281, "grad_norm": 0.8479063510894775, "learning_rate": 0.0009431171786120591, "epoch": 0.17 }, { "loss": 11.0593, "grad_norm": 0.8340569138526917, "learning_rate": 0.0009422420582830139, "epoch": 0.17 }, { "loss": 11.5591, "grad_norm": 0.9813485145568848, "learning_rate": 0.0009413669379539686, "epoch": 0.18 }, { "loss": 11.1773, "grad_norm": 0.9088229537010193, "learning_rate": 0.0009404918176249234, "epoch": 0.18 }, { "loss": 11.6913, "grad_norm": 0.860917866230011, "learning_rate": 0.0009396166972958782, "epoch": 0.18 }, { "loss": 12.3707, "grad_norm": 0.7795988321304321, "learning_rate": 0.0009387415769668329, "epoch": 0.18 }, { "loss": 11.6669, "grad_norm": 0.914884626865387, "learning_rate": 0.0009378664566377877, "epoch": 0.19 }, { "loss": 11.6139, "grad_norm": 1.7863789796829224, "learning_rate": 0.0009369913363087424, "epoch": 0.19 }, { "loss": 11.1885, "grad_norm": 0.7225568294525146, "learning_rate": 0.0009361162159796972, "epoch": 0.19 }, { "loss": 11.7488, "grad_norm": 0.9028294682502747, "learning_rate": 0.000935241095650652, "epoch": 0.19 }, { "loss": 11.227, "grad_norm": 1.0842101573944092, "learning_rate": 0.0009343659753216067, "epoch": 0.2 }, { "loss": 11.4022, "grad_norm": 0.7042496800422668, "learning_rate": 0.0009334908549925615, "epoch": 0.2 }, { "loss": 11.006, "grad_norm": 0.8355586528778076, "learning_rate": 0.0009326157346635162, "epoch": 0.2 }, { "loss": 11.0561, "grad_norm": 0.9001519083976746, "learning_rate": 0.000931740614334471, "epoch": 0.2 }, { "loss": 11.357, "grad_norm": 0.8695396184921265, "learning_rate": 0.0009308654940054258, "epoch": 0.21 }, { "loss": 10.7003, "grad_norm": 0.8076105117797852, "learning_rate": 0.0009299903736763805, "epoch": 0.21 }, { "loss": 11.2661, "grad_norm": 0.9677106142044067, "learning_rate": 0.0009291152533473353, "epoch": 0.21 }, { "loss": 10.8957, "grad_norm": 0.8753145337104797, "learning_rate": 0.0009282401330182901, "epoch": 0.22 }, { "loss": 11.2854, "grad_norm": 0.7343422770500183, "learning_rate": 0.0009273650126892448, "epoch": 0.22 }, { "loss": 10.8205, "grad_norm": 0.9795741438865662, "learning_rate": 0.0009264898923601996, "epoch": 0.22 }, { "loss": 10.6805, "grad_norm": 0.9723809957504272, "learning_rate": 0.0009256147720311543, "epoch": 0.22 }, { "loss": 10.7639, "grad_norm": 0.6675435900688171, "learning_rate": 0.0009247396517021091, "epoch": 0.23 }, { "loss": 11.1119, "grad_norm": 0.9673445224761963, "learning_rate": 0.0009238645313730638, "epoch": 0.23 }, { "loss": 11.293, "grad_norm": 0.9545767307281494, "learning_rate": 0.0009229894110440185, "epoch": 0.23 }, { "loss": 11.4529, "grad_norm": 0.8443020582199097, "learning_rate": 0.0009221142907149733, "epoch": 0.23 }, { "loss": 10.402, "grad_norm": 0.9980494976043701, "learning_rate": 0.000921239170385928, "epoch": 0.24 }, { "loss": 10.8417, "grad_norm": 1.2651828527450562, "learning_rate": 0.0009203640500568828, "epoch": 0.24 }, { "loss": 10.9627, "grad_norm": 0.7320075035095215, "learning_rate": 0.0009194889297278376, "epoch": 0.24 }, { "loss": 10.1427, "grad_norm": 1.5249311923980713, "learning_rate": 0.0009186138093987923, "epoch": 0.24 }, { "loss": 11.0647, "grad_norm": 0.8371347188949585, "learning_rate": 0.0009177386890697471, "epoch": 0.25 }, { "loss": 10.7984, "grad_norm": 1.0522745847702026, "learning_rate": 0.0009168635687407019, "epoch": 0.25 }, { "loss": 10.0289, "grad_norm": 0.9992939829826355, "learning_rate": 0.0009159884484116566, "epoch": 0.25 }, { "loss": 10.6594, "grad_norm": 1.6465744972229004, "learning_rate": 0.0009151133280826114, "epoch": 0.25 }, { "loss": 10.7898, "grad_norm": 0.8755474090576172, "learning_rate": 0.0009142382077535661, "epoch": 0.26 }, { "loss": 10.8566, "grad_norm": 0.9154648780822754, "learning_rate": 0.0009133630874245209, "epoch": 0.26 }, { "loss": 10.3388, "grad_norm": 0.9557958245277405, "learning_rate": 0.0009124879670954757, "epoch": 0.26 }, { "loss": 11.0761, "grad_norm": 0.9756875038146973, "learning_rate": 0.0009116128467664304, "epoch": 0.27 }, { "loss": 10.6927, "grad_norm": 0.9137876033782959, "learning_rate": 0.0009107377264373852, "epoch": 0.27 }, { "loss": 10.4956, "grad_norm": 1.2811295986175537, "learning_rate": 0.00090986260610834, "epoch": 0.27 }, { "loss": 11.13, "grad_norm": 1.574196696281433, "learning_rate": 0.0009090749978121991, "epoch": 0.27 }, { "loss": 10.4299, "grad_norm": 1.120239019393921, "learning_rate": 0.0009082873895160585, "epoch": 0.28 }, { "loss": 10.9432, "grad_norm": 4.42399263381958, "learning_rate": 0.0009074122691870133, "epoch": 0.28 }, { "loss": 10.6758, "grad_norm": 1.1292444467544556, "learning_rate": 0.000906537148857968, "epoch": 0.28 }, { "loss": 9.9808, "grad_norm": 1.36553156375885, "learning_rate": 0.0009056620285289227, "epoch": 0.28 }, { "loss": 10.4376, "grad_norm": 1.4920979738235474, "learning_rate": 0.0009047869081998775, "epoch": 0.29 }, { "loss": 11.5319, "grad_norm": 1.142583966255188, "learning_rate": 0.0009039117878708322, "epoch": 0.29 }, { "loss": 10.8741, "grad_norm": 1.7269898653030396, "learning_rate": 0.000903036667541787, "epoch": 0.29 }, { "loss": 10.6609, "grad_norm": 1.0620924234390259, "learning_rate": 0.0009021615472127418, "epoch": 0.29 }, { "loss": 10.8716, "grad_norm": 1.0225517749786377, "learning_rate": 0.0009012864268836965, "epoch": 0.3 }, { "loss": 10.8629, "grad_norm": 0.8201847672462463, "learning_rate": 0.0009004113065546513, "epoch": 0.3 }, { "loss": 10.2614, "grad_norm": 0.7885268926620483, "learning_rate": 0.000899536186225606, "epoch": 0.3 }, { "loss": 10.1758, "grad_norm": 0.8671897053718567, "learning_rate": 0.0008986610658965608, "epoch": 0.3 }, { "loss": 10.2796, "grad_norm": 0.8501631617546082, "learning_rate": 0.0008977859455675156, "epoch": 0.31 }, { "loss": 10.4376, "grad_norm": 1.3847661018371582, "learning_rate": 0.0008969108252384703, "epoch": 0.31 }, { "loss": 10.6258, "grad_norm": 1.1267868280410767, "learning_rate": 0.0008960357049094251, "epoch": 0.31 }, { "loss": 10.3214, "grad_norm": 0.9492388963699341, "learning_rate": 0.0008951605845803799, "epoch": 0.31 }, { "loss": 10.3126, "grad_norm": 2.884838819503784, "learning_rate": 0.0008942854642513346, "epoch": 0.32 }, { "loss": 9.8104, "grad_norm": 1.007505178451538, "learning_rate": 0.0008934103439222894, "epoch": 0.32 }, { "loss": 10.7341, "grad_norm": 0.9504636526107788, "learning_rate": 0.0008925352235932441, "epoch": 0.32 }, { "loss": 10.3923, "grad_norm": 1.1075007915496826, "learning_rate": 0.0008916601032641989, "epoch": 0.33 }, { "loss": 10.323, "grad_norm": 1.137343406677246, "learning_rate": 0.0008907849829351537, "epoch": 0.33 }, { "loss": 10.2794, "grad_norm": 0.797771155834198, "learning_rate": 0.0008899098626061084, "epoch": 0.33 }, { "loss": 10.6656, "grad_norm": 1.018343448638916, "learning_rate": 0.0008890347422770632, "epoch": 0.33 }, { "loss": 10.2778, "grad_norm": 1.0548039674758911, "learning_rate": 0.000888159621948018, "epoch": 0.34 }, { "loss": 10.1114, "grad_norm": 3.0174038410186768, "learning_rate": 0.0008872845016189727, "epoch": 0.34 }, { "loss": 10.8685, "grad_norm": 2.50591778755188, "learning_rate": 0.0008864093812899275, "epoch": 0.34 }, { "loss": 10.0677, "grad_norm": 1.2851207256317139, "learning_rate": 0.0008855342609608822, "epoch": 0.34 }, { "loss": 10.0311, "grad_norm": 0.7987344264984131, "learning_rate": 0.0008846591406318369, "epoch": 0.35 }, { "loss": 9.7713, "grad_norm": 1.114479899406433, "learning_rate": 0.0008837840203027917, "epoch": 0.35 }, { "loss": 9.9371, "grad_norm": 1.2233116626739502, "learning_rate": 0.0008829088999737464, "epoch": 0.35 }, { "loss": 10.5333, "grad_norm": 2.0412189960479736, "learning_rate": 0.0008820337796447012, "epoch": 0.35 }, { "loss": 10.2577, "grad_norm": 1.2635306119918823, "learning_rate": 0.0008811586593156559, "epoch": 0.36 }, { "loss": 9.8937, "grad_norm": 12.760872840881348, "learning_rate": 0.0008802835389866107, "epoch": 0.36 }, { "loss": 10.6092, "grad_norm": 1.3580334186553955, "learning_rate": 0.0008794084186575654, "epoch": 0.36 }, { "loss": 10.2467, "grad_norm": 1.250632643699646, "learning_rate": 0.0008785332983285201, "epoch": 0.36 }, { "loss": 10.5076, "grad_norm": 1.458349585533142, "learning_rate": 0.0008776581779994749, "epoch": 0.37 }, { "loss": 10.2769, "grad_norm": 1.9139622449874878, "learning_rate": 0.0008767830576704296, "epoch": 0.37 }, { "loss": 10.4452, "grad_norm": 1.2400761842727661, "learning_rate": 0.0008759079373413844, "epoch": 0.37 }, { "loss": 10.01, "grad_norm": 1.5482594966888428, "learning_rate": 0.0008750328170123392, "epoch": 0.38 }, { "loss": 10.2997, "grad_norm": 1.68232262134552, "learning_rate": 0.0008741576966832939, "epoch": 0.38 }, { "loss": 10.0902, "grad_norm": 1.206350564956665, "learning_rate": 0.0008732825763542487, "epoch": 0.38 }, { "loss": 9.6499, "grad_norm": 1.2805421352386475, "learning_rate": 0.0008724074560252034, "epoch": 0.38 }, { "loss": 10.3858, "grad_norm": 1.1297776699066162, "learning_rate": 0.0008715323356961582, "epoch": 0.39 }, { "loss": 10.4059, "grad_norm": 1.382300853729248, "learning_rate": 0.000870657215367113, "epoch": 0.39 }, { "loss": 9.8993, "grad_norm": 1.1831278800964355, "learning_rate": 0.0008697820950380677, "epoch": 0.39 }, { "loss": 10.2277, "grad_norm": 1.5924201011657715, "learning_rate": 0.0008689069747090225, "epoch": 0.39 }, { "loss": 10.2644, "grad_norm": 0.9275569319725037, "learning_rate": 0.0008680318543799773, "epoch": 0.4 }, { "loss": 10.2756, "grad_norm": 1.220247745513916, "learning_rate": 0.000867156734050932, "epoch": 0.4 }, { "loss": 9.873, "grad_norm": 1.2408357858657837, "learning_rate": 0.0008662816137218868, "epoch": 0.4 }, { "loss": 10.4232, "grad_norm": 2.236565351486206, "learning_rate": 0.0008654064933928415, "epoch": 0.4 }, { "loss": 9.8613, "grad_norm": 1.3093738555908203, "learning_rate": 0.0008645313730637963, "epoch": 0.41 }, { "loss": 10.1708, "grad_norm": 2.232199192047119, "learning_rate": 0.000863656252734751, "epoch": 0.41 }, { "loss": 9.9729, "grad_norm": 1.4281343221664429, "learning_rate": 0.0008627811324057057, "epoch": 0.41 }, { "loss": 10.3467, "grad_norm": 1.7682894468307495, "learning_rate": 0.0008619060120766605, "epoch": 0.41 }, { "loss": 9.7119, "grad_norm": 1.7619984149932861, "learning_rate": 0.0008610308917476152, "epoch": 0.42 }, { "loss": 10.4769, "grad_norm": 1.5372920036315918, "learning_rate": 0.00086015577141857, "epoch": 0.42 }, { "loss": 10.691, "grad_norm": 2.3789474964141846, "learning_rate": 0.0008592806510895248, "epoch": 0.42 }, { "loss": 9.8791, "grad_norm": 2.496776819229126, "learning_rate": 0.0008584055307604795, "epoch": 0.43 }, { "loss": 9.7356, "grad_norm": 4.118072032928467, "learning_rate": 0.0008575304104314343, "epoch": 0.43 }, { "loss": 10.3761, "grad_norm": 1.7359448671340942, "learning_rate": 0.0008566552901023891, "epoch": 0.43 }, { "loss": 10.1403, "grad_norm": 1.8283412456512451, "learning_rate": 0.0008557801697733438, "epoch": 0.43 }, { "loss": 10.306, "grad_norm": 1.9979033470153809, "learning_rate": 0.0008549050494442986, "epoch": 0.44 }, { "loss": 9.5832, "grad_norm": 3.1794967651367188, "learning_rate": 0.0008540299291152533, "epoch": 0.44 }, { "loss": 10.1963, "grad_norm": 3.1991539001464844, "learning_rate": 0.0008531548087862081, "epoch": 0.44 }, { "loss": 10.7828, "grad_norm": 2.5145182609558105, "learning_rate": 0.0008522796884571629, "epoch": 0.44 }, { "loss": 10.1017, "grad_norm": 1.0783337354660034, "learning_rate": 0.0008514045681281176, "epoch": 0.45 }, { "loss": 9.4955, "grad_norm": 6.040937423706055, "learning_rate": 0.0008505294477990724, "epoch": 0.45 }, { "loss": 10.3679, "grad_norm": 1.5212355852127075, "learning_rate": 0.0008496543274700271, "epoch": 0.45 }, { "loss": 9.7236, "grad_norm": 4.30284309387207, "learning_rate": 0.0008487792071409819, "epoch": 0.45 }, { "loss": 9.7635, "grad_norm": 2.9821696281433105, "learning_rate": 0.0008479040868119367, "epoch": 0.46 }, { "loss": 9.8438, "grad_norm": 1.676759958267212, "learning_rate": 0.0008470289664828914, "epoch": 0.46 }, { "loss": 9.6693, "grad_norm": 1.8075122833251953, "learning_rate": 0.0008461538461538462, "epoch": 0.46 }, { "loss": 10.4572, "grad_norm": 2.4182658195495605, "learning_rate": 0.000845278725824801, "epoch": 0.46 }, { "loss": 10.3901, "grad_norm": 1.7208518981933594, "learning_rate": 0.0008444036054957557, "epoch": 0.47 }, { "loss": 9.7696, "grad_norm": 2.4831340312957764, "learning_rate": 0.0008435284851667105, "epoch": 0.47 }, { "loss": 10.409, "grad_norm": 1.3335094451904297, "learning_rate": 0.0008426533648376652, "epoch": 0.47 }, { "loss": 10.526, "grad_norm": 0.9441933035850525, "learning_rate": 0.0008417782445086199, "epoch": 0.48 }, { "loss": 10.14, "grad_norm": 1.1018340587615967, "learning_rate": 0.0008409031241795747, "epoch": 0.48 }, { "loss": 10.0298, "grad_norm": 1.2077239751815796, "learning_rate": 0.0008400280038505294, "epoch": 0.48 }, { "loss": 9.7303, "grad_norm": 2.0401172637939453, "learning_rate": 0.0008391528835214842, "epoch": 0.48 }, { "loss": 10.1229, "grad_norm": 1.9456411600112915, "learning_rate": 0.0008382777631924389, "epoch": 0.49 }, { "loss": 9.9805, "grad_norm": 1.830814003944397, "learning_rate": 0.0008374026428633937, "epoch": 0.49 }, { "loss": 10.1328, "grad_norm": 2.1729185581207275, "learning_rate": 0.0008365275225343485, "epoch": 0.49 }, { "loss": 10.4834, "grad_norm": 1.324315071105957, "learning_rate": 0.0008356524022053032, "epoch": 0.49 }, { "loss": 10.349, "grad_norm": 2.837768077850342, "learning_rate": 0.000834777281876258, "epoch": 0.5 }, { "loss": 9.8015, "grad_norm": 1.1361275911331177, "learning_rate": 0.0008339021615472128, "epoch": 0.5 }, { "loss": 9.5739, "grad_norm": 1.4033498764038086, "learning_rate": 0.0008330270412181675, "epoch": 0.5 }, { "loss": 9.5204, "grad_norm": 1.1027082204818726, "learning_rate": 0.0008321519208891223, "epoch": 0.5 }, { "loss": 9.4961, "grad_norm": 2.2432548999786377, "learning_rate": 0.000831276800560077, "epoch": 0.51 }, { "loss": 10.4562, "grad_norm": 1.3807300329208374, "learning_rate": 0.0008304016802310318, "epoch": 0.51 }, { "loss": 9.9888, "grad_norm": 2.594301462173462, "learning_rate": 0.0008295265599019866, "epoch": 0.51 }, { "loss": 9.4501, "grad_norm": 1.4775426387786865, "learning_rate": 0.0008286514395729413, "epoch": 0.51 }, { "loss": 9.9432, "grad_norm": 1.463850736618042, "learning_rate": 0.0008277763192438961, "epoch": 0.52 }, { "loss": 9.7867, "grad_norm": 1.5370949506759644, "learning_rate": 0.0008269011989148508, "epoch": 0.52 }, { "loss": 9.674, "grad_norm": 1.2858608961105347, "learning_rate": 0.0008260260785858056, "epoch": 0.52 }, { "loss": 10.4663, "grad_norm": 1.2758288383483887, "learning_rate": 0.0008251509582567604, "epoch": 0.52 }, { "loss": 9.552, "grad_norm": 1.181013822555542, "learning_rate": 0.0008242758379277151, "epoch": 0.53 }, { "loss": 9.9999, "grad_norm": 0.9388832449913025, "learning_rate": 0.0008234007175986699, "epoch": 0.53 }, { "loss": 10.1529, "grad_norm": 1.3157830238342285, "learning_rate": 0.0008225255972696247, "epoch": 0.53 }, { "loss": 10.3224, "grad_norm": 1.603309154510498, "learning_rate": 0.0008216504769405794, "epoch": 0.54 }, { "loss": 9.2725, "grad_norm": 1.2987728118896484, "learning_rate": 0.0008207753566115342, "epoch": 0.54 }, { "loss": 10.2593, "grad_norm": 1.398086428642273, "learning_rate": 0.0008199002362824888, "epoch": 0.54 }, { "loss": 9.8407, "grad_norm": 1.3308155536651611, "learning_rate": 0.0008190251159534436, "epoch": 0.54 }, { "loss": 10.7467, "grad_norm": 1.3167645931243896, "learning_rate": 0.0008181499956243984, "epoch": 0.55 }, { "loss": 10.1278, "grad_norm": 1.935791254043579, "learning_rate": 0.0008172748752953531, "epoch": 0.55 }, { "loss": 9.9477, "grad_norm": 1.7790919542312622, "learning_rate": 0.0008163997549663079, "epoch": 0.55 }, { "loss": 9.2234, "grad_norm": 0.8335697650909424, "learning_rate": 0.0008155246346372626, "epoch": 0.55 }, { "loss": 9.8562, "grad_norm": 2.750474691390991, "learning_rate": 0.0008146495143082174, "epoch": 0.56 }, { "loss": 10.3218, "grad_norm": 1.4811447858810425, "learning_rate": 0.0008137743939791722, "epoch": 0.56 }, { "loss": 9.6582, "grad_norm": 1.9921342134475708, "learning_rate": 0.0008128992736501269, "epoch": 0.56 }, { "loss": 9.8513, "grad_norm": 2.635014295578003, "learning_rate": 0.0008120241533210817, "epoch": 0.56 }, { "loss": 9.8862, "grad_norm": 1.5898804664611816, "learning_rate": 0.0008111490329920365, "epoch": 0.57 }, { "loss": 9.4721, "grad_norm": 4.158829689025879, "learning_rate": 0.0008102739126629912, "epoch": 0.57 }, { "loss": 10.1474, "grad_norm": 1.8269054889678955, "learning_rate": 0.000809398792333946, "epoch": 0.57 }, { "loss": 9.4288, "grad_norm": 3.384010076522827, "learning_rate": 0.0008085236720049007, "epoch": 0.57 }, { "loss": 10.0144, "grad_norm": 1.6854453086853027, "learning_rate": 0.0008076485516758555, "epoch": 0.58 }, { "loss": 10.229, "grad_norm": 2.0812976360321045, "learning_rate": 0.0008067734313468103, "epoch": 0.58 }, { "loss": 9.7204, "grad_norm": 1.7673369646072388, "learning_rate": 0.000805898311017765, "epoch": 0.58 }, { "loss": 9.6859, "grad_norm": 2.155963897705078, "learning_rate": 0.0008050231906887198, "epoch": 0.59 }, { "loss": 10.029, "grad_norm": 1.482950210571289, "learning_rate": 0.0008041480703596745, "epoch": 0.59 }, { "loss": 9.25, "grad_norm": 2.6473171710968018, "learning_rate": 0.0008032729500306293, "epoch": 0.59 }, { "loss": 10.028, "grad_norm": 1.3584322929382324, "learning_rate": 0.0008023978297015841, "epoch": 0.59 }, { "loss": 9.6924, "grad_norm": 1.74970543384552, "learning_rate": 0.0008015227093725388, "epoch": 0.6 }, { "loss": 10.0445, "grad_norm": 2.0750019550323486, "learning_rate": 0.0008006475890434936, "epoch": 0.6 }, { "loss": 9.7962, "grad_norm": 7.219356060028076, "learning_rate": 0.0007997724687144482, "epoch": 0.6 }, { "loss": 10.215, "grad_norm": 1.2369924783706665, "learning_rate": 0.0007988973483854029, "epoch": 0.6 }, { "loss": 9.538, "grad_norm": 1.9686328172683716, "learning_rate": 0.0007980222280563577, "epoch": 0.61 }, { "loss": 10.2107, "grad_norm": 1.2081037759780884, "learning_rate": 0.0007971471077273124, "epoch": 0.61 }, { "loss": 9.6709, "grad_norm": 1.7755659818649292, "learning_rate": 0.0007962719873982672, "epoch": 0.61 }, { "loss": 9.7973, "grad_norm": 2.226400375366211, "learning_rate": 0.000795396867069222, "epoch": 0.61 }, { "loss": 9.5564, "grad_norm": 1.2814253568649292, "learning_rate": 0.0007945217467401767, "epoch": 0.62 }, { "loss": 9.7987, "grad_norm": 2.0225868225097656, "learning_rate": 0.0007936466264111315, "epoch": 0.62 }, { "loss": 10.0866, "grad_norm": 2.059910774230957, "learning_rate": 0.0007927715060820862, "epoch": 0.62 }, { "loss": 10.031, "grad_norm": 3.564408779144287, "learning_rate": 0.000791896385753041, "epoch": 0.62 }, { "loss": 9.5562, "grad_norm": 1.6237695217132568, "learning_rate": 0.0007910212654239958, "epoch": 0.63 }, { "loss": 10.032, "grad_norm": 1.8051832914352417, "learning_rate": 0.0007901461450949505, "epoch": 0.63 }, { "loss": 9.5223, "grad_norm": 1.807507872581482, "learning_rate": 0.0007892710247659053, "epoch": 0.63 }, { "loss": 9.4476, "grad_norm": 1.3200876712799072, "learning_rate": 0.00078839590443686, "epoch": 0.64 }, { "loss": 9.4836, "grad_norm": 3.295555353164673, "learning_rate": 0.0007875207841078148, "epoch": 0.64 }, { "loss": 9.9695, "grad_norm": 2.036158561706543, "learning_rate": 0.0007867331758116741, "epoch": 0.64 }, { "loss": 9.414, "grad_norm": 1.6501291990280151, "learning_rate": 0.0007858580554826289, "epoch": 0.64 }, { "loss": 10.3832, "grad_norm": 1.3873107433319092, "learning_rate": 0.0007849829351535836, "epoch": 0.65 }, { "loss": 9.6308, "grad_norm": 1.0633749961853027, "learning_rate": 0.0007841078148245384, "epoch": 0.65 }, { "loss": 9.8861, "grad_norm": 2.238201141357422, "learning_rate": 0.0007832326944954931, "epoch": 0.65 }, { "loss": 9.9682, "grad_norm": 1.2320759296417236, "learning_rate": 0.0007823575741664479, "epoch": 0.65 }, { "loss": 9.496, "grad_norm": 1.8895844221115112, "learning_rate": 0.0007814824538374027, "epoch": 0.66 }, { "loss": 9.9117, "grad_norm": 1.7297803163528442, "learning_rate": 0.0007806073335083574, "epoch": 0.66 }, { "loss": 10.0705, "grad_norm": 1.8089996576309204, "learning_rate": 0.0007797322131793122, "epoch": 0.66 }, { "loss": 9.8684, "grad_norm": 2.4221599102020264, "learning_rate": 0.000778857092850267, "epoch": 0.66 }, { "loss": 9.343, "grad_norm": 1.869035243988037, "learning_rate": 0.0007779819725212217, "epoch": 0.67 }, { "loss": 9.395, "grad_norm": 1.5427782535552979, "learning_rate": 0.0007771068521921765, "epoch": 0.67 }, { "loss": 9.3372, "grad_norm": 1.2343759536743164, "learning_rate": 0.0007762317318631312, "epoch": 0.67 }, { "loss": 10.0514, "grad_norm": 1.057860016822815, "learning_rate": 0.000775356611534086, "epoch": 0.67 }, { "loss": 9.8897, "grad_norm": 4.536896228790283, "learning_rate": 0.0007744814912050408, "epoch": 0.68 }, { "loss": 9.7529, "grad_norm": 2.2841501235961914, "learning_rate": 0.0007736063708759955, "epoch": 0.68 }, { "loss": 9.7393, "grad_norm": 1.4836674928665161, "learning_rate": 0.0007727312505469503, "epoch": 0.68 }, { "loss": 9.4403, "grad_norm": 1.9073762893676758, "learning_rate": 0.000771856130217905, "epoch": 0.69 }, { "loss": 9.8424, "grad_norm": 2.367785930633545, "learning_rate": 0.0007709810098888598, "epoch": 0.69 }, { "loss": 9.5098, "grad_norm": 0.824318528175354, "learning_rate": 0.0007701058895598146, "epoch": 0.69 }, { "loss": 9.4785, "grad_norm": 1.2716361284255981, "learning_rate": 0.0007692307692307693, "epoch": 0.69 }, { "loss": 9.8112, "grad_norm": 2.1307737827301025, "learning_rate": 0.0007683556489017241, "epoch": 0.7 }, { "loss": 9.5932, "grad_norm": 2.0558087825775146, "learning_rate": 0.0007674805285726788, "epoch": 0.7 }, { "loss": 9.5525, "grad_norm": 1.582262396812439, "learning_rate": 0.0007666054082436335, "epoch": 0.7 }, { "loss": 9.8359, "grad_norm": 7.788843154907227, "learning_rate": 0.0007657302879145883, "epoch": 0.7 }, { "loss": 10.3724, "grad_norm": 1.328479528427124, "learning_rate": 0.000764855167585543, "epoch": 0.71 }, { "loss": 8.8465, "grad_norm": 1.6026923656463623, "learning_rate": 0.0007639800472564978, "epoch": 0.71 }, { "loss": 9.4257, "grad_norm": 4.00112247467041, "learning_rate": 0.0007631049269274526, "epoch": 0.71 }, { "loss": 9.4006, "grad_norm": 1.2519035339355469, "learning_rate": 0.0007622298065984073, "epoch": 0.71 }, { "loss": 9.2469, "grad_norm": 1.0302975177764893, "learning_rate": 0.0007613546862693621, "epoch": 0.72 }, { "loss": 9.6992, "grad_norm": 1.066437840461731, "learning_rate": 0.0007604795659403168, "epoch": 0.72 }, { "loss": 8.9602, "grad_norm": 1.232923984527588, "learning_rate": 0.0007596044456112715, "epoch": 0.72 }, { "loss": 10.1371, "grad_norm": 2.129009962081909, "learning_rate": 0.0007587293252822263, "epoch": 0.72 }, { "loss": 9.3879, "grad_norm": 1.385560154914856, "learning_rate": 0.000757854204953181, "epoch": 0.73 }, { "loss": 9.898, "grad_norm": 15.102237701416016, "learning_rate": 0.0007569790846241358, "epoch": 0.73 }, { "loss": 9.723, "grad_norm": 1.5371789932250977, "learning_rate": 0.0007561039642950905, "epoch": 0.73 }, { "loss": 9.5436, "grad_norm": 1.3847825527191162, "learning_rate": 0.0007552288439660453, "epoch": 0.73 }, { "loss": 9.4084, "grad_norm": 2.662229299545288, "learning_rate": 0.0007543537236370001, "epoch": 0.74 }, { "loss": 9.6916, "grad_norm": 1.3952440023422241, "learning_rate": 0.0007534786033079548, "epoch": 0.74 }, { "loss": 9.2971, "grad_norm": 2.79449725151062, "learning_rate": 0.0007526034829789096, "epoch": 0.74 }, { "loss": 9.6677, "grad_norm": 0.959707498550415, "learning_rate": 0.0007517283626498644, "epoch": 0.75 }, { "loss": 9.5952, "grad_norm": 1.7505630254745483, "learning_rate": 0.0007508532423208191, "epoch": 0.75 }, { "loss": 9.901, "grad_norm": 4.176792621612549, "learning_rate": 0.0007499781219917739, "epoch": 0.75 }, { "loss": 9.5036, "grad_norm": 2.338407516479492, "learning_rate": 0.0007491030016627286, "epoch": 0.75 }, { "loss": 10.0173, "grad_norm": 1.4003384113311768, "learning_rate": 0.0007482278813336834, "epoch": 0.76 }, { "loss": 9.7204, "grad_norm": 2.0305333137512207, "learning_rate": 0.0007473527610046382, "epoch": 0.76 }, { "loss": 9.7901, "grad_norm": 2.2396442890167236, "learning_rate": 0.0007464776406755928, "epoch": 0.76 }, { "loss": 9.5465, "grad_norm": 3.230546474456787, "learning_rate": 0.0007456025203465476, "epoch": 0.76 }, { "loss": 8.9817, "grad_norm": 3.14975643157959, "learning_rate": 0.0007447274000175023, "epoch": 0.77 }, { "loss": 10.0403, "grad_norm": 2.1714890003204346, "learning_rate": 0.0007438522796884571, "epoch": 0.77 }, { "loss": 9.054, "grad_norm": 1.8472590446472168, "learning_rate": 0.0007429771593594119, "epoch": 0.77 }, { "loss": 9.4847, "grad_norm": 1.0868862867355347, "learning_rate": 0.0007421020390303666, "epoch": 0.77 }, { "loss": 9.5688, "grad_norm": 0.9088165760040283, "learning_rate": 0.0007412269187013214, "epoch": 0.78 }, { "loss": 9.2655, "grad_norm": 1.2336516380310059, "learning_rate": 0.0007403517983722762, "epoch": 0.78 }, { "loss": 9.6194, "grad_norm": 1.2794588804244995, "learning_rate": 0.0007394766780432309, "epoch": 0.78 }, { "loss": 9.4072, "grad_norm": 1.5056113004684448, "learning_rate": 0.0007386015577141857, "epoch": 0.78 }, { "loss": 8.781, "grad_norm": 1.809520959854126, "learning_rate": 0.0007377264373851404, "epoch": 0.79 }, { "loss": 9.3203, "grad_norm": 3.1000723838806152, "learning_rate": 0.0007368513170560952, "epoch": 0.79 }, { "loss": 9.3199, "grad_norm": 4.879993915557861, "learning_rate": 0.00073597619672705, "epoch": 0.79 }, { "loss": 10.2243, "grad_norm": 1.508380651473999, "learning_rate": 0.0007351010763980047, "epoch": 0.8 }, { "loss": 9.3476, "grad_norm": 1.2379094362258911, "learning_rate": 0.0007342259560689595, "epoch": 0.8 }, { "loss": 9.3482, "grad_norm": 1.3472929000854492, "learning_rate": 0.0007333508357399142, "epoch": 0.8 }, { "loss": 9.1645, "grad_norm": 1.2490941286087036, "learning_rate": 0.000732475715410869, "epoch": 0.8 }, { "loss": 9.8443, "grad_norm": 1.3615162372589111, "learning_rate": 0.0007316005950818238, "epoch": 0.81 }, { "loss": 9.1608, "grad_norm": 1.608033299446106, "learning_rate": 0.0007307254747527785, "epoch": 0.81 }, { "loss": 9.5366, "grad_norm": 1.819758415222168, "learning_rate": 0.0007298503544237333, "epoch": 0.81 }, { "loss": 9.3414, "grad_norm": 1.190049409866333, "learning_rate": 0.0007289752340946881, "epoch": 0.81 }, { "loss": 9.3362, "grad_norm": 1.136693000793457, "learning_rate": 0.0007281001137656428, "epoch": 0.82 }, { "loss": 9.4184, "grad_norm": 1.3066457509994507, "learning_rate": 0.0007272249934365976, "epoch": 0.82 }, { "loss": 9.3295, "grad_norm": 2.193195343017578, "learning_rate": 0.0007263498731075523, "epoch": 0.82 }, { "loss": 9.0824, "grad_norm": 1.2458583116531372, "learning_rate": 0.0007254747527785071, "epoch": 0.82 }, { "loss": 9.4671, "grad_norm": 1.4734137058258057, "learning_rate": 0.0007245996324494618, "epoch": 0.83 }, { "loss": 8.8882, "grad_norm": 1.8609868288040161, "learning_rate": 0.0007237245121204165, "epoch": 0.83 }, { "loss": 9.8334, "grad_norm": 1.2084137201309204, "learning_rate": 0.0007228493917913713, "epoch": 0.83 }, { "loss": 9.301, "grad_norm": 1.3520543575286865, "learning_rate": 0.000721974271462326, "epoch": 0.83 }, { "loss": 9.4308, "grad_norm": 1.7796053886413574, "learning_rate": 0.0007210991511332808, "epoch": 0.84 }, { "loss": 9.2915, "grad_norm": 1.583756685256958, "learning_rate": 0.0007202240308042356, "epoch": 0.84 }, { "loss": 9.543, "grad_norm": 1.3439078330993652, "learning_rate": 0.0007193489104751903, "epoch": 0.84 }, { "loss": 9.4767, "grad_norm": 1.0626850128173828, "learning_rate": 0.0007184737901461451, "epoch": 0.85 }, { "loss": 9.6831, "grad_norm": 1.559846043586731, "learning_rate": 0.0007175986698170999, "epoch": 0.85 }, { "loss": 9.3683, "grad_norm": 1.3399856090545654, "learning_rate": 0.0007167235494880546, "epoch": 0.85 }, { "loss": 9.4018, "grad_norm": 2.0115649700164795, "learning_rate": 0.0007158484291590094, "epoch": 0.85 }, { "loss": 9.6007, "grad_norm": 1.9016413688659668, "learning_rate": 0.0007149733088299641, "epoch": 0.86 }, { "loss": 9.7843, "grad_norm": 9.662792205810547, "learning_rate": 0.0007140981885009189, "epoch": 0.86 }, { "loss": 9.4248, "grad_norm": 0.9219140410423279, "learning_rate": 0.0007132230681718737, "epoch": 0.86 }, { "loss": 9.8659, "grad_norm": 1.0851889848709106, "learning_rate": 0.0007123479478428284, "epoch": 0.86 }, { "loss": 9.1677, "grad_norm": 1.0349225997924805, "learning_rate": 0.0007114728275137832, "epoch": 0.87 }, { "loss": 9.1666, "grad_norm": 1.286309003829956, "learning_rate": 0.000710597707184738, "epoch": 0.87 }, { "loss": 9.5514, "grad_norm": 1.0325031280517578, "learning_rate": 0.0007097225868556927, "epoch": 0.87 }, { "loss": 9.2542, "grad_norm": 1.2344691753387451, "learning_rate": 0.0007088474665266475, "epoch": 0.87 }, { "loss": 9.1687, "grad_norm": 0.9820197224617004, "learning_rate": 0.0007079723461976022, "epoch": 0.88 }, { "loss": 8.9295, "grad_norm": 2.573585033416748, "learning_rate": 0.000707097225868557, "epoch": 0.88 }, { "loss": 9.6702, "grad_norm": 0.8707136511802673, "learning_rate": 0.0007062221055395118, "epoch": 0.88 }, { "loss": 8.5564, "grad_norm": 0.9832028150558472, "learning_rate": 0.0007053469852104665, "epoch": 0.88 }, { "loss": 9.426, "grad_norm": 2.1577107906341553, "learning_rate": 0.0007044718648814213, "epoch": 0.89 }, { "loss": 9.2118, "grad_norm": 1.6314407587051392, "learning_rate": 0.000703596744552376, "epoch": 0.89 }, { "loss": 9.4482, "grad_norm": 1.6563376188278198, "learning_rate": 0.0007027216242233307, "epoch": 0.89 }, { "loss": 10.1221, "grad_norm": 1.3398720026016235, "learning_rate": 0.0007018465038942855, "epoch": 0.9 }, { "loss": 9.2569, "grad_norm": 1.2780015468597412, "learning_rate": 0.0007009713835652402, "epoch": 0.9 }, { "loss": 9.1485, "grad_norm": 1.3434102535247803, "learning_rate": 0.000700096263236195, "epoch": 0.9 }, { "loss": 9.3431, "grad_norm": 2.2103283405303955, "learning_rate": 0.0006992211429071497, "epoch": 0.9 }, { "loss": 9.5529, "grad_norm": 2.479997158050537, "learning_rate": 0.0006983460225781045, "epoch": 0.91 }, { "loss": 8.5835, "grad_norm": 1.3891953229904175, "learning_rate": 0.0006974709022490593, "epoch": 0.91 }, { "loss": 8.835, "grad_norm": 0.9400926828384399, "learning_rate": 0.000696595781920014, "epoch": 0.91 }, { "loss": 9.1069, "grad_norm": 1.2385962009429932, "learning_rate": 0.0006957206615909688, "epoch": 0.91 }, { "loss": 9.2235, "grad_norm": 1.7397691011428833, "learning_rate": 0.0006948455412619236, "epoch": 0.92 }, { "loss": 9.2386, "grad_norm": 1.7163151502609253, "learning_rate": 0.0006939704209328783, "epoch": 0.92 }, { "loss": 8.7562, "grad_norm": 1.5626498460769653, "learning_rate": 0.0006930953006038331, "epoch": 0.92 }, { "loss": 8.8432, "grad_norm": 1.9265193939208984, "learning_rate": 0.0006922201802747878, "epoch": 0.92 }, { "loss": 8.8117, "grad_norm": 1.4459571838378906, "learning_rate": 0.0006913450599457426, "epoch": 0.93 }, { "loss": 9.439, "grad_norm": 0.9559070467948914, "learning_rate": 0.0006904699396166974, "epoch": 0.93 }, { "loss": 9.1912, "grad_norm": 1.9344050884246826, "learning_rate": 0.0006895948192876521, "epoch": 0.93 }, { "loss": 9.5571, "grad_norm": 1.52436101436615, "learning_rate": 0.0006887196989586069, "epoch": 0.93 }, { "loss": 8.9898, "grad_norm": 1.4828134775161743, "learning_rate": 0.0006878445786295616, "epoch": 0.94 }, { "loss": 9.1776, "grad_norm": 1.4312185049057007, "learning_rate": 0.0006869694583005164, "epoch": 0.94 }, { "loss": 10.1621, "grad_norm": 1.2089942693710327, "learning_rate": 0.0006860943379714712, "epoch": 0.94 }, { "loss": 8.8634, "grad_norm": 5.034254550933838, "learning_rate": 0.0006852192176424259, "epoch": 0.94 }, { "loss": 9.1892, "grad_norm": 2.494285821914673, "learning_rate": 0.0006843440973133807, "epoch": 0.95 }, { "loss": 8.6028, "grad_norm": 1.5366199016571045, "learning_rate": 0.0006834689769843355, "epoch": 0.95 }, { "loss": 9.0938, "grad_norm": 1.1272014379501343, "learning_rate": 0.0006825938566552902, "epoch": 0.95 }, { "loss": 9.607, "grad_norm": 3.852747917175293, "learning_rate": 0.000681718736326245, "epoch": 0.96 }, { "loss": 9.6214, "grad_norm": 1.9155749082565308, "learning_rate": 0.0006808436159971996, "epoch": 0.96 }, { "loss": 8.6868, "grad_norm": 1.9045560359954834, "learning_rate": 0.0006799684956681543, "epoch": 0.96 }, { "loss": 9.8133, "grad_norm": 1.4083536863327026, "learning_rate": 0.000679093375339109, "epoch": 0.96 }, { "loss": 9.2029, "grad_norm": 4.824470043182373, "learning_rate": 0.0006782182550100638, "epoch": 0.97 }, { "loss": 9.3758, "grad_norm": 1.2905750274658203, "learning_rate": 0.0006773431346810186, "epoch": 0.97 }, { "loss": 9.2105, "grad_norm": 1.4681618213653564, "learning_rate": 0.0006764680143519733, "epoch": 0.97 }, { "loss": 9.1096, "grad_norm": 1.5041123628616333, "learning_rate": 0.0006755928940229281, "epoch": 0.97 }, { "loss": 9.1485, "grad_norm": 1.7930779457092285, "learning_rate": 0.0006747177736938829, "epoch": 0.98 }, { "loss": 9.2587, "grad_norm": 1.1871591806411743, "learning_rate": 0.0006738426533648376, "epoch": 0.98 }, { "loss": 9.2174, "grad_norm": 1.550445556640625, "learning_rate": 0.0006729675330357924, "epoch": 0.98 }, { "loss": 8.8521, "grad_norm": 1.361382007598877, "learning_rate": 0.0006720924127067471, "epoch": 0.98 }, { "loss": 9.0098, "grad_norm": 1.350142002105713, "learning_rate": 0.0006712172923777019, "epoch": 0.99 }, { "loss": 8.6736, "grad_norm": 1.2662369012832642, "learning_rate": 0.0006703421720486567, "epoch": 0.99 }, { "loss": 8.9752, "grad_norm": 1.474623441696167, "learning_rate": 0.0006694670517196114, "epoch": 0.99 }, { "loss": 8.7473, "grad_norm": 2.676971912384033, "learning_rate": 0.0006685919313905662, "epoch": 0.99 }, { "loss": 8.8512, "grad_norm": 1.114418625831604, "learning_rate": 0.000667716811061521, "epoch": 1.0 }, { "loss": 8.1921, "grad_norm": 2.0294203758239746, "learning_rate": 0.0006668416907324757, "epoch": 1.0 }, { "loss": 8.8171, "grad_norm": 0.9778627157211304, "learning_rate": 0.0006659665704034305, "epoch": 1.0 }, { "loss": 8.8809, "grad_norm": 1.621929407119751, "learning_rate": 0.0006650914500743852, "epoch": 1.01 }, { "loss": 8.9527, "grad_norm": 1.0340059995651245, "learning_rate": 0.00066421632974534, "epoch": 1.01 }, { "loss": 8.6295, "grad_norm": 1.4925633668899536, "learning_rate": 0.0006633412094162948, "epoch": 1.01 }, { "loss": 8.7158, "grad_norm": 1.3651670217514038, "learning_rate": 0.0006624660890872495, "epoch": 1.01 }, { "loss": 9.0606, "grad_norm": 1.1281485557556152, "learning_rate": 0.0006615909687582043, "epoch": 1.02 }, { "loss": 8.8925, "grad_norm": 1.0784941911697388, "learning_rate": 0.000660715848429159, "epoch": 1.02 }, { "loss": 9.1237, "grad_norm": 1.49080228805542, "learning_rate": 0.0006598407281001137, "epoch": 1.02 }, { "loss": 8.9093, "grad_norm": 1.080828309059143, "learning_rate": 0.0006589656077710685, "epoch": 1.02 }, { "loss": 8.9275, "grad_norm": 1.0867069959640503, "learning_rate": 0.0006580904874420232, "epoch": 1.03 }, { "loss": 8.5924, "grad_norm": 1.0178778171539307, "learning_rate": 0.000657215367112978, "epoch": 1.03 }, { "loss": 8.8768, "grad_norm": 0.978421688079834, "learning_rate": 0.0006563402467839327, "epoch": 1.03 }, { "loss": 8.8812, "grad_norm": 1.6234030723571777, "learning_rate": 0.0006554651264548875, "epoch": 1.03 }, { "loss": 9.5212, "grad_norm": 5.744367599487305, "learning_rate": 0.0006545900061258423, "epoch": 1.04 }, { "loss": 8.066, "grad_norm": 3.1010031700134277, "learning_rate": 0.000653714885796797, "epoch": 1.04 }, { "loss": 8.8401, "grad_norm": 1.4084874391555786, "learning_rate": 0.0006528397654677518, "epoch": 1.04 }, { "loss": 9.1554, "grad_norm": 1.4125443696975708, "learning_rate": 0.0006519646451387066, "epoch": 1.04 }, { "loss": 8.5098, "grad_norm": 1.0087417364120483, "learning_rate": 0.0006510895248096613, "epoch": 1.05 }, { "loss": 8.6227, "grad_norm": 1.404480218887329, "learning_rate": 0.0006502144044806161, "epoch": 1.05 }, { "loss": 8.7843, "grad_norm": 1.1295698881149292, "learning_rate": 0.0006493392841515708, "epoch": 1.05 }, { "loss": 8.6766, "grad_norm": 1.0821887254714966, "learning_rate": 0.0006484641638225256, "epoch": 1.06 }, { "loss": 8.6414, "grad_norm": 1.1444706916809082, "learning_rate": 0.0006475890434934804, "epoch": 1.06 }, { "loss": 8.457, "grad_norm": 1.277224063873291, "learning_rate": 0.0006467139231644351, "epoch": 1.06 }, { "loss": 9.058, "grad_norm": 1.4391515254974365, "learning_rate": 0.0006458388028353899, "epoch": 1.06 }, { "loss": 9.0137, "grad_norm": 1.1909124851226807, "learning_rate": 0.0006449636825063447, "epoch": 1.07 }, { "loss": 8.95, "grad_norm": 1.1959373950958252, "learning_rate": 0.0006440885621772994, "epoch": 1.07 }, { "loss": 8.7242, "grad_norm": 1.0742520093917847, "learning_rate": 0.0006432134418482542, "epoch": 1.07 }, { "loss": 8.6848, "grad_norm": 1.1215168237686157, "learning_rate": 0.0006423383215192089, "epoch": 1.07 }, { "loss": 8.2356, "grad_norm": 1.329377293586731, "learning_rate": 0.0006414632011901637, "epoch": 1.08 }, { "loss": 9.357, "grad_norm": 1.2252676486968994, "learning_rate": 0.0006405880808611185, "epoch": 1.08 }, { "loss": 8.9564, "grad_norm": 1.4522862434387207, "learning_rate": 0.0006397129605320732, "epoch": 1.08 }, { "loss": 9.315, "grad_norm": 1.3707520961761475, "learning_rate": 0.000638837840203028, "epoch": 1.08 }, { "loss": 8.5879, "grad_norm": 1.6546357870101929, "learning_rate": 0.0006379627198739826, "epoch": 1.09 }, { "loss": 9.4063, "grad_norm": 0.9310407638549805, "learning_rate": 0.0006370875995449374, "epoch": 1.09 }, { "loss": 8.8435, "grad_norm": 0.9878571629524231, "learning_rate": 0.0006362124792158922, "epoch": 1.09 }, { "loss": 9.0975, "grad_norm": 0.9288727045059204, "learning_rate": 0.0006353373588868469, "epoch": 1.09 }, { "loss": 9.219, "grad_norm": 0.9407894015312195, "learning_rate": 0.0006344622385578017, "epoch": 1.1 }, { "loss": 8.6555, "grad_norm": 0.9899985790252686, "learning_rate": 0.0006335871182287564, "epoch": 1.1 }, { "loss": 8.1403, "grad_norm": 0.8422369360923767, "learning_rate": 0.0006327119978997112, "epoch": 1.1 }, { "loss": 8.5879, "grad_norm": 1.1602038145065308, "learning_rate": 0.000631836877570666, "epoch": 1.11 }, { "loss": 8.8147, "grad_norm": 1.0149036645889282, "learning_rate": 0.0006309617572416207, "epoch": 1.11 }, { "loss": 8.6708, "grad_norm": 1.3015429973602295, "learning_rate": 0.0006300866369125755, "epoch": 1.11 }, { "loss": 8.213, "grad_norm": 1.0710703134536743, "learning_rate": 0.0006292115165835303, "epoch": 1.11 }, { "loss": 8.7651, "grad_norm": 0.9002228379249573, "learning_rate": 0.000628336396254485, "epoch": 1.12 }, { "loss": 9.2161, "grad_norm": 1.2090556621551514, "learning_rate": 0.0006274612759254398, "epoch": 1.12 }, { "loss": 8.4087, "grad_norm": 1.2179570198059082, "learning_rate": 0.0006265861555963945, "epoch": 1.12 }, { "loss": 8.5906, "grad_norm": 1.7626177072525024, "learning_rate": 0.0006257110352673493, "epoch": 1.12 }, { "loss": 8.7996, "grad_norm": 1.2657760381698608, "learning_rate": 0.0006248359149383041, "epoch": 1.13 }, { "loss": 8.7193, "grad_norm": 0.8737196326255798, "learning_rate": 0.0006239607946092588, "epoch": 1.13 }, { "loss": 8.5347, "grad_norm": 1.1074841022491455, "learning_rate": 0.0006230856742802136, "epoch": 1.13 }, { "loss": 8.8374, "grad_norm": 1.264391303062439, "learning_rate": 0.0006222105539511684, "epoch": 1.13 }, { "loss": 7.9866, "grad_norm": 1.0013505220413208, "learning_rate": 0.0006213354336221231, "epoch": 1.14 }, { "loss": 8.1635, "grad_norm": 1.0330276489257812, "learning_rate": 0.0006204603132930779, "epoch": 1.14 }, { "loss": 8.1751, "grad_norm": 1.125343918800354, "learning_rate": 0.0006195851929640326, "epoch": 1.14 }, { "loss": 9.082, "grad_norm": 1.0461503267288208, "learning_rate": 0.0006187100726349874, "epoch": 1.14 }, { "loss": 8.4013, "grad_norm": 1.2671931982040405, "learning_rate": 0.0006178349523059422, "epoch": 1.15 }, { "loss": 8.735, "grad_norm": 1.315640926361084, "learning_rate": 0.0006169598319768969, "epoch": 1.15 }, { "loss": 8.3872, "grad_norm": 1.0746458768844604, "learning_rate": 0.0006160847116478516, "epoch": 1.15 }, { "loss": 8.4791, "grad_norm": 0.9568318724632263, "learning_rate": 0.0006152095913188063, "epoch": 1.15 }, { "loss": 8.4284, "grad_norm": 1.0956138372421265, "learning_rate": 0.0006143344709897611, "epoch": 1.16 }, { "loss": 9.1513, "grad_norm": 1.2635217905044556, "learning_rate": 0.0006134593506607159, "epoch": 1.16 }, { "loss": 8.7084, "grad_norm": 1.242577075958252, "learning_rate": 0.0006125842303316706, "epoch": 1.16 }, { "loss": 8.9941, "grad_norm": 1.0156121253967285, "learning_rate": 0.0006117091100026254, "epoch": 1.17 }, { "loss": 8.731, "grad_norm": 1.3975499868392944, "learning_rate": 0.0006108339896735801, "epoch": 1.17 }, { "loss": 8.5287, "grad_norm": 1.0764504671096802, "learning_rate": 0.0006099588693445349, "epoch": 1.17 }, { "loss": 8.2368, "grad_norm": 1.0151234865188599, "learning_rate": 0.0006090837490154897, "epoch": 1.17 }, { "loss": 9.1091, "grad_norm": 6.751773834228516, "learning_rate": 0.0006082086286864444, "epoch": 1.18 }, { "loss": 8.7919, "grad_norm": 0.95284503698349, "learning_rate": 0.0006073335083573992, "epoch": 1.18 }, { "loss": 8.0937, "grad_norm": 1.131046175956726, "learning_rate": 0.000606458388028354, "epoch": 1.18 }, { "loss": 8.4255, "grad_norm": 0.8307482600212097, "learning_rate": 0.0006055832676993087, "epoch": 1.18 }, { "loss": 8.3428, "grad_norm": 1.1681163311004639, "learning_rate": 0.0006047081473702635, "epoch": 1.19 }, { "loss": 8.1699, "grad_norm": 1.6491031646728516, "learning_rate": 0.0006038330270412182, "epoch": 1.19 }, { "loss": 8.3981, "grad_norm": 0.9328737258911133, "learning_rate": 0.000602957906712173, "epoch": 1.19 }, { "loss": 8.5749, "grad_norm": 1.3434003591537476, "learning_rate": 0.0006020827863831278, "epoch": 1.19 }, { "loss": 8.6492, "grad_norm": 1.1651496887207031, "learning_rate": 0.0006012076660540825, "epoch": 1.2 }, { "loss": 8.9343, "grad_norm": 1.1224288940429688, "learning_rate": 0.0006003325457250373, "epoch": 1.2 }, { "loss": 8.4265, "grad_norm": 1.1075445413589478, "learning_rate": 0.0005994574253959919, "epoch": 1.2 }, { "loss": 8.3367, "grad_norm": 1.0349383354187012, "learning_rate": 0.0005985823050669467, "epoch": 1.2 }, { "loss": 8.6752, "grad_norm": 0.9915909767150879, "learning_rate": 0.0005977071847379015, "epoch": 1.21 }, { "loss": 8.2193, "grad_norm": 1.172624111175537, "learning_rate": 0.0005968320644088562, "epoch": 1.21 }, { "loss": 7.7701, "grad_norm": 1.0810112953186035, "learning_rate": 0.000595956944079811, "epoch": 1.21 }, { "loss": 8.9113, "grad_norm": 1.1411935091018677, "learning_rate": 0.0005950818237507656, "epoch": 1.22 }, { "loss": 8.3426, "grad_norm": 0.9251805543899536, "learning_rate": 0.0005942067034217204, "epoch": 1.22 }, { "loss": 8.1973, "grad_norm": 0.9023226499557495, "learning_rate": 0.0005933315830926752, "epoch": 1.22 }, { "loss": 8.8777, "grad_norm": 0.9467354416847229, "learning_rate": 0.0005924564627636299, "epoch": 1.22 }, { "loss": 8.758, "grad_norm": 0.9941525459289551, "learning_rate": 0.0005915813424345847, "epoch": 1.23 }, { "loss": 8.6786, "grad_norm": 0.7721539735794067, "learning_rate": 0.0005907062221055395, "epoch": 1.23 }, { "loss": 8.7063, "grad_norm": 0.9968111515045166, "learning_rate": 0.0005898311017764942, "epoch": 1.23 }, { "loss": 8.4121, "grad_norm": 0.8019425272941589, "learning_rate": 0.000588955981447449, "epoch": 1.23 }, { "loss": 8.8181, "grad_norm": 1.1664308309555054, "learning_rate": 0.0005880808611184037, "epoch": 1.24 }, { "loss": 8.1548, "grad_norm": 1.008786678314209, "learning_rate": 0.0005872057407893585, "epoch": 1.24 }, { "loss": 8.5725, "grad_norm": 1.2349562644958496, "learning_rate": 0.0005863306204603133, "epoch": 1.24 }, { "loss": 8.8339, "grad_norm": 1.2367397546768188, "learning_rate": 0.000585455500131268, "epoch": 1.24 }, { "loss": 8.3184, "grad_norm": 0.9427123665809631, "learning_rate": 0.0005845803798022228, "epoch": 1.25 }, { "loss": 8.2814, "grad_norm": 0.951808512210846, "learning_rate": 0.0005837052594731775, "epoch": 1.25 }, { "loss": 8.1453, "grad_norm": 1.076816439628601, "learning_rate": 0.0005828301391441323, "epoch": 1.25 }, { "loss": 8.5114, "grad_norm": 1.248741865158081, "learning_rate": 0.0005819550188150871, "epoch": 1.25 }, { "loss": 8.7265, "grad_norm": 1.0166980028152466, "learning_rate": 0.0005810798984860418, "epoch": 1.26 }, { "loss": 9.0454, "grad_norm": 1.273942232131958, "learning_rate": 0.0005802047781569966, "epoch": 1.26 }, { "loss": 8.6499, "grad_norm": 0.8551316857337952, "learning_rate": 0.0005793296578279514, "epoch": 1.26 }, { "loss": 8.0282, "grad_norm": 1.0231510400772095, "learning_rate": 0.0005784545374989061, "epoch": 1.27 }, { "loss": 8.5694, "grad_norm": 0.8138982653617859, "learning_rate": 0.0005775794171698609, "epoch": 1.27 }, { "loss": 8.9449, "grad_norm": 1.151458978652954, "learning_rate": 0.0005767042968408156, "epoch": 1.27 }, { "loss": 8.5309, "grad_norm": 1.311020851135254, "learning_rate": 0.0005758291765117704, "epoch": 1.27 }, { "loss": 8.3937, "grad_norm": 1.0431928634643555, "learning_rate": 0.0005749540561827252, "epoch": 1.28 }, { "loss": 8.0121, "grad_norm": 0.9487342238426208, "learning_rate": 0.0005740789358536799, "epoch": 1.28 }, { "loss": 8.9756, "grad_norm": 0.7705584764480591, "learning_rate": 0.0005732038155246346, "epoch": 1.28 }, { "loss": 8.9679, "grad_norm": 0.9359903335571289, "learning_rate": 0.0005723286951955893, "epoch": 1.28 }, { "loss": 8.0724, "grad_norm": 1.031725525856018, "learning_rate": 0.0005714535748665441, "epoch": 1.29 }, { "loss": 8.7014, "grad_norm": 1.0501611232757568, "learning_rate": 0.0005705784545374989, "epoch": 1.29 }, { "loss": 8.2284, "grad_norm": 0.8158836960792542, "learning_rate": 0.0005697033342084536, "epoch": 1.29 }, { "loss": 8.8206, "grad_norm": 0.8827638030052185, "learning_rate": 0.0005688282138794084, "epoch": 1.29 }, { "loss": 8.4189, "grad_norm": 0.9118880033493042, "learning_rate": 0.0005679530935503632, "epoch": 1.3 }, { "loss": 8.5532, "grad_norm": 1.2081084251403809, "learning_rate": 0.0005670779732213179, "epoch": 1.3 }, { "loss": 8.5477, "grad_norm": 1.3465925455093384, "learning_rate": 0.0005662028528922727, "epoch": 1.3 }, { "loss": 9.2068, "grad_norm": 0.8770077228546143, "learning_rate": 0.0005653277325632274, "epoch": 1.3 }, { "loss": 8.6147, "grad_norm": 1.1257092952728271, "learning_rate": 0.0005644526122341822, "epoch": 1.31 }, { "loss": 8.4279, "grad_norm": 1.0482877492904663, "learning_rate": 0.000563577491905137, "epoch": 1.31 }, { "loss": 9.1236, "grad_norm": 1.0635833740234375, "learning_rate": 0.0005627023715760917, "epoch": 1.31 }, { "loss": 8.7325, "grad_norm": 0.866674542427063, "learning_rate": 0.0005618272512470465, "epoch": 1.32 }, { "loss": 8.3691, "grad_norm": 0.9562137126922607, "learning_rate": 0.0005609521309180012, "epoch": 1.32 }, { "loss": 8.3844, "grad_norm": 1.2593939304351807, "learning_rate": 0.000560077010588956, "epoch": 1.32 }, { "loss": 8.7797, "grad_norm": 0.8865370154380798, "learning_rate": 0.0005592018902599108, "epoch": 1.32 }, { "loss": 8.7078, "grad_norm": 1.0417253971099854, "learning_rate": 0.0005583267699308655, "epoch": 1.33 }, { "loss": 8.6024, "grad_norm": 1.1513303518295288, "learning_rate": 0.0005574516496018203, "epoch": 1.33 }, { "loss": 8.4373, "grad_norm": 0.8727751970291138, "learning_rate": 0.000556576529272775, "epoch": 1.33 }, { "loss": 8.2888, "grad_norm": 1.0075277090072632, "learning_rate": 0.0005557014089437298, "epoch": 1.33 }, { "loss": 8.465, "grad_norm": 0.9511576294898987, "learning_rate": 0.0005548262886146846, "epoch": 1.34 }, { "loss": 7.7129, "grad_norm": 0.9443394541740417, "learning_rate": 0.0005539511682856393, "epoch": 1.34 }, { "loss": 8.4521, "grad_norm": 0.9932364225387573, "learning_rate": 0.0005530760479565941, "epoch": 1.34 }, { "loss": 8.2593, "grad_norm": 0.8069454431533813, "learning_rate": 0.0005522009276275489, "epoch": 1.34 }, { "loss": 8.4721, "grad_norm": 1.1227058172225952, "learning_rate": 0.0005513258072985035, "epoch": 1.35 }, { "loss": 8.9954, "grad_norm": 0.8359375596046448, "learning_rate": 0.0005504506869694583, "epoch": 1.35 }, { "loss": 8.6039, "grad_norm": 1.1721514463424683, "learning_rate": 0.000549575566640413, "epoch": 1.35 }, { "loss": 7.8393, "grad_norm": 1.031473994255066, "learning_rate": 0.0005487004463113678, "epoch": 1.35 }, { "loss": 7.8643, "grad_norm": 0.935614287853241, "learning_rate": 0.0005478253259823226, "epoch": 1.36 }, { "loss": 8.4271, "grad_norm": 0.9366902709007263, "learning_rate": 0.0005469502056532773, "epoch": 1.36 }, { "loss": 8.3338, "grad_norm": 0.9616496562957764, "learning_rate": 0.0005460750853242321, "epoch": 1.36 }, { "loss": 8.1388, "grad_norm": 2.2672061920166016, "learning_rate": 0.0005451999649951868, "epoch": 1.36 }, { "loss": 8.879, "grad_norm": 1.948036789894104, "learning_rate": 0.0005443248446661416, "epoch": 1.37 }, { "loss": 8.8816, "grad_norm": 1.0832654237747192, "learning_rate": 0.0005434497243370964, "epoch": 1.37 }, { "loss": 8.5489, "grad_norm": 0.9174715876579285, "learning_rate": 0.0005425746040080511, "epoch": 1.37 }, { "loss": 8.8525, "grad_norm": 0.8547096252441406, "learning_rate": 0.0005416994836790059, "epoch": 1.38 }, { "loss": 8.6111, "grad_norm": 0.7524705529212952, "learning_rate": 0.0005408243633499607, "epoch": 1.38 }, { "loss": 8.0862, "grad_norm": 0.8433651328086853, "learning_rate": 0.0005399492430209154, "epoch": 1.38 }, { "loss": 8.2379, "grad_norm": 0.8744563460350037, "learning_rate": 0.0005390741226918702, "epoch": 1.38 }, { "loss": 8.2883, "grad_norm": 0.8806482553482056, "learning_rate": 0.0005381990023628249, "epoch": 1.39 }, { "loss": 8.6411, "grad_norm": 0.9276745319366455, "learning_rate": 0.0005373238820337797, "epoch": 1.39 }, { "loss": 8.7561, "grad_norm": 0.9556492567062378, "learning_rate": 0.0005364487617047345, "epoch": 1.39 }, { "loss": 9.305, "grad_norm": 0.8606293797492981, "learning_rate": 0.0005355736413756892, "epoch": 1.39 }, { "loss": 8.3839, "grad_norm": 1.108547329902649, "learning_rate": 0.000534698521046644, "epoch": 1.4 }, { "loss": 8.2164, "grad_norm": 0.9102107882499695, "learning_rate": 0.0005338234007175988, "epoch": 1.4 }, { "loss": 8.606, "grad_norm": 1.0984998941421509, "learning_rate": 0.0005329482803885535, "epoch": 1.4 }, { "loss": 8.0491, "grad_norm": 1.1762152910232544, "learning_rate": 0.0005320731600595083, "epoch": 1.4 }, { "loss": 8.7257, "grad_norm": 0.9669533371925354, "learning_rate": 0.000531198039730463, "epoch": 1.41 }, { "loss": 8.4473, "grad_norm": 1.0668437480926514, "learning_rate": 0.0005303229194014178, "epoch": 1.41 }, { "loss": 8.1594, "grad_norm": 0.8289794921875, "learning_rate": 0.0005294477990723725, "epoch": 1.41 }, { "loss": 8.9208, "grad_norm": 1.0676897764205933, "learning_rate": 0.0005285726787433272, "epoch": 1.41 }, { "loss": 8.0344, "grad_norm": 0.9914399981498718, "learning_rate": 0.000527697558414282, "epoch": 1.42 }, { "loss": 7.9721, "grad_norm": 0.7524304986000061, "learning_rate": 0.0005268224380852367, "epoch": 1.42 }, { "loss": 8.5322, "grad_norm": 0.9521943926811218, "learning_rate": 0.0005259473177561915, "epoch": 1.42 }, { "loss": 8.1986, "grad_norm": 0.9657976627349854, "learning_rate": 0.0005250721974271463, "epoch": 1.43 }, { "loss": 8.476, "grad_norm": 0.9338609576225281, "learning_rate": 0.000524197077098101, "epoch": 1.43 }, { "loss": 8.0189, "grad_norm": 0.8801831007003784, "learning_rate": 0.0005233219567690558, "epoch": 1.43 }, { "loss": 8.0839, "grad_norm": 0.8173283934593201, "learning_rate": 0.0005224468364400105, "epoch": 1.43 }, { "loss": 8.3716, "grad_norm": 0.8624017238616943, "learning_rate": 0.0005215717161109653, "epoch": 1.44 }, { "loss": 8.2837, "grad_norm": 0.8650451302528381, "learning_rate": 0.0005206965957819201, "epoch": 1.44 }, { "loss": 7.889, "grad_norm": 0.8268963098526001, "learning_rate": 0.0005198214754528747, "epoch": 1.44 }, { "loss": 8.7807, "grad_norm": 0.9244619607925415, "learning_rate": 0.0005189463551238295, "epoch": 1.44 }, { "loss": 8.5503, "grad_norm": 0.8533423542976379, "learning_rate": 0.0005180712347947842, "epoch": 1.45 }, { "loss": 7.7895, "grad_norm": 0.885784924030304, "learning_rate": 0.000517196114465739, "epoch": 1.45 }, { "loss": 8.9325, "grad_norm": 1.252569556236267, "learning_rate": 0.0005163209941366938, "epoch": 1.45 }, { "loss": 7.6823, "grad_norm": 0.9340423941612244, "learning_rate": 0.0005154458738076485, "epoch": 1.45 }, { "loss": 8.5812, "grad_norm": 1.1366244554519653, "learning_rate": 0.0005145707534786033, "epoch": 1.46 }, { "loss": 8.1907, "grad_norm": 0.6764490604400635, "learning_rate": 0.0005136956331495581, "epoch": 1.46 }, { "loss": 8.7694, "grad_norm": 0.7598670721054077, "learning_rate": 0.0005128205128205128, "epoch": 1.46 }, { "loss": 8.4732, "grad_norm": 1.1497093439102173, "learning_rate": 0.0005119453924914676, "epoch": 1.46 }, { "loss": 7.9224, "grad_norm": 0.8351478576660156, "learning_rate": 0.0005110702721624223, "epoch": 1.47 }, { "loss": 8.253, "grad_norm": 0.8981735706329346, "learning_rate": 0.0005101951518333771, "epoch": 1.47 }, { "loss": 8.442, "grad_norm": 0.910393238067627, "learning_rate": 0.0005093200315043319, "epoch": 1.47 }, { "loss": 8.4128, "grad_norm": 1.0419617891311646, "learning_rate": 0.0005084449111752865, "epoch": 1.48 }, { "loss": 8.5377, "grad_norm": 1.1774574518203735, "learning_rate": 0.0005075697908462413, "epoch": 1.48 }, { "loss": 8.1727, "grad_norm": 0.8679039478302002, "learning_rate": 0.000506694670517196, "epoch": 1.48 }, { "loss": 8.2085, "grad_norm": 0.8273195028305054, "learning_rate": 0.0005058195501881508, "epoch": 1.48 }, { "loss": 9.0157, "grad_norm": 1.0897700786590576, "learning_rate": 0.0005049444298591056, "epoch": 1.49 }, { "loss": 8.5794, "grad_norm": 1.19176185131073, "learning_rate": 0.0005040693095300603, "epoch": 1.49 }, { "loss": 8.4796, "grad_norm": 0.7944311499595642, "learning_rate": 0.0005031941892010151, "epoch": 1.49 }, { "loss": 8.2379, "grad_norm": 1.1032432317733765, "learning_rate": 0.0005023190688719699, "epoch": 1.49 }, { "loss": 7.8506, "grad_norm": 0.9756267070770264, "learning_rate": 0.0005014439485429246, "epoch": 1.5 }, { "loss": 8.4113, "grad_norm": 0.8557083010673523, "learning_rate": 0.0005005688282138794, "epoch": 1.5 }, { "loss": 8.3315, "grad_norm": 0.9195913672447205, "learning_rate": 0.0004996937078848341, "epoch": 1.5 }, { "loss": 8.3911, "grad_norm": 0.7430265545845032, "learning_rate": 0.0004988185875557889, "epoch": 1.5 }, { "loss": 8.3471, "grad_norm": 0.7685049176216125, "learning_rate": 0.0004979434672267437, "epoch": 1.51 }, { "loss": 8.252, "grad_norm": 0.9667441844940186, "learning_rate": 0.0004970683468976984, "epoch": 1.51 }, { "loss": 7.9134, "grad_norm": 0.878400981426239, "learning_rate": 0.0004961932265686532, "epoch": 1.51 }, { "loss": 8.337, "grad_norm": 0.8655962944030762, "learning_rate": 0.000495318106239608, "epoch": 1.51 }, { "loss": 8.2066, "grad_norm": 0.8063825964927673, "learning_rate": 0.0004944429859105627, "epoch": 1.52 }, { "loss": 8.4102, "grad_norm": 0.7918370962142944, "learning_rate": 0.0004935678655815175, "epoch": 1.52 }, { "loss": 8.1297, "grad_norm": 1.03073251247406, "learning_rate": 0.0004926927452524722, "epoch": 1.52 }, { "loss": 8.296, "grad_norm": 0.9369198679924011, "learning_rate": 0.000491817624923427, "epoch": 1.53 }, { "loss": 7.8051, "grad_norm": 0.9166183471679688, "learning_rate": 0.0004909425045943818, "epoch": 1.53 }, { "loss": 8.0258, "grad_norm": 0.8817450404167175, "learning_rate": 0.0004900673842653365, "epoch": 1.53 }, { "loss": 7.9202, "grad_norm": 1.0320311784744263, "learning_rate": 0.0004891922639362913, "epoch": 1.53 }, { "loss": 8.6314, "grad_norm": 0.9652658700942993, "learning_rate": 0.000488317143607246, "epoch": 1.54 }, { "loss": 8.5648, "grad_norm": 1.0785067081451416, "learning_rate": 0.00048744202327820075, "epoch": 1.54 }, { "loss": 7.528, "grad_norm": 1.0575002431869507, "learning_rate": 0.0004865669029491555, "epoch": 1.54 }, { "loss": 7.9019, "grad_norm": 0.8822360634803772, "learning_rate": 0.0004856917826201103, "epoch": 1.54 }, { "loss": 8.2544, "grad_norm": 0.7296998500823975, "learning_rate": 0.00048481666229106504, "epoch": 1.55 }, { "loss": 8.5853, "grad_norm": 0.925472617149353, "learning_rate": 0.0004839415419620198, "epoch": 1.55 }, { "loss": 8.3512, "grad_norm": 0.8641199469566345, "learning_rate": 0.00048306642163297456, "epoch": 1.55 }, { "loss": 8.0277, "grad_norm": 1.0501607656478882, "learning_rate": 0.0004821913013039293, "epoch": 1.55 }, { "loss": 8.0559, "grad_norm": 0.7827814221382141, "learning_rate": 0.00048131618097488403, "epoch": 1.56 }, { "loss": 8.0869, "grad_norm": 0.929253339767456, "learning_rate": 0.0004804410606458388, "epoch": 1.56 }, { "loss": 8.2206, "grad_norm": 0.9882745742797852, "learning_rate": 0.00047956594031679355, "epoch": 1.56 }, { "loss": 8.8141, "grad_norm": 0.874455988407135, "learning_rate": 0.0004786908199877483, "epoch": 1.56 }, { "loss": 8.403, "grad_norm": 1.1270105838775635, "learning_rate": 0.0004778156996587031, "epoch": 1.57 }, { "loss": 8.7545, "grad_norm": 0.7236598134040833, "learning_rate": 0.00047694057932965784, "epoch": 1.57 }, { "loss": 8.3653, "grad_norm": 0.8243849873542786, "learning_rate": 0.0004760654590006126, "epoch": 1.57 }, { "loss": 8.0057, "grad_norm": 0.9829972386360168, "learning_rate": 0.00047519033867156736, "epoch": 1.57 }, { "loss": 7.7738, "grad_norm": 1.1444923877716064, "learning_rate": 0.0004743152183425221, "epoch": 1.58 }, { "loss": 7.9619, "grad_norm": 1.1846139430999756, "learning_rate": 0.0004734400980134769, "epoch": 1.58 }, { "loss": 8.8667, "grad_norm": 0.9437428712844849, "learning_rate": 0.00047256497768443165, "epoch": 1.58 }, { "loss": 8.2367, "grad_norm": 0.8670662641525269, "learning_rate": 0.0004716898573553864, "epoch": 1.59 }, { "loss": 7.5306, "grad_norm": 0.823538064956665, "learning_rate": 0.00047081473702634117, "epoch": 1.59 }, { "loss": 8.0832, "grad_norm": 0.8938249349594116, "learning_rate": 0.0004699396166972959, "epoch": 1.59 }, { "loss": 7.7995, "grad_norm": 0.8147523999214172, "learning_rate": 0.00046906449636825064, "epoch": 1.59 }, { "loss": 8.2207, "grad_norm": 0.7885489463806152, "learning_rate": 0.0004681893760392054, "epoch": 1.6 }, { "loss": 8.3315, "grad_norm": 0.9256998300552368, "learning_rate": 0.00046731425571016016, "epoch": 1.6 }, { "loss": 7.8139, "grad_norm": 0.7331977486610413, "learning_rate": 0.0004664391353811149, "epoch": 1.6 }, { "loss": 8.2015, "grad_norm": 0.7677296996116638, "learning_rate": 0.0004655640150520697, "epoch": 1.6 }, { "loss": 8.114, "grad_norm": 1.066036343574524, "learning_rate": 0.00046468889472302445, "epoch": 1.61 }, { "loss": 8.3314, "grad_norm": 0.7969563603401184, "learning_rate": 0.0004638137743939792, "epoch": 1.61 }, { "loss": 8.4266, "grad_norm": 0.8454012274742126, "learning_rate": 0.000462938654064934, "epoch": 1.61 }, { "loss": 8.0451, "grad_norm": 1.049949288368225, "learning_rate": 0.00046206353373588874, "epoch": 1.61 }, { "loss": 7.8993, "grad_norm": 0.8960159420967102, "learning_rate": 0.0004611884134068435, "epoch": 1.62 }, { "loss": 8.4117, "grad_norm": 1.0029221773147583, "learning_rate": 0.00046031329307779826, "epoch": 1.62 }, { "loss": 7.9899, "grad_norm": 1.0616450309753418, "learning_rate": 0.0004594381727487529, "epoch": 1.62 }, { "loss": 7.9134, "grad_norm": 0.8082576990127563, "learning_rate": 0.0004585630524197077, "epoch": 1.62 }, { "loss": 8.1685, "grad_norm": 0.9529896974563599, "learning_rate": 0.00045768793209066244, "epoch": 1.63 }, { "loss": 8.7919, "grad_norm": 0.7967125773429871, "learning_rate": 0.0004568128117616172, "epoch": 1.63 }, { "loss": 8.4375, "grad_norm": 0.8775154948234558, "learning_rate": 0.00045593769143257196, "epoch": 1.63 }, { "loss": 8.559, "grad_norm": 0.782707929611206, "learning_rate": 0.0004550625711035267, "epoch": 1.64 }, { "loss": 8.4288, "grad_norm": 0.7907795310020447, "learning_rate": 0.0004541874507744815, "epoch": 1.64 }, { "loss": 8.5237, "grad_norm": 1.0685423612594604, "learning_rate": 0.00045331233044543625, "epoch": 1.64 }, { "loss": 8.4464, "grad_norm": 1.1534669399261475, "learning_rate": 0.000452437210116391, "epoch": 1.64 }, { "loss": 7.8577, "grad_norm": 0.7411785125732422, "learning_rate": 0.00045156208978734577, "epoch": 1.65 }, { "loss": 7.8189, "grad_norm": 0.87079256772995, "learning_rate": 0.00045068696945830053, "epoch": 1.65 }, { "loss": 8.1193, "grad_norm": 0.9850463271141052, "learning_rate": 0.0004498118491292553, "epoch": 1.65 }, { "loss": 7.9457, "grad_norm": 0.8739660978317261, "learning_rate": 0.00044893672880021, "epoch": 1.65 }, { "loss": 7.728, "grad_norm": 0.8551336526870728, "learning_rate": 0.00044806160847116476, "epoch": 1.66 }, { "loss": 8.8456, "grad_norm": 0.8609566688537598, "learning_rate": 0.0004471864881421195, "epoch": 1.66 }, { "loss": 8.0812, "grad_norm": 0.7449157238006592, "learning_rate": 0.0004463113678130743, "epoch": 1.66 }, { "loss": 8.0729, "grad_norm": 0.8253002762794495, "learning_rate": 0.00044543624748402905, "epoch": 1.66 }, { "loss": 8.4942, "grad_norm": 0.8349846601486206, "learning_rate": 0.0004445611271549838, "epoch": 1.67 }, { "loss": 8.3446, "grad_norm": 0.9881287813186646, "learning_rate": 0.00044368600682593857, "epoch": 1.67 }, { "loss": 7.3313, "grad_norm": 0.863059401512146, "learning_rate": 0.00044281088649689333, "epoch": 1.67 }, { "loss": 8.4412, "grad_norm": 0.9246751666069031, "learning_rate": 0.0004419357661678481, "epoch": 1.67 }, { "loss": 8.4511, "grad_norm": 0.7963143587112427, "learning_rate": 0.00044106064583880286, "epoch": 1.68 }, { "loss": 7.8743, "grad_norm": 1.0088573694229126, "learning_rate": 0.0004401855255097576, "epoch": 1.68 }, { "loss": 8.0994, "grad_norm": 0.7680083513259888, "learning_rate": 0.0004393104051807124, "epoch": 1.68 }, { "loss": 7.8712, "grad_norm": 0.8324389457702637, "learning_rate": 0.00043843528485166714, "epoch": 1.69 }, { "loss": 7.8454, "grad_norm": 0.9649554491043091, "learning_rate": 0.00043756016452262185, "epoch": 1.69 }, { "loss": 7.925, "grad_norm": 0.7881239652633667, "learning_rate": 0.0004366850441935766, "epoch": 1.69 }, { "loss": 7.9826, "grad_norm": 1.2129865884780884, "learning_rate": 0.0004358099238645314, "epoch": 1.69 }, { "loss": 8.3911, "grad_norm": 0.7000688910484314, "learning_rate": 0.00043493480353548614, "epoch": 1.7 }, { "loss": 7.9635, "grad_norm": 0.7449495196342468, "learning_rate": 0.0004340596832064409, "epoch": 1.7 }, { "loss": 7.8492, "grad_norm": 0.7399414777755737, "learning_rate": 0.00043318456287739566, "epoch": 1.7 }, { "loss": 8.5288, "grad_norm": 1.0965951681137085, "learning_rate": 0.0004323094425483504, "epoch": 1.7 }, { "loss": 8.0104, "grad_norm": 0.8990981578826904, "learning_rate": 0.0004314343222193052, "epoch": 1.71 }, { "loss": 7.8636, "grad_norm": 0.8695485591888428, "learning_rate": 0.00043055920189025994, "epoch": 1.71 }, { "loss": 7.9194, "grad_norm": 0.7813265919685364, "learning_rate": 0.0004296840815612147, "epoch": 1.71 }, { "loss": 8.4535, "grad_norm": 0.7645956873893738, "learning_rate": 0.00042880896123216947, "epoch": 1.71 }, { "loss": 7.8434, "grad_norm": 1.0397326946258545, "learning_rate": 0.00042793384090312423, "epoch": 1.72 }, { "loss": 7.8072, "grad_norm": 0.9630481004714966, "learning_rate": 0.00042705872057407894, "epoch": 1.72 }, { "loss": 8.327, "grad_norm": 0.7939698696136475, "learning_rate": 0.0004261836002450337, "epoch": 1.72 }, { "loss": 8.2467, "grad_norm": 1.0103453397750854, "learning_rate": 0.00042530847991598846, "epoch": 1.72 }, { "loss": 7.63, "grad_norm": 0.9281976819038391, "learning_rate": 0.0004244333595869432, "epoch": 1.73 }, { "loss": 7.7603, "grad_norm": 0.7895064949989319, "learning_rate": 0.000423558239257898, "epoch": 1.73 }, { "loss": 7.6725, "grad_norm": 0.7491249442100525, "learning_rate": 0.00042268311892885275, "epoch": 1.73 }, { "loss": 8.0813, "grad_norm": 0.7357456088066101, "learning_rate": 0.0004218079985998075, "epoch": 1.74 }, { "loss": 8.1603, "grad_norm": 0.8232001066207886, "learning_rate": 0.00042093287827076227, "epoch": 1.74 }, { "loss": 8.172, "grad_norm": 0.7846309542655945, "learning_rate": 0.00042005775794171703, "epoch": 1.74 }, { "loss": 8.2372, "grad_norm": 0.9100042581558228, "learning_rate": 0.00041918263761267174, "epoch": 1.74 }, { "loss": 7.8489, "grad_norm": 0.9496660828590393, "learning_rate": 0.0004183075172836265, "epoch": 1.75 }, { "loss": 7.7246, "grad_norm": 0.7061757445335388, "learning_rate": 0.00041743239695458126, "epoch": 1.75 }, { "loss": 7.988, "grad_norm": 0.9927607774734497, "learning_rate": 0.00041655727662553597, "epoch": 1.75 }, { "loss": 7.9562, "grad_norm": 0.8585007190704346, "learning_rate": 0.00041568215629649073, "epoch": 1.75 }, { "loss": 8.1105, "grad_norm": 1.0176628828048706, "learning_rate": 0.0004148070359674455, "epoch": 1.76 }, { "loss": 7.7869, "grad_norm": 0.8576889038085938, "learning_rate": 0.00041393191563840026, "epoch": 1.76 }, { "loss": 7.7945, "grad_norm": 0.8359828591346741, "learning_rate": 0.000413056795309355, "epoch": 1.76 }, { "loss": 7.9683, "grad_norm": 0.8636084794998169, "learning_rate": 0.0004121816749803098, "epoch": 1.76 }, { "loss": 8.3303, "grad_norm": 0.9006314873695374, "learning_rate": 0.00041130655465126454, "epoch": 1.77 }, { "loss": 8.1457, "grad_norm": 1.217007040977478, "learning_rate": 0.0004104314343222193, "epoch": 1.77 }, { "loss": 8.6171, "grad_norm": 1.0577572584152222, "learning_rate": 0.00040955631399317407, "epoch": 1.77 }, { "loss": 7.9349, "grad_norm": 0.9530831575393677, "learning_rate": 0.00040868119366412883, "epoch": 1.77 }, { "loss": 8.2722, "grad_norm": 0.9652631282806396, "learning_rate": 0.0004078060733350836, "epoch": 1.78 }, { "loss": 8.185, "grad_norm": 0.7349383234977722, "learning_rate": 0.00040693095300603835, "epoch": 1.78 }, { "loss": 7.3944, "grad_norm": 1.122018814086914, "learning_rate": 0.0004060558326769931, "epoch": 1.78 }, { "loss": 7.8828, "grad_norm": 0.96207195520401, "learning_rate": 0.0004051807123479478, "epoch": 1.78 }, { "loss": 8.1287, "grad_norm": 0.833884060382843, "learning_rate": 0.0004043055920189026, "epoch": 1.79 }, { "loss": 8.0382, "grad_norm": 0.9089711904525757, "learning_rate": 0.00040343047168985734, "epoch": 1.79 }, { "loss": 8.1137, "grad_norm": 0.6977031230926514, "learning_rate": 0.0004025553513608121, "epoch": 1.79 }, { "loss": 7.9215, "grad_norm": 0.9814949631690979, "learning_rate": 0.00040168023103176687, "epoch": 1.8 }, { "loss": 8.2266, "grad_norm": 0.9767114520072937, "learning_rate": 0.00040080511070272163, "epoch": 1.8 }, { "loss": 8.3445, "grad_norm": 1.1093454360961914, "learning_rate": 0.0003999299903736764, "epoch": 1.8 }, { "loss": 8.4239, "grad_norm": 0.93362957239151, "learning_rate": 0.00039905487004463115, "epoch": 1.8 }, { "loss": 8.2468, "grad_norm": 0.9497604370117188, "learning_rate": 0.0003981797497155859, "epoch": 1.81 }, { "loss": 7.8793, "grad_norm": 0.8992236852645874, "learning_rate": 0.0003973046293865407, "epoch": 1.81 }, { "loss": 7.8246, "grad_norm": 0.9486469030380249, "learning_rate": 0.00039642950905749544, "epoch": 1.81 }, { "loss": 8.6243, "grad_norm": 0.970136284828186, "learning_rate": 0.0003955543887284502, "epoch": 1.81 }, { "loss": 7.8859, "grad_norm": 1.0090283155441284, "learning_rate": 0.0003946792683994049, "epoch": 1.82 }, { "loss": 8.156, "grad_norm": 0.9662021994590759, "learning_rate": 0.00039380414807035967, "epoch": 1.82 }, { "loss": 7.7991, "grad_norm": 0.8005274534225464, "learning_rate": 0.00039292902774131443, "epoch": 1.82 }, { "loss": 7.8432, "grad_norm": 0.8537503480911255, "learning_rate": 0.0003920539074122692, "epoch": 1.82 }, { "loss": 7.7118, "grad_norm": 0.8975428342819214, "learning_rate": 0.00039117878708322396, "epoch": 1.83 }, { "loss": 8.0563, "grad_norm": 0.9040714502334595, "learning_rate": 0.0003903036667541787, "epoch": 1.83 }, { "loss": 8.005, "grad_norm": 0.882514476776123, "learning_rate": 0.0003894285464251335, "epoch": 1.83 }, { "loss": 7.99, "grad_norm": 0.9527498483657837, "learning_rate": 0.00038855342609608824, "epoch": 1.83 }, { "loss": 7.9497, "grad_norm": 0.7327905893325806, "learning_rate": 0.000387678305767043, "epoch": 1.84 }, { "loss": 8.1346, "grad_norm": 0.9137473106384277, "learning_rate": 0.00038680318543799776, "epoch": 1.84 }, { "loss": 7.266, "grad_norm": 0.8273423910140991, "learning_rate": 0.0003859280651089525, "epoch": 1.84 }, { "loss": 7.525, "grad_norm": 1.2288787364959717, "learning_rate": 0.0003850529447799073, "epoch": 1.85 }, { "loss": 8.5105, "grad_norm": 0.7940724492073059, "learning_rate": 0.00038417782445086205, "epoch": 1.85 }, { "loss": 8.0599, "grad_norm": 0.9253759384155273, "learning_rate": 0.00038330270412181676, "epoch": 1.85 }, { "loss": 7.1757, "grad_norm": 0.8145419359207153, "learning_rate": 0.0003824275837927715, "epoch": 1.85 }, { "loss": 7.6177, "grad_norm": 1.1738182306289673, "learning_rate": 0.0003815524634637263, "epoch": 1.86 }, { "loss": 7.6901, "grad_norm": 0.9141517877578735, "learning_rate": 0.00038067734313468104, "epoch": 1.86 }, { "loss": 7.7036, "grad_norm": 1.0994611978530884, "learning_rate": 0.00037980222280563575, "epoch": 1.86 }, { "loss": 7.9458, "grad_norm": 0.8445936441421509, "learning_rate": 0.0003789271024765905, "epoch": 1.86 }, { "loss": 7.6019, "grad_norm": 0.8796238899230957, "learning_rate": 0.0003780519821475453, "epoch": 1.87 }, { "loss": 7.7582, "grad_norm": 0.7801417112350464, "learning_rate": 0.00037717686181850004, "epoch": 1.87 }, { "loss": 7.8483, "grad_norm": 1.008893609046936, "learning_rate": 0.0003763017414894548, "epoch": 1.87 }, { "loss": 8.047, "grad_norm": 0.8021620512008667, "learning_rate": 0.00037542662116040956, "epoch": 1.87 }, { "loss": 8.2537, "grad_norm": 0.919774055480957, "learning_rate": 0.0003745515008313643, "epoch": 1.88 }, { "loss": 8.1101, "grad_norm": 1.094642996788025, "learning_rate": 0.0003736763805023191, "epoch": 1.88 }, { "loss": 7.9119, "grad_norm": 1.0133185386657715, "learning_rate": 0.0003728012601732738, "epoch": 1.88 }, { "loss": 7.9624, "grad_norm": 0.7546307444572449, "learning_rate": 0.00037192613984422855, "epoch": 1.88 }, { "loss": 7.9547, "grad_norm": 0.7390889525413513, "learning_rate": 0.0003710510195151833, "epoch": 1.89 }, { "loss": 7.7794, "grad_norm": 0.9140797257423401, "learning_rate": 0.0003701758991861381, "epoch": 1.89 }, { "loss": 8.0254, "grad_norm": 0.8325345516204834, "learning_rate": 0.00036930077885709284, "epoch": 1.89 }, { "loss": 7.7692, "grad_norm": 1.228366732597351, "learning_rate": 0.0003684256585280476, "epoch": 1.9 }, { "loss": 7.2768, "grad_norm": 1.0541235208511353, "learning_rate": 0.00036755053819900236, "epoch": 1.9 }, { "loss": 8.1104, "grad_norm": 1.0765891075134277, "learning_rate": 0.0003666754178699571, "epoch": 1.9 }, { "loss": 7.5317, "grad_norm": 0.9508135914802551, "learning_rate": 0.0003658002975409119, "epoch": 1.9 }, { "loss": 7.1908, "grad_norm": 0.7984021306037903, "learning_rate": 0.00036492517721186665, "epoch": 1.91 }, { "loss": 7.8423, "grad_norm": 1.0381263494491577, "learning_rate": 0.0003640500568828214, "epoch": 1.91 }, { "loss": 8.297, "grad_norm": 0.9509484171867371, "learning_rate": 0.00036317493655377617, "epoch": 1.91 }, { "loss": 7.7339, "grad_norm": 0.8926167488098145, "learning_rate": 0.0003622998162247309, "epoch": 1.91 }, { "loss": 7.9, "grad_norm": 1.0550678968429565, "learning_rate": 0.00036142469589568564, "epoch": 1.92 }, { "loss": 7.6175, "grad_norm": 0.9359092712402344, "learning_rate": 0.0003605495755666404, "epoch": 1.92 }, { "loss": 8.0818, "grad_norm": 0.735281765460968, "learning_rate": 0.00035967445523759516, "epoch": 1.92 }, { "loss": 8.1061, "grad_norm": 0.8289329409599304, "learning_rate": 0.0003587993349085499, "epoch": 1.92 }, { "loss": 7.3778, "grad_norm": 0.7723102569580078, "learning_rate": 0.0003579242145795047, "epoch": 1.93 }, { "loss": 7.853, "grad_norm": 0.7856701612472534, "learning_rate": 0.00035704909425045945, "epoch": 1.93 }, { "loss": 8.5133, "grad_norm": 0.7649736404418945, "learning_rate": 0.0003561739739214142, "epoch": 1.93 }, { "loss": 8.4676, "grad_norm": 0.6755172610282898, "learning_rate": 0.000355298853592369, "epoch": 1.93 }, { "loss": 8.2074, "grad_norm": 0.8537729382514954, "learning_rate": 0.00035442373326332374, "epoch": 1.94 }, { "loss": 8.249, "grad_norm": 0.9827852845191956, "learning_rate": 0.0003535486129342785, "epoch": 1.94 }, { "loss": 8.4107, "grad_norm": 1.2670233249664307, "learning_rate": 0.00035267349260523326, "epoch": 1.94 }, { "loss": 8.1578, "grad_norm": 0.8494543433189392, "learning_rate": 0.000351798372276188, "epoch": 1.95 }, { "loss": 7.9296, "grad_norm": 0.8582159876823425, "learning_rate": 0.00035092325194714273, "epoch": 1.95 }, { "loss": 7.4592, "grad_norm": 0.8539626598358154, "learning_rate": 0.0003500481316180975, "epoch": 1.95 }, { "loss": 8.1603, "grad_norm": 0.9004923701286316, "learning_rate": 0.00034917301128905225, "epoch": 1.95 }, { "loss": 8.1319, "grad_norm": 0.722870945930481, "learning_rate": 0.000348297890960007, "epoch": 1.96 }, { "loss": 7.791, "grad_norm": 0.9422692656517029, "learning_rate": 0.0003474227706309618, "epoch": 1.96 }, { "loss": 8.0631, "grad_norm": 1.2248715162277222, "learning_rate": 0.00034654765030191654, "epoch": 1.96 }, { "loss": 8.3269, "grad_norm": 1.370082974433899, "learning_rate": 0.0003456725299728713, "epoch": 1.96 }, { "loss": 7.7562, "grad_norm": 1.0009835958480835, "learning_rate": 0.00034479740964382606, "epoch": 1.97 }, { "loss": 7.4909, "grad_norm": 0.9207608103752136, "learning_rate": 0.0003439222893147808, "epoch": 1.97 }, { "loss": 7.2907, "grad_norm": 1.0351985692977905, "learning_rate": 0.0003430471689857356, "epoch": 1.97 }, { "loss": 7.9972, "grad_norm": 0.9398946762084961, "learning_rate": 0.00034217204865669035, "epoch": 1.97 }, { "loss": 7.6034, "grad_norm": 0.8558303713798523, "learning_rate": 0.0003412969283276451, "epoch": 1.98 }, { "loss": 8.3452, "grad_norm": 0.8279830813407898, "learning_rate": 0.0003404218079985998, "epoch": 1.98 }, { "loss": 8.3979, "grad_norm": 0.7496762275695801, "learning_rate": 0.0003395466876695545, "epoch": 1.98 }, { "loss": 7.5979, "grad_norm": 0.865039587020874, "learning_rate": 0.0003386715673405093, "epoch": 1.98 }, { "loss": 7.7027, "grad_norm": 0.7518277764320374, "learning_rate": 0.00033779644701146405, "epoch": 1.99 }, { "loss": 7.8756, "grad_norm": 0.8984577059745789, "learning_rate": 0.0003369213266824188, "epoch": 1.99 }, { "loss": 7.4597, "grad_norm": 0.7312489151954651, "learning_rate": 0.00033604620635337357, "epoch": 1.99 }, { "loss": 7.8173, "grad_norm": 0.8688482046127319, "learning_rate": 0.00033517108602432833, "epoch": 1.99 }, { "loss": 7.6772, "grad_norm": 0.9117947816848755, "learning_rate": 0.0003342959656952831, "epoch": 2.0 }, { "loss": 7.65, "grad_norm": 1.044518232345581, "learning_rate": 0.00033342084536623786, "epoch": 2.0 }, { "loss": 7.6424, "grad_norm": 0.8763852119445801, "learning_rate": 0.0003325457250371926, "epoch": 2.0 }, { "loss": 8.1303, "grad_norm": 1.2922908067703247, "learning_rate": 0.0003316706047081474, "epoch": 2.01 }, { "loss": 8.3256, "grad_norm": 0.7980864644050598, "learning_rate": 0.00033079548437910214, "epoch": 2.01 }, { "loss": 7.7353, "grad_norm": 0.8062283396720886, "learning_rate": 0.00032992036405005685, "epoch": 2.01 }, { "loss": 8.2314, "grad_norm": 0.9204174280166626, "learning_rate": 0.0003290452437210116, "epoch": 2.01 }, { "loss": 7.5946, "grad_norm": 0.7235244512557983, "learning_rate": 0.0003281701233919664, "epoch": 2.02 }, { "loss": 7.4673, "grad_norm": 0.8126214146614075, "learning_rate": 0.00032729500306292114, "epoch": 2.02 }, { "loss": 7.6391, "grad_norm": 0.7648585438728333, "learning_rate": 0.0003264198827338759, "epoch": 2.02 }, { "loss": 8.005, "grad_norm": 0.7453392148017883, "learning_rate": 0.00032554476240483066, "epoch": 2.02 }, { "loss": 7.8703, "grad_norm": 0.8830775022506714, "learning_rate": 0.0003246696420757854, "epoch": 2.03 }, { "loss": 7.8639, "grad_norm": 1.2337687015533447, "learning_rate": 0.0003237945217467402, "epoch": 2.03 }, { "loss": 7.8224, "grad_norm": 1.0393247604370117, "learning_rate": 0.00032291940141769494, "epoch": 2.03 }, { "loss": 7.7573, "grad_norm": 0.7463309168815613, "learning_rate": 0.0003220442810886497, "epoch": 2.03 }, { "loss": 8.3318, "grad_norm": 0.8722276091575623, "learning_rate": 0.00032116916075960447, "epoch": 2.04 }, { "loss": 8.0517, "grad_norm": 0.9069348573684692, "learning_rate": 0.00032029404043055923, "epoch": 2.04 }, { "loss": 7.9696, "grad_norm": 0.7715663909912109, "learning_rate": 0.000319418920101514, "epoch": 2.04 }, { "loss": 7.7113, "grad_norm": 0.8788508176803589, "learning_rate": 0.0003185437997724687, "epoch": 2.04 }, { "loss": 7.5771, "grad_norm": 1.057786226272583, "learning_rate": 0.00031766867944342346, "epoch": 2.05 }, { "loss": 7.985, "grad_norm": 1.2888935804367065, "learning_rate": 0.0003167935591143782, "epoch": 2.05 }, { "loss": 7.5748, "grad_norm": 0.8100298047065735, "learning_rate": 0.000315918438785333, "epoch": 2.05 }, { "loss": 7.7785, "grad_norm": 0.9130757451057434, "learning_rate": 0.00031504331845628775, "epoch": 2.06 }, { "loss": 7.3718, "grad_norm": 0.895447313785553, "learning_rate": 0.0003141681981272425, "epoch": 2.06 }, { "loss": 8.0138, "grad_norm": 0.8260514736175537, "learning_rate": 0.00031329307779819727, "epoch": 2.06 }, { "loss": 7.6438, "grad_norm": 0.9353188276290894, "learning_rate": 0.00031241795746915203, "epoch": 2.06 }, { "loss": 7.9212, "grad_norm": 0.8095923066139221, "learning_rate": 0.0003115428371401068, "epoch": 2.07 }, { "loss": 8.2193, "grad_norm": 0.8156134486198425, "learning_rate": 0.00031066771681106156, "epoch": 2.07 }, { "loss": 7.6264, "grad_norm": 0.9613614082336426, "learning_rate": 0.0003097925964820163, "epoch": 2.07 }, { "loss": 7.6684, "grad_norm": 0.8426281809806824, "learning_rate": 0.0003089174761529711, "epoch": 2.07 }, { "loss": 7.7356, "grad_norm": 0.8271446824073792, "learning_rate": 0.0003080423558239258, "epoch": 2.08 }, { "loss": 7.8816, "grad_norm": 0.9108027219772339, "learning_rate": 0.00030716723549488055, "epoch": 2.08 }, { "loss": 8.5754, "grad_norm": 0.8285607099533081, "learning_rate": 0.0003062921151658353, "epoch": 2.08 }, { "loss": 7.8875, "grad_norm": 0.79032963514328, "learning_rate": 0.00030541699483679007, "epoch": 2.08 }, { "loss": 7.4168, "grad_norm": 0.8623600602149963, "learning_rate": 0.00030454187450774483, "epoch": 2.09 }, { "loss": 7.546, "grad_norm": 0.8102550506591797, "learning_rate": 0.0003036667541786996, "epoch": 2.09 }, { "loss": 7.9269, "grad_norm": 1.0298386812210083, "learning_rate": 0.00030279163384965436, "epoch": 2.09 }, { "loss": 7.6682, "grad_norm": 0.8902001976966858, "learning_rate": 0.0003019165135206091, "epoch": 2.09 }, { "loss": 8.0309, "grad_norm": 0.831743597984314, "learning_rate": 0.0003010413931915639, "epoch": 2.1 }, { "loss": 8.157, "grad_norm": 0.8056457042694092, "learning_rate": 0.00030016627286251864, "epoch": 2.1 }, { "loss": 7.6514, "grad_norm": 1.071753978729248, "learning_rate": 0.00029929115253347335, "epoch": 2.1 }, { "loss": 7.8337, "grad_norm": 0.8061104416847229, "learning_rate": 0.0002984160322044281, "epoch": 2.11 }, { "loss": 7.8925, "grad_norm": 1.1958301067352295, "learning_rate": 0.0002975409118753828, "epoch": 2.11 }, { "loss": 6.9557, "grad_norm": 0.7460314631462097, "learning_rate": 0.0002966657915463376, "epoch": 2.11 }, { "loss": 7.724, "grad_norm": 0.8949922323226929, "learning_rate": 0.00029579067121729234, "epoch": 2.11 }, { "loss": 8.1209, "grad_norm": 0.7350090146064758, "learning_rate": 0.0002949155508882471, "epoch": 2.12 }, { "loss": 7.7897, "grad_norm": 0.9530614018440247, "learning_rate": 0.00029404043055920187, "epoch": 2.12 }, { "loss": 7.7916, "grad_norm": 0.7030171155929565, "learning_rate": 0.00029316531023015663, "epoch": 2.12 }, { "loss": 7.53, "grad_norm": 0.8843898177146912, "learning_rate": 0.0002922901899011114, "epoch": 2.12 }, { "loss": 7.5228, "grad_norm": 0.9127951860427856, "learning_rate": 0.00029141506957206615, "epoch": 2.13 }, { "loss": 7.423, "grad_norm": 0.7194523811340332, "learning_rate": 0.0002905399492430209, "epoch": 2.13 }, { "loss": 8.3464, "grad_norm": 0.8251200318336487, "learning_rate": 0.0002896648289139757, "epoch": 2.13 }, { "loss": 7.8906, "grad_norm": 0.9383019804954529, "learning_rate": 0.00028878970858493044, "epoch": 2.13 }, { "loss": 6.9917, "grad_norm": 1.1721993684768677, "learning_rate": 0.0002879145882558852, "epoch": 2.14 }, { "loss": 7.7154, "grad_norm": 0.7905781865119934, "learning_rate": 0.00028703946792683996, "epoch": 2.14 }, { "loss": 7.9272, "grad_norm": 0.9261153936386108, "learning_rate": 0.00028616434759779467, "epoch": 2.14 }, { "loss": 7.9141, "grad_norm": 1.206111192703247, "learning_rate": 0.00028528922726874943, "epoch": 2.14 }, { "loss": 7.9561, "grad_norm": 0.8015759587287903, "learning_rate": 0.0002844141069397042, "epoch": 2.15 }, { "loss": 7.6844, "grad_norm": 0.970389723777771, "learning_rate": 0.00028353898661065896, "epoch": 2.15 }, { "loss": 7.7312, "grad_norm": 1.3079341650009155, "learning_rate": 0.0002826638662816137, "epoch": 2.15 }, { "loss": 7.506, "grad_norm": 0.8393199443817139, "learning_rate": 0.0002817887459525685, "epoch": 2.16 }, { "loss": 7.3006, "grad_norm": 0.9169728755950928, "learning_rate": 0.00028091362562352324, "epoch": 2.16 }, { "loss": 7.5924, "grad_norm": 0.8766190409660339, "learning_rate": 0.000280038505294478, "epoch": 2.16 }, { "loss": 8.2074, "grad_norm": 0.8473224639892578, "learning_rate": 0.00027916338496543277, "epoch": 2.16 }, { "loss": 7.2028, "grad_norm": 0.9415881037712097, "learning_rate": 0.0002782882646363875, "epoch": 2.17 }, { "loss": 7.87, "grad_norm": 0.8043491840362549, "learning_rate": 0.0002774131443073423, "epoch": 2.17 }, { "loss": 8.5354, "grad_norm": 0.9696796536445618, "learning_rate": 0.00027653802397829705, "epoch": 2.17 }, { "loss": 8.1185, "grad_norm": 0.9294397830963135, "learning_rate": 0.00027566290364925176, "epoch": 2.17 }, { "loss": 7.8844, "grad_norm": 1.0350419282913208, "learning_rate": 0.0002747877833202065, "epoch": 2.18 }, { "loss": 7.9054, "grad_norm": 1.086616039276123, "learning_rate": 0.0002739126629911613, "epoch": 2.18 }, { "loss": 7.4362, "grad_norm": 0.865028440952301, "learning_rate": 0.00027303754266211604, "epoch": 2.18 }, { "loss": 7.4039, "grad_norm": 0.8574273586273193, "learning_rate": 0.0002721624223330708, "epoch": 2.18 }, { "loss": 8.0095, "grad_norm": 1.0509589910507202, "learning_rate": 0.00027128730200402557, "epoch": 2.19 }, { "loss": 7.6467, "grad_norm": 0.7813432812690735, "learning_rate": 0.00027041218167498033, "epoch": 2.19 }, { "loss": 7.4786, "grad_norm": 0.855741560459137, "learning_rate": 0.0002695370613459351, "epoch": 2.19 }, { "loss": 7.7862, "grad_norm": 0.8451842069625854, "learning_rate": 0.00026866194101688985, "epoch": 2.19 }, { "loss": 7.7616, "grad_norm": 0.882211446762085, "learning_rate": 0.0002677868206878446, "epoch": 2.2 }, { "loss": 8.1508, "grad_norm": 0.7093100547790527, "learning_rate": 0.0002669117003587994, "epoch": 2.2 }, { "loss": 7.8715, "grad_norm": 0.9282416701316833, "learning_rate": 0.00026603658002975414, "epoch": 2.2 }, { "loss": 7.6333, "grad_norm": 0.8849425911903381, "learning_rate": 0.0002651614597007089, "epoch": 2.2 }, { "loss": 7.624, "grad_norm": 0.8789107203483582, "learning_rate": 0.0002642863393716636, "epoch": 2.21 }, { "loss": 7.5042, "grad_norm": 0.9759025573730469, "learning_rate": 0.00026341121904261837, "epoch": 2.21 }, { "loss": 7.7317, "grad_norm": 0.794627845287323, "learning_rate": 0.00026253609871357313, "epoch": 2.21 }, { "loss": 7.4743, "grad_norm": 1.3992342948913574, "learning_rate": 0.0002616609783845279, "epoch": 2.22 }, { "loss": 7.5986, "grad_norm": 0.8934722542762756, "learning_rate": 0.00026078585805548266, "epoch": 2.22 }, { "loss": 7.7515, "grad_norm": 1.0474205017089844, "learning_rate": 0.00025991073772643736, "epoch": 2.22 }, { "loss": 7.0749, "grad_norm": 0.7677063345909119, "learning_rate": 0.0002590356173973921, "epoch": 2.22 }, { "loss": 7.7033, "grad_norm": 0.8318948149681091, "learning_rate": 0.0002581604970683469, "epoch": 2.23 }, { "loss": 7.775, "grad_norm": 0.7674381136894226, "learning_rate": 0.00025728537673930165, "epoch": 2.23 }, { "loss": 7.5289, "grad_norm": 1.0669969320297241, "learning_rate": 0.0002564102564102564, "epoch": 2.23 }, { "loss": 7.3784, "grad_norm": 1.0004348754882812, "learning_rate": 0.00025553513608121117, "epoch": 2.23 }, { "loss": 7.4305, "grad_norm": 0.7937709093093872, "learning_rate": 0.00025466001575216593, "epoch": 2.24 }, { "loss": 7.1845, "grad_norm": 0.9088554382324219, "learning_rate": 0.00025378489542312064, "epoch": 2.24 }, { "loss": 7.9313, "grad_norm": 1.0221823453903198, "learning_rate": 0.0002529097750940754, "epoch": 2.24 }, { "loss": 7.4251, "grad_norm": 0.7980064153671265, "learning_rate": 0.00025203465476503016, "epoch": 2.24 }, { "loss": 8.0494, "grad_norm": 0.8470319509506226, "learning_rate": 0.0002511595344359849, "epoch": 2.25 }, { "loss": 7.7765, "grad_norm": 1.101785659790039, "learning_rate": 0.0002502844141069397, "epoch": 2.25 }, { "loss": 7.8624, "grad_norm": 0.8655755519866943, "learning_rate": 0.00024940929377789445, "epoch": 2.25 }, { "loss": 7.6855, "grad_norm": 1.0447689294815063, "learning_rate": 0.0002485341734488492, "epoch": 2.25 }, { "loss": 7.7653, "grad_norm": 0.9611648917198181, "learning_rate": 0.000247659053119804, "epoch": 2.26 }, { "loss": 8.0705, "grad_norm": 1.410849928855896, "learning_rate": 0.00024678393279075874, "epoch": 2.26 }, { "loss": 7.8147, "grad_norm": 0.9252009987831116, "learning_rate": 0.0002459088124617135, "epoch": 2.26 }, { "loss": 7.9366, "grad_norm": 0.899348258972168, "learning_rate": 0.00024503369213266826, "epoch": 2.27 }, { "loss": 8.089, "grad_norm": 0.7920341491699219, "learning_rate": 0.000244158571803623, "epoch": 2.27 }, { "loss": 7.5066, "grad_norm": 0.8289885520935059, "learning_rate": 0.00024328345147457776, "epoch": 2.27 }, { "loss": 7.4402, "grad_norm": 0.9304541349411011, "learning_rate": 0.00024240833114553252, "epoch": 2.27 }, { "loss": 8.1004, "grad_norm": 0.8798967003822327, "learning_rate": 0.00024153321081648728, "epoch": 2.28 }, { "loss": 7.8528, "grad_norm": 0.9733609557151794, "learning_rate": 0.00024065809048744201, "epoch": 2.28 }, { "loss": 7.1178, "grad_norm": 1.1248620748519897, "learning_rate": 0.00023978297015839678, "epoch": 2.28 }, { "loss": 7.7862, "grad_norm": 1.2658095359802246, "learning_rate": 0.00023890784982935154, "epoch": 2.28 }, { "loss": 7.9395, "grad_norm": 1.0820565223693848, "learning_rate": 0.0002380327295003063, "epoch": 2.29 }, { "loss": 7.4596, "grad_norm": 0.9462448954582214, "learning_rate": 0.00023715760917126106, "epoch": 2.29 }, { "loss": 7.8461, "grad_norm": 0.8025732636451721, "learning_rate": 0.00023628248884221582, "epoch": 2.29 }, { "loss": 7.7102, "grad_norm": 0.7947144508361816, "learning_rate": 0.00023540736851317059, "epoch": 2.29 }, { "loss": 7.8149, "grad_norm": 0.8819990158081055, "learning_rate": 0.00023453224818412532, "epoch": 2.3 }, { "loss": 7.5168, "grad_norm": 0.9773268103599548, "learning_rate": 0.00023365712785508008, "epoch": 2.3 }, { "loss": 7.7338, "grad_norm": 1.384716510772705, "learning_rate": 0.00023278200752603484, "epoch": 2.3 }, { "loss": 6.9549, "grad_norm": 1.1293810606002808, "learning_rate": 0.0002319068871969896, "epoch": 2.3 }, { "loss": 7.8655, "grad_norm": 0.7238449454307556, "learning_rate": 0.00023103176686794437, "epoch": 2.31 }, { "loss": 7.7399, "grad_norm": 0.8876301646232605, "learning_rate": 0.00023015664653889913, "epoch": 2.31 }, { "loss": 7.5196, "grad_norm": 0.7352742552757263, "learning_rate": 0.00022928152620985384, "epoch": 2.31 }, { "loss": 8.0545, "grad_norm": 1.0614981651306152, "learning_rate": 0.0002284064058808086, "epoch": 2.32 }, { "loss": 7.8036, "grad_norm": 0.999052882194519, "learning_rate": 0.00022753128555176336, "epoch": 2.32 }, { "loss": 7.5506, "grad_norm": 1.084981918334961, "learning_rate": 0.00022665616522271812, "epoch": 2.32 }, { "loss": 7.7953, "grad_norm": 1.110907793045044, "learning_rate": 0.00022578104489367288, "epoch": 2.32 }, { "loss": 7.6064, "grad_norm": 1.29153311252594, "learning_rate": 0.00022490592456462765, "epoch": 2.33 }, { "loss": 7.9157, "grad_norm": 1.5039303302764893, "learning_rate": 0.00022403080423558238, "epoch": 2.33 }, { "loss": 7.5924, "grad_norm": 0.850940465927124, "learning_rate": 0.00022315568390653714, "epoch": 2.33 }, { "loss": 7.9425, "grad_norm": 0.79768967628479, "learning_rate": 0.0002222805635774919, "epoch": 2.33 }, { "loss": 8.0374, "grad_norm": 0.771493673324585, "learning_rate": 0.00022140544324844667, "epoch": 2.34 }, { "loss": 7.1645, "grad_norm": 0.7525059580802917, "learning_rate": 0.00022053032291940143, "epoch": 2.34 }, { "loss": 7.5769, "grad_norm": 0.9684802293777466, "learning_rate": 0.0002196552025903562, "epoch": 2.34 }, { "loss": 7.781, "grad_norm": 1.1203564405441284, "learning_rate": 0.00021878008226131092, "epoch": 2.34 }, { "loss": 7.4585, "grad_norm": 1.0650273561477661, "learning_rate": 0.0002179049619322657, "epoch": 2.35 }, { "loss": 7.7015, "grad_norm": 0.9924284219741821, "learning_rate": 0.00021702984160322045, "epoch": 2.35 }, { "loss": 7.572, "grad_norm": 0.8644096255302429, "learning_rate": 0.0002161547212741752, "epoch": 2.35 }, { "loss": 7.8879, "grad_norm": 0.854030966758728, "learning_rate": 0.00021527960094512997, "epoch": 2.35 }, { "loss": 7.842, "grad_norm": 0.7271285653114319, "learning_rate": 0.00021440448061608473, "epoch": 2.36 }, { "loss": 7.652, "grad_norm": 0.6921567320823669, "learning_rate": 0.00021352936028703947, "epoch": 2.36 }, { "loss": 7.8335, "grad_norm": 1.2016472816467285, "learning_rate": 0.00021265423995799423, "epoch": 2.36 }, { "loss": 7.5109, "grad_norm": 0.79868084192276, "learning_rate": 0.000211779119628949, "epoch": 2.37 }, { "loss": 7.3853, "grad_norm": 0.8064858913421631, "learning_rate": 0.00021090399929990375, "epoch": 2.37 }, { "loss": 7.6334, "grad_norm": 0.9092600345611572, "learning_rate": 0.00021002887897085852, "epoch": 2.37 }, { "loss": 7.9536, "grad_norm": 1.0683679580688477, "learning_rate": 0.00020915375864181325, "epoch": 2.37 }, { "loss": 7.7399, "grad_norm": 1.1141338348388672, "learning_rate": 0.00020827863831276799, "epoch": 2.38 }, { "loss": 7.9567, "grad_norm": 0.9624096751213074, "learning_rate": 0.00020740351798372275, "epoch": 2.38 }, { "loss": 8.1788, "grad_norm": 0.7703258991241455, "learning_rate": 0.0002065283976546775, "epoch": 2.38 }, { "loss": 7.8642, "grad_norm": 0.9297539591789246, "learning_rate": 0.00020565327732563227, "epoch": 2.38 }, { "loss": 7.4837, "grad_norm": 0.7845075130462646, "learning_rate": 0.00020477815699658703, "epoch": 2.39 }, { "loss": 7.5431, "grad_norm": 0.8620021343231201, "learning_rate": 0.0002039030366675418, "epoch": 2.39 }, { "loss": 7.7398, "grad_norm": 0.8532699942588806, "learning_rate": 0.00020302791633849656, "epoch": 2.39 }, { "loss": 7.7079, "grad_norm": 1.1266266107559204, "learning_rate": 0.0002021527960094513, "epoch": 2.39 }, { "loss": 7.8789, "grad_norm": 1.003790020942688, "learning_rate": 0.00020127767568040605, "epoch": 2.4 }, { "loss": 7.5108, "grad_norm": 1.1769237518310547, "learning_rate": 0.00020040255535136081, "epoch": 2.4 }, { "loss": 7.8151, "grad_norm": 0.9078934192657471, "learning_rate": 0.00019952743502231558, "epoch": 2.4 }, { "loss": 7.4092, "grad_norm": 0.8376544713973999, "learning_rate": 0.00019865231469327034, "epoch": 2.4 }, { "loss": 7.4658, "grad_norm": 0.9094048738479614, "learning_rate": 0.0001977771943642251, "epoch": 2.41 }, { "loss": 7.6113, "grad_norm": 1.1345362663269043, "learning_rate": 0.00019690207403517984, "epoch": 2.41 }, { "loss": 7.4598, "grad_norm": 0.8164626955986023, "learning_rate": 0.0001960269537061346, "epoch": 2.41 }, { "loss": 7.707, "grad_norm": 1.125823736190796, "learning_rate": 0.00019515183337708936, "epoch": 2.41 }, { "loss": 8.0873, "grad_norm": 0.8651579022407532, "learning_rate": 0.00019427671304804412, "epoch": 2.42 }, { "loss": 7.5421, "grad_norm": 0.9041004776954651, "learning_rate": 0.00019340159271899888, "epoch": 2.42 }, { "loss": 7.9615, "grad_norm": 0.8012003302574158, "learning_rate": 0.00019252647238995364, "epoch": 2.42 }, { "loss": 7.6728, "grad_norm": 0.8691316246986389, "learning_rate": 0.00019165135206090838, "epoch": 2.43 }, { "loss": 7.4882, "grad_norm": 0.8700850605964661, "learning_rate": 0.00019077623173186314, "epoch": 2.43 }, { "loss": 7.4824, "grad_norm": 1.0540724992752075, "learning_rate": 0.00018990111140281788, "epoch": 2.43 }, { "loss": 7.3133, "grad_norm": 0.9065701365470886, "learning_rate": 0.00018902599107377264, "epoch": 2.43 }, { "loss": 8.1036, "grad_norm": 0.8794527649879456, "learning_rate": 0.0001881508707447274, "epoch": 2.44 }, { "loss": 7.3707, "grad_norm": 0.9155571460723877, "learning_rate": 0.00018727575041568216, "epoch": 2.44 }, { "loss": 7.0801, "grad_norm": 0.7177339792251587, "learning_rate": 0.0001864006300866369, "epoch": 2.44 }, { "loss": 7.4368, "grad_norm": 0.8027993440628052, "learning_rate": 0.00018552550975759166, "epoch": 2.44 }, { "loss": 8.2545, "grad_norm": 0.9770577549934387, "learning_rate": 0.00018465038942854642, "epoch": 2.45 }, { "loss": 7.4767, "grad_norm": 1.0428367853164673, "learning_rate": 0.00018377526909950118, "epoch": 2.45 }, { "loss": 8.4641, "grad_norm": 0.8214976787567139, "learning_rate": 0.00018290014877045594, "epoch": 2.45 }, { "loss": 7.054, "grad_norm": 1.1258653402328491, "learning_rate": 0.0001820250284414107, "epoch": 2.45 }, { "loss": 7.5935, "grad_norm": 1.07210373878479, "learning_rate": 0.00018114990811236544, "epoch": 2.46 }, { "loss": 7.8104, "grad_norm": 1.0441612005233765, "learning_rate": 0.0001802747877833202, "epoch": 2.46 }, { "loss": 7.6147, "grad_norm": 0.9820619821548462, "learning_rate": 0.00017939966745427496, "epoch": 2.46 }, { "loss": 8.1347, "grad_norm": 0.8725702166557312, "learning_rate": 0.00017852454712522973, "epoch": 2.46 }, { "loss": 8.227, "grad_norm": 0.8640567660331726, "learning_rate": 0.0001776494267961845, "epoch": 2.47 }, { "loss": 7.392, "grad_norm": 1.0909335613250732, "learning_rate": 0.00017677430646713925, "epoch": 2.47 }, { "loss": 6.7634, "grad_norm": 0.8133190274238586, "learning_rate": 0.000175899186138094, "epoch": 2.47 }, { "loss": 7.802, "grad_norm": 0.9833294749259949, "learning_rate": 0.00017502406580904875, "epoch": 2.48 }, { "loss": 7.2764, "grad_norm": 0.9594758152961731, "learning_rate": 0.0001741489454800035, "epoch": 2.48 }, { "loss": 7.1931, "grad_norm": 0.9970749616622925, "learning_rate": 0.00017327382515095827, "epoch": 2.48 }, { "loss": 7.6539, "grad_norm": 0.8486274480819702, "learning_rate": 0.00017239870482191303, "epoch": 2.48 }, { "loss": 7.3367, "grad_norm": 0.9591713547706604, "learning_rate": 0.0001715235844928678, "epoch": 2.49 }, { "loss": 7.545, "grad_norm": 1.1163291931152344, "learning_rate": 0.00017064846416382255, "epoch": 2.49 }, { "loss": 8.3214, "grad_norm": 0.8581505417823792, "learning_rate": 0.00016977334383477726, "epoch": 2.49 }, { "loss": 7.1871, "grad_norm": 0.8021834492683411, "learning_rate": 0.00016889822350573202, "epoch": 2.49 }, { "loss": 7.8969, "grad_norm": 0.9090090990066528, "learning_rate": 0.00016802310317668679, "epoch": 2.5 }, { "loss": 7.37, "grad_norm": 0.8283194303512573, "learning_rate": 0.00016714798284764155, "epoch": 2.5 }, { "loss": 7.4669, "grad_norm": 0.8183834552764893, "learning_rate": 0.0001662728625185963, "epoch": 2.5 }, { "loss": 7.4362, "grad_norm": 0.9701572060585022, "learning_rate": 0.00016539774218955107, "epoch": 2.5 }, { "loss": 7.2859, "grad_norm": 1.079610824584961, "learning_rate": 0.0001645226218605058, "epoch": 2.51 }, { "loss": 8.0835, "grad_norm": 0.8598064184188843, "learning_rate": 0.00016364750153146057, "epoch": 2.51 }, { "loss": 7.6696, "grad_norm": 0.8653038740158081, "learning_rate": 0.00016277238120241533, "epoch": 2.51 }, { "loss": 7.6096, "grad_norm": 1.0018919706344604, "learning_rate": 0.0001618972608733701, "epoch": 2.51 }, { "loss": 7.7412, "grad_norm": 0.8919802308082581, "learning_rate": 0.00016102214054432485, "epoch": 2.52 }, { "loss": 7.4504, "grad_norm": 0.8712960481643677, "learning_rate": 0.00016014702021527962, "epoch": 2.52 }, { "loss": 8.1, "grad_norm": 0.8894332647323608, "learning_rate": 0.00015927189988623435, "epoch": 2.52 }, { "loss": 8.1017, "grad_norm": 1.024781584739685, "learning_rate": 0.0001583967795571891, "epoch": 2.53 }, { "loss": 7.6484, "grad_norm": 0.9175984859466553, "learning_rate": 0.00015752165922814387, "epoch": 2.53 }, { "loss": 7.3766, "grad_norm": 0.9064013361930847, "learning_rate": 0.00015664653889909864, "epoch": 2.53 }, { "loss": 7.6414, "grad_norm": 0.9600405097007751, "learning_rate": 0.0001557714185700534, "epoch": 2.53 }, { "loss": 7.2811, "grad_norm": 0.9788243174552917, "learning_rate": 0.00015489629824100816, "epoch": 2.54 }, { "loss": 7.3704, "grad_norm": 0.8740330338478088, "learning_rate": 0.0001540211779119629, "epoch": 2.54 }, { "loss": 7.5645, "grad_norm": 0.8021050095558167, "learning_rate": 0.00015314605758291766, "epoch": 2.54 }, { "loss": 7.9078, "grad_norm": 1.0614405870437622, "learning_rate": 0.00015227093725387242, "epoch": 2.54 }, { "loss": 7.3365, "grad_norm": 0.8063251376152039, "learning_rate": 0.00015139581692482718, "epoch": 2.55 }, { "loss": 7.8801, "grad_norm": 0.8937615752220154, "learning_rate": 0.00015052069659578194, "epoch": 2.55 }, { "loss": 8.0013, "grad_norm": 0.9128641486167908, "learning_rate": 0.00014964557626673668, "epoch": 2.55 }, { "loss": 8.1354, "grad_norm": 0.8519286513328552, "learning_rate": 0.0001487704559376914, "epoch": 2.55 }, { "loss": 7.6918, "grad_norm": 0.9265363812446594, "learning_rate": 0.00014789533560864617, "epoch": 2.56 }, { "loss": 7.7237, "grad_norm": 1.113276720046997, "learning_rate": 0.00014702021527960093, "epoch": 2.56 }, { "loss": 7.3281, "grad_norm": 0.9011558890342712, "learning_rate": 0.0001461450949505557, "epoch": 2.56 }, { "loss": 6.577, "grad_norm": 0.990836501121521, "learning_rate": 0.00014526997462151046, "epoch": 2.56 }, { "loss": 7.4641, "grad_norm": 1.1346269845962524, "learning_rate": 0.00014439485429246522, "epoch": 2.57 }, { "loss": 7.6071, "grad_norm": 1.0057759284973145, "learning_rate": 0.00014351973396341998, "epoch": 2.57 }, { "loss": 8.0022, "grad_norm": 0.8524260520935059, "learning_rate": 0.00014264461363437472, "epoch": 2.57 }, { "loss": 7.4056, "grad_norm": 0.7590330839157104, "learning_rate": 0.00014176949330532948, "epoch": 2.58 }, { "loss": 7.9487, "grad_norm": 1.2074108123779297, "learning_rate": 0.00014089437297628424, "epoch": 2.58 }, { "loss": 7.7237, "grad_norm": 0.9621999263763428, "learning_rate": 0.000140019252647239, "epoch": 2.58 }, { "loss": 7.2588, "grad_norm": 0.843911349773407, "learning_rate": 0.00013914413231819376, "epoch": 2.58 }, { "loss": 7.1697, "grad_norm": 0.7619708180427551, "learning_rate": 0.00013826901198914853, "epoch": 2.59 }, { "loss": 7.2903, "grad_norm": 1.0736790895462036, "learning_rate": 0.00013739389166010326, "epoch": 2.59 }, { "loss": 7.9193, "grad_norm": 1.03206467628479, "learning_rate": 0.00013651877133105802, "epoch": 2.59 }, { "loss": 8.182, "grad_norm": 0.9106431603431702, "learning_rate": 0.00013564365100201278, "epoch": 2.59 }, { "loss": 7.5139, "grad_norm": 0.9506519436836243, "learning_rate": 0.00013476853067296755, "epoch": 2.6 }, { "loss": 7.9055, "grad_norm": 0.859704852104187, "learning_rate": 0.0001338934103439223, "epoch": 2.6 }, { "loss": 8.0011, "grad_norm": 0.9628238677978516, "learning_rate": 0.00013301829001487707, "epoch": 2.6 }, { "loss": 7.4412, "grad_norm": 0.8472156524658203, "learning_rate": 0.0001321431696858318, "epoch": 2.6 }, { "loss": 7.6981, "grad_norm": 0.9454402327537537, "learning_rate": 0.00013126804935678657, "epoch": 2.61 }, { "loss": 7.4101, "grad_norm": 0.8925793766975403, "learning_rate": 0.00013039292902774133, "epoch": 2.61 }, { "loss": 7.1784, "grad_norm": 0.8468560576438904, "learning_rate": 0.00012951780869869606, "epoch": 2.61 }, { "loss": 7.6655, "grad_norm": 0.8432177901268005, "learning_rate": 0.00012864268836965082, "epoch": 2.61 }, { "loss": 7.4518, "grad_norm": 0.813543438911438, "learning_rate": 0.00012776756804060559, "epoch": 2.62 }, { "loss": 7.5661, "grad_norm": 1.134985327720642, "learning_rate": 0.00012689244771156032, "epoch": 2.62 }, { "loss": 7.3611, "grad_norm": 1.05497407913208, "learning_rate": 0.00012601732738251508, "epoch": 2.62 }, { "loss": 7.8965, "grad_norm": 1.0532019138336182, "learning_rate": 0.00012514220705346984, "epoch": 2.62 }, { "loss": 8.113, "grad_norm": 1.0708712339401245, "learning_rate": 0.0001242670867244246, "epoch": 2.63 }, { "loss": 7.4782, "grad_norm": 1.1848175525665283, "learning_rate": 0.00012339196639537937, "epoch": 2.63 }, { "loss": 7.761, "grad_norm": 0.9672744870185852, "learning_rate": 0.00012251684606633413, "epoch": 2.63 }, { "loss": 7.8212, "grad_norm": 1.2713532447814941, "learning_rate": 0.00012164172573728888, "epoch": 2.64 }, { "loss": 7.5313, "grad_norm": 1.026662826538086, "learning_rate": 0.00012076660540824364, "epoch": 2.64 }, { "loss": 7.9599, "grad_norm": 0.8448575139045715, "learning_rate": 0.00011989148507919839, "epoch": 2.64 }, { "loss": 7.7506, "grad_norm": 1.041380524635315, "learning_rate": 0.00011901636475015315, "epoch": 2.64 }, { "loss": 7.9023, "grad_norm": 0.8197987675666809, "learning_rate": 0.00011814124442110791, "epoch": 2.65 }, { "loss": 7.4913, "grad_norm": 0.918388307094574, "learning_rate": 0.00011726612409206266, "epoch": 2.65 }, { "loss": 7.8685, "grad_norm": 0.9161803722381592, "learning_rate": 0.00011639100376301742, "epoch": 2.65 }, { "loss": 7.5855, "grad_norm": 0.8994104266166687, "learning_rate": 0.00011551588343397218, "epoch": 2.65 }, { "loss": 7.6488, "grad_norm": 0.8985808491706848, "learning_rate": 0.00011464076310492692, "epoch": 2.66 }, { "loss": 7.5261, "grad_norm": 0.9975460767745972, "learning_rate": 0.00011376564277588168, "epoch": 2.66 }, { "loss": 7.9051, "grad_norm": 1.051378607749939, "learning_rate": 0.00011289052244683644, "epoch": 2.66 }, { "loss": 7.2571, "grad_norm": 1.017866611480713, "learning_rate": 0.00011201540211779119, "epoch": 2.66 }, { "loss": 7.6068, "grad_norm": 1.1010361909866333, "learning_rate": 0.00011114028178874595, "epoch": 2.67 }, { "loss": 7.6306, "grad_norm": 0.9585467576980591, "learning_rate": 0.00011026516145970071, "epoch": 2.67 }, { "loss": 7.3702, "grad_norm": 0.9484645128250122, "learning_rate": 0.00010939004113065546, "epoch": 2.67 }, { "loss": 7.4482, "grad_norm": 1.0726372003555298, "learning_rate": 0.00010851492080161022, "epoch": 2.67 }, { "loss": 7.7554, "grad_norm": 0.8078585863113403, "learning_rate": 0.00010763980047256499, "epoch": 2.68 }, { "loss": 7.3881, "grad_norm": 0.9488946199417114, "learning_rate": 0.00010676468014351973, "epoch": 2.68 }, { "loss": 7.7557, "grad_norm": 0.8590677976608276, "learning_rate": 0.0001058895598144745, "epoch": 2.68 }, { "loss": 7.2128, "grad_norm": 0.8768866062164307, "learning_rate": 0.00010501443948542926, "epoch": 2.69 }, { "loss": 7.6447, "grad_norm": 1.1127121448516846, "learning_rate": 0.00010413931915638399, "epoch": 2.69 }, { "loss": 7.7283, "grad_norm": 0.7706397771835327, "learning_rate": 0.00010326419882733875, "epoch": 2.69 }, { "loss": 7.8187, "grad_norm": 0.910484254360199, "learning_rate": 0.00010238907849829352, "epoch": 2.69 }, { "loss": 6.9677, "grad_norm": 0.8292771577835083, "learning_rate": 0.00010151395816924828, "epoch": 2.7 }, { "loss": 7.7939, "grad_norm": 1.2936872243881226, "learning_rate": 0.00010063883784020303, "epoch": 2.7 }, { "loss": 7.2773, "grad_norm": 1.050876259803772, "learning_rate": 9.976371751115779e-05, "epoch": 2.7 }, { "loss": 7.6461, "grad_norm": 1.0275306701660156, "learning_rate": 9.888859718211255e-05, "epoch": 2.7 }, { "loss": 7.4058, "grad_norm": 0.9414623379707336, "learning_rate": 9.80134768530673e-05, "epoch": 2.71 }, { "loss": 7.4938, "grad_norm": 0.8367570042610168, "learning_rate": 9.713835652402206e-05, "epoch": 2.71 }, { "loss": 7.4702, "grad_norm": 0.9100292325019836, "learning_rate": 9.626323619497682e-05, "epoch": 2.71 }, { "loss": 7.4209, "grad_norm": 0.881262481212616, "learning_rate": 9.538811586593157e-05, "epoch": 2.71 }, { "loss": 7.568, "grad_norm": 1.0841021537780762, "learning_rate": 9.451299553688632e-05, "epoch": 2.72 }, { "loss": 7.4385, "grad_norm": 0.8553777933120728, "learning_rate": 9.363787520784108e-05, "epoch": 2.72 }, { "loss": 7.7745, "grad_norm": 0.8244187235832214, "learning_rate": 9.276275487879583e-05, "epoch": 2.72 }, { "loss": 7.3427, "grad_norm": 1.0330350399017334, "learning_rate": 9.188763454975059e-05, "epoch": 2.72 }, { "loss": 7.4313, "grad_norm": 0.86846524477005, "learning_rate": 9.101251422070535e-05, "epoch": 2.73 }, { "loss": 7.6994, "grad_norm": 1.0151475667953491, "learning_rate": 9.01373938916601e-05, "epoch": 2.73 }, { "loss": 7.6994, "grad_norm": 0.8053341507911682, "learning_rate": 8.926227356261486e-05, "epoch": 2.73 }, { "loss": 7.8782, "grad_norm": 0.917957067489624, "learning_rate": 8.838715323356962e-05, "epoch": 2.74 }, { "loss": 7.5889, "grad_norm": 1.556181788444519, "learning_rate": 8.751203290452437e-05, "epoch": 2.74 }, { "loss": 7.6279, "grad_norm": 1.043771743774414, "learning_rate": 8.663691257547913e-05, "epoch": 2.74 }, { "loss": 7.8682, "grad_norm": 1.1640032529830933, "learning_rate": 8.57617922464339e-05, "epoch": 2.74 }, { "loss": 7.8918, "grad_norm": 0.8830235600471497, "learning_rate": 8.488667191738863e-05, "epoch": 2.75 }, { "loss": 7.5466, "grad_norm": 0.958690345287323, "learning_rate": 8.401155158834339e-05, "epoch": 2.75 }, { "loss": 7.5439, "grad_norm": 1.1970360279083252, "learning_rate": 8.313643125929815e-05, "epoch": 2.75 }, { "loss": 8.1002, "grad_norm": 0.9388788938522339, "learning_rate": 8.22613109302529e-05, "epoch": 2.75 }, { "loss": 7.6892, "grad_norm": 1.0798841714859009, "learning_rate": 8.138619060120766e-05, "epoch": 2.76 }, { "loss": 8.0534, "grad_norm": 1.2909208536148071, "learning_rate": 8.051107027216243e-05, "epoch": 2.76 }, { "loss": 7.3369, "grad_norm": 1.272641658782959, "learning_rate": 7.963594994311717e-05, "epoch": 2.76 }, { "loss": 7.5785, "grad_norm": 0.9654033780097961, "learning_rate": 7.876082961407194e-05, "epoch": 2.76 }, { "loss": 7.8078, "grad_norm": 0.8423277139663696, "learning_rate": 7.78857092850267e-05, "epoch": 2.77 }, { "loss": 7.8086, "grad_norm": 0.9509181380271912, "learning_rate": 7.701058895598145e-05, "epoch": 2.77 }, { "loss": 8.1405, "grad_norm": 0.9167718291282654, "learning_rate": 7.613546862693621e-05, "epoch": 2.77 }, { "loss": 7.7728, "grad_norm": 0.9845168590545654, "learning_rate": 7.526034829789097e-05, "epoch": 2.77 }, { "loss": 7.4146, "grad_norm": 0.9597529768943787, "learning_rate": 7.43852279688457e-05, "epoch": 2.78 }, { "loss": 7.0711, "grad_norm": 1.0068391561508179, "learning_rate": 7.351010763980047e-05, "epoch": 2.78 }, { "loss": 7.1173, "grad_norm": 0.8510629534721375, "learning_rate": 7.263498731075523e-05, "epoch": 2.78 }, { "loss": 7.1843, "grad_norm": 0.8737899661064148, "learning_rate": 7.175986698170999e-05, "epoch": 2.79 }, { "loss": 7.376, "grad_norm": 0.9045628905296326, "learning_rate": 7.088474665266474e-05, "epoch": 2.79 }, { "loss": 7.4447, "grad_norm": 0.8932380080223083, "learning_rate": 7.00096263236195e-05, "epoch": 2.79 }, { "loss": 7.362, "grad_norm": 0.8961164951324463, "learning_rate": 6.913450599457426e-05, "epoch": 2.79 }, { "loss": 7.4237, "grad_norm": 1.0015422105789185, "learning_rate": 6.825938566552901e-05, "epoch": 2.8 }, { "loss": 7.2541, "grad_norm": 0.9842544198036194, "learning_rate": 6.738426533648377e-05, "epoch": 2.8 }, { "loss": 8.0427, "grad_norm": 1.0375638008117676, "learning_rate": 6.650914500743853e-05, "epoch": 2.8 }, { "loss": 7.4801, "grad_norm": 0.9552834630012512, "learning_rate": 6.563402467839328e-05, "epoch": 2.8 }, { "loss": 7.8596, "grad_norm": 0.8038078546524048, "learning_rate": 6.475890434934803e-05, "epoch": 2.81 }, { "loss": 7.2653, "grad_norm": 0.8008092045783997, "learning_rate": 6.388378402030279e-05, "epoch": 2.81 }, { "loss": 7.159, "grad_norm": 1.087442398071289, "learning_rate": 6.300866369125754e-05, "epoch": 2.81 }, { "loss": 7.0556, "grad_norm": 1.0442233085632324, "learning_rate": 6.21335433622123e-05, "epoch": 2.81 }, { "loss": 7.162, "grad_norm": 1.0271589756011963, "learning_rate": 6.125842303316706e-05, "epoch": 2.82 }, { "loss": 7.5864, "grad_norm": 0.9957409501075745, "learning_rate": 6.038330270412182e-05, "epoch": 2.82 }, { "loss": 8.4511, "grad_norm": 0.870765745639801, "learning_rate": 5.9508182375076575e-05, "epoch": 2.82 }, { "loss": 7.9488, "grad_norm": 0.8632308840751648, "learning_rate": 5.863306204603133e-05, "epoch": 2.82 }, { "loss": 8.1216, "grad_norm": 1.1113914251327515, "learning_rate": 5.775794171698609e-05, "epoch": 2.83 }, { "loss": 7.7049, "grad_norm": 0.9410499334335327, "learning_rate": 5.688282138794084e-05, "epoch": 2.83 }, { "loss": 7.6916, "grad_norm": 0.8908835053443909, "learning_rate": 5.6007701058895595e-05, "epoch": 2.83 }, { "loss": 7.5659, "grad_norm": 0.7924339175224304, "learning_rate": 5.513258072985036e-05, "epoch": 2.83 }, { "loss": 7.4359, "grad_norm": 0.8098507523536682, "learning_rate": 5.425746040080511e-05, "epoch": 2.84 }, { "loss": 7.4043, "grad_norm": 0.8541660904884338, "learning_rate": 5.338234007175987e-05, "epoch": 2.84 }, { "loss": 7.5664, "grad_norm": 0.9474323987960815, "learning_rate": 5.250721974271463e-05, "epoch": 2.84 }, { "loss": 7.7903, "grad_norm": 1.0568387508392334, "learning_rate": 5.163209941366938e-05, "epoch": 2.85 }, { "loss": 7.4216, "grad_norm": 0.9031184315681458, "learning_rate": 5.075697908462414e-05, "epoch": 2.85 }, { "loss": 7.5944, "grad_norm": 0.8136922121047974, "learning_rate": 4.9881858755578894e-05, "epoch": 2.85 }, { "loss": 7.6272, "grad_norm": 1.1002339124679565, "learning_rate": 4.900673842653365e-05, "epoch": 2.85 }, { "loss": 7.5846, "grad_norm": 1.2232916355133057, "learning_rate": 4.813161809748841e-05, "epoch": 2.86 }, { "loss": 7.8478, "grad_norm": 0.8891430497169495, "learning_rate": 4.725649776844316e-05, "epoch": 2.86 }, { "loss": 7.3, "grad_norm": 0.9129414558410645, "learning_rate": 4.6381377439397914e-05, "epoch": 2.86 }, { "loss": 7.4529, "grad_norm": 0.7938532829284668, "learning_rate": 4.5506257110352676e-05, "epoch": 2.86 }, { "loss": 7.2803, "grad_norm": 0.9501358270645142, "learning_rate": 4.463113678130743e-05, "epoch": 2.87 }, { "loss": 7.5943, "grad_norm": 1.0423897504806519, "learning_rate": 4.3756016452262186e-05, "epoch": 2.87 }, { "loss": 7.2376, "grad_norm": 0.9883305430412292, "learning_rate": 4.288089612321695e-05, "epoch": 2.87 }, { "loss": 7.6255, "grad_norm": 0.9974358677864075, "learning_rate": 4.2005775794171696e-05, "epoch": 2.87 }, { "loss": 7.2739, "grad_norm": 0.9481905102729797, "learning_rate": 4.113065546512645e-05, "epoch": 2.88 }, { "loss": 8.0077, "grad_norm": 1.067797064781189, "learning_rate": 4.025553513608121e-05, "epoch": 2.88 }, { "loss": 7.5522, "grad_norm": 0.8410007953643799, "learning_rate": 3.938041480703597e-05, "epoch": 2.88 }, { "loss": 7.5854, "grad_norm": 0.847583532333374, "learning_rate": 3.8505294477990723e-05, "epoch": 2.88 }, { "loss": 7.2142, "grad_norm": 1.0279533863067627, "learning_rate": 3.7630174148945485e-05, "epoch": 2.89 }, { "loss": 7.4712, "grad_norm": 1.1256965398788452, "learning_rate": 3.6755053819900234e-05, "epoch": 2.89 }, { "loss": 7.7947, "grad_norm": 1.0278571844100952, "learning_rate": 3.5879933490854995e-05, "epoch": 2.89 }, { "loss": 7.3523, "grad_norm": 0.9609654545783997, "learning_rate": 3.500481316180975e-05, "epoch": 2.9 }, { "loss": 7.3334, "grad_norm": 0.8453736901283264, "learning_rate": 3.4129692832764505e-05, "epoch": 2.9 }, { "loss": 7.177, "grad_norm": 0.8161653280258179, "learning_rate": 3.325457250371927e-05, "epoch": 2.9 }, { "loss": 7.9061, "grad_norm": 0.9861032366752625, "learning_rate": 3.2379452174674016e-05, "epoch": 2.9 }, { "loss": 7.3155, "grad_norm": 1.1409838199615479, "learning_rate": 3.150433184562877e-05, "epoch": 2.91 }, { "loss": 7.7667, "grad_norm": 0.8848074078559875, "learning_rate": 3.062921151658353e-05, "epoch": 2.91 }, { "loss": 7.2722, "grad_norm": 0.8996227979660034, "learning_rate": 2.9754091187538288e-05, "epoch": 2.91 }, { "loss": 7.4819, "grad_norm": 0.9429714679718018, "learning_rate": 2.8878970858493046e-05, "epoch": 2.91 }, { "loss": 7.4154, "grad_norm": 1.1169899702072144, "learning_rate": 2.8003850529447798e-05, "epoch": 2.92 }, { "loss": 7.7691, "grad_norm": 0.8326570987701416, "learning_rate": 2.7128730200402556e-05, "epoch": 2.92 }, { "loss": 7.6196, "grad_norm": 0.9243487119674683, "learning_rate": 2.6253609871357314e-05, "epoch": 2.92 }, { "loss": 7.6529, "grad_norm": 0.8465039730072021, "learning_rate": 2.537848954231207e-05, "epoch": 2.92 }, { "loss": 7.2623, "grad_norm": 1.0216766595840454, "learning_rate": 2.4503369213266825e-05, "epoch": 2.93 }, { "loss": 7.5628, "grad_norm": 0.9314711689949036, "learning_rate": 2.362824888422158e-05, "epoch": 2.93 }, { "loss": 7.9252, "grad_norm": 0.8769168853759766, "learning_rate": 2.2753128555176338e-05, "epoch": 2.93 }, { "loss": 7.2971, "grad_norm": 0.8925982713699341, "learning_rate": 2.1878008226131093e-05, "epoch": 2.93 }, { "loss": 7.1022, "grad_norm": 0.937786340713501, "learning_rate": 2.1002887897085848e-05, "epoch": 2.94 }, { "loss": 7.5253, "grad_norm": 0.900693416595459, "learning_rate": 2.0127767568040607e-05, "epoch": 2.94 }, { "loss": 7.5837, "grad_norm": 0.9113482236862183, "learning_rate": 1.9252647238995362e-05, "epoch": 2.94 }, { "loss": 7.7925, "grad_norm": 0.8734735250473022, "learning_rate": 1.8377526909950117e-05, "epoch": 2.95 }, { "loss": 7.5821, "grad_norm": 0.8616068959236145, "learning_rate": 1.7502406580904875e-05, "epoch": 2.95 }, { "loss": 6.7659, "grad_norm": 0.8509213328361511, "learning_rate": 1.6627286251859634e-05, "epoch": 2.95 }, { "loss": 7.9045, "grad_norm": 0.8518444895744324, "learning_rate": 1.5752165922814385e-05, "epoch": 2.95 }, { "loss": 7.2314, "grad_norm": 1.1429413557052612, "learning_rate": 1.4877045593769144e-05, "epoch": 2.96 }, { "loss": 7.5707, "grad_norm": 0.825677752494812, "learning_rate": 1.4001925264723899e-05, "epoch": 2.96 }, { "loss": 7.2231, "grad_norm": 0.9227612018585205, "learning_rate": 1.3126804935678657e-05, "epoch": 2.96 }, { "loss": 7.3015, "grad_norm": 0.9745140671730042, "learning_rate": 1.2251684606633412e-05, "epoch": 2.96 }, { "loss": 7.5931, "grad_norm": 0.8096091151237488, "learning_rate": 1.1376564277588169e-05, "epoch": 2.97 }, { "loss": 7.2393, "grad_norm": 0.9233807921409607, "learning_rate": 1.0501443948542924e-05, "epoch": 2.97 }, { "loss": 7.2871, "grad_norm": 0.7690852880477905, "learning_rate": 9.626323619497681e-06, "epoch": 2.97 }, { "loss": 7.5845, "grad_norm": 0.882102370262146, "learning_rate": 8.751203290452438e-06, "epoch": 2.97 }, { "loss": 7.2335, "grad_norm": 0.887958288192749, "learning_rate": 7.876082961407193e-06, "epoch": 2.98 }, { "loss": 7.5324, "grad_norm": 0.8895597457885742, "learning_rate": 7.000962632361949e-06, "epoch": 2.98 }, { "loss": 7.1145, "grad_norm": 0.8137519955635071, "learning_rate": 6.125842303316706e-06, "epoch": 2.98 }, { "loss": 7.411, "grad_norm": 0.9460362195968628, "learning_rate": 5.250721974271462e-06, "epoch": 2.98 }, { "loss": 7.6058, "grad_norm": 0.9842742681503296, "learning_rate": 4.375601645226219e-06, "epoch": 2.99 }, { "loss": 7.6927, "grad_norm": 0.938562273979187, "learning_rate": 3.5004813161809747e-06, "epoch": 2.99 }, { "loss": 7.6995, "grad_norm": 0.9931243658065796, "learning_rate": 2.625360987135731e-06, "epoch": 2.99 }, { "loss": 7.5572, "grad_norm": 0.8916573524475098, "learning_rate": 1.7502406580904874e-06, "epoch": 3.0 }, { "loss": 7.2556, "grad_norm": 0.780832052230835, "learning_rate": 8.751203290452437e-07, "epoch": 3.0 }, { "train_runtime": 112786.1501, "train_samples_per_second": 3.243, "train_steps_per_second": 0.101, "train_loss": 8.874524852365107, "epoch": 3.0 } ]