{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 4504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008880994671403197, "grad_norm": 13.040843821412901, "learning_rate": 2.2172949002217297e-08, "loss": 0.4427, "step": 1 }, { "epoch": 0.0017761989342806395, "grad_norm": 13.677664712800397, "learning_rate": 4.4345898004434594e-08, "loss": 0.5223, "step": 2 }, { "epoch": 0.0026642984014209592, "grad_norm": 13.41303079869153, "learning_rate": 6.651884700665188e-08, "loss": 0.4662, "step": 3 }, { "epoch": 0.003552397868561279, "grad_norm": 12.591263156172579, "learning_rate": 8.869179600886919e-08, "loss": 0.4713, "step": 4 }, { "epoch": 0.004440497335701598, "grad_norm": 12.748122761493525, "learning_rate": 1.1086474501108649e-07, "loss": 0.4617, "step": 5 }, { "epoch": 0.0053285968028419185, "grad_norm": 12.251887276789628, "learning_rate": 1.3303769401330377e-07, "loss": 0.4433, "step": 6 }, { "epoch": 0.006216696269982238, "grad_norm": 12.117567915620455, "learning_rate": 1.5521064301552109e-07, "loss": 0.4138, "step": 7 }, { "epoch": 0.007104795737122558, "grad_norm": 12.929954917378, "learning_rate": 1.7738359201773838e-07, "loss": 0.4779, "step": 8 }, { "epoch": 0.007992895204262877, "grad_norm": 12.912164370309165, "learning_rate": 1.9955654101995567e-07, "loss": 0.4406, "step": 9 }, { "epoch": 0.008880994671403197, "grad_norm": 12.182085316407958, "learning_rate": 2.2172949002217298e-07, "loss": 0.4246, "step": 10 }, { "epoch": 0.009769094138543518, "grad_norm": 13.840302903530718, "learning_rate": 2.439024390243903e-07, "loss": 0.4855, "step": 11 }, { "epoch": 0.010657193605683837, "grad_norm": 12.889473918224763, "learning_rate": 2.6607538802660754e-07, "loss": 0.4364, "step": 12 }, { "epoch": 0.011545293072824156, "grad_norm": 12.388896580352663, "learning_rate": 2.8824833702882486e-07, "loss": 0.4101, "step": 13 }, { "epoch": 0.012433392539964476, "grad_norm": 12.49046885123923, "learning_rate": 3.1042128603104217e-07, "loss": 0.4726, "step": 14 }, { "epoch": 0.013321492007104795, "grad_norm": 11.48222167766721, "learning_rate": 3.3259423503325944e-07, "loss": 0.3645, "step": 15 }, { "epoch": 0.014209591474245116, "grad_norm": 12.157840445103568, "learning_rate": 3.5476718403547675e-07, "loss": 0.465, "step": 16 }, { "epoch": 0.015097690941385435, "grad_norm": 11.307771204326675, "learning_rate": 3.7694013303769407e-07, "loss": 0.438, "step": 17 }, { "epoch": 0.015985790408525755, "grad_norm": 11.130951839748358, "learning_rate": 3.9911308203991133e-07, "loss": 0.4046, "step": 18 }, { "epoch": 0.016873889875666074, "grad_norm": 11.745967760805645, "learning_rate": 4.2128603104212865e-07, "loss": 0.4274, "step": 19 }, { "epoch": 0.017761989342806393, "grad_norm": 11.644501698063689, "learning_rate": 4.4345898004434597e-07, "loss": 0.4188, "step": 20 }, { "epoch": 0.018650088809946713, "grad_norm": 10.785511103209847, "learning_rate": 4.6563192904656323e-07, "loss": 0.4375, "step": 21 }, { "epoch": 0.019538188277087035, "grad_norm": 9.26581226542501, "learning_rate": 4.878048780487805e-07, "loss": 0.3922, "step": 22 }, { "epoch": 0.020426287744227355, "grad_norm": 9.9748547025296, "learning_rate": 5.099778270509978e-07, "loss": 0.3854, "step": 23 }, { "epoch": 0.021314387211367674, "grad_norm": 10.047025994539355, "learning_rate": 5.321507760532151e-07, "loss": 0.4155, "step": 24 }, { "epoch": 0.022202486678507993, "grad_norm": 9.0090924049002, "learning_rate": 5.543237250554324e-07, "loss": 0.3851, "step": 25 }, { "epoch": 0.023090586145648313, "grad_norm": 8.616619523300912, "learning_rate": 5.764966740576497e-07, "loss": 0.3766, "step": 26 }, { "epoch": 0.023978685612788632, "grad_norm": 8.573705551462735, "learning_rate": 5.98669623059867e-07, "loss": 0.3393, "step": 27 }, { "epoch": 0.02486678507992895, "grad_norm": 5.89996526738856, "learning_rate": 6.208425720620843e-07, "loss": 0.332, "step": 28 }, { "epoch": 0.02575488454706927, "grad_norm": 4.133420274556032, "learning_rate": 6.430155210643016e-07, "loss": 0.2742, "step": 29 }, { "epoch": 0.02664298401420959, "grad_norm": 4.562422798003308, "learning_rate": 6.651884700665189e-07, "loss": 0.2775, "step": 30 }, { "epoch": 0.027531083481349913, "grad_norm": 4.670481858708041, "learning_rate": 6.873614190687362e-07, "loss": 0.2935, "step": 31 }, { "epoch": 0.028419182948490232, "grad_norm": 4.509382510160496, "learning_rate": 7.095343680709535e-07, "loss": 0.3363, "step": 32 }, { "epoch": 0.02930728241563055, "grad_norm": 4.363609746277317, "learning_rate": 7.317073170731707e-07, "loss": 0.2934, "step": 33 }, { "epoch": 0.03019538188277087, "grad_norm": 3.596275007281521, "learning_rate": 7.538802660753881e-07, "loss": 0.2653, "step": 34 }, { "epoch": 0.03108348134991119, "grad_norm": 3.351254206377662, "learning_rate": 7.760532150776054e-07, "loss": 0.2821, "step": 35 }, { "epoch": 0.03197158081705151, "grad_norm": 3.1261834239669195, "learning_rate": 7.982261640798227e-07, "loss": 0.2584, "step": 36 }, { "epoch": 0.03285968028419183, "grad_norm": 3.1711488715341862, "learning_rate": 8.203991130820401e-07, "loss": 0.2705, "step": 37 }, { "epoch": 0.03374777975133215, "grad_norm": 2.786764471661445, "learning_rate": 8.425720620842573e-07, "loss": 0.2583, "step": 38 }, { "epoch": 0.03463587921847247, "grad_norm": 2.8688909432984784, "learning_rate": 8.647450110864745e-07, "loss": 0.2202, "step": 39 }, { "epoch": 0.035523978685612786, "grad_norm": 2.3720310952864145, "learning_rate": 8.869179600886919e-07, "loss": 0.2556, "step": 40 }, { "epoch": 0.03641207815275311, "grad_norm": 2.1357508951578352, "learning_rate": 9.090909090909091e-07, "loss": 0.2256, "step": 41 }, { "epoch": 0.037300177619893425, "grad_norm": 1.9597318478589312, "learning_rate": 9.312638580931265e-07, "loss": 0.2283, "step": 42 }, { "epoch": 0.03818827708703375, "grad_norm": 2.321583307882343, "learning_rate": 9.534368070953437e-07, "loss": 0.2276, "step": 43 }, { "epoch": 0.03907637655417407, "grad_norm": 2.5274924171412754, "learning_rate": 9.75609756097561e-07, "loss": 0.2243, "step": 44 }, { "epoch": 0.03996447602131439, "grad_norm": 2.5093600192649723, "learning_rate": 9.977827050997784e-07, "loss": 0.2387, "step": 45 }, { "epoch": 0.04085257548845471, "grad_norm": 2.258421609405366, "learning_rate": 1.0199556541019955e-06, "loss": 0.2294, "step": 46 }, { "epoch": 0.041740674955595025, "grad_norm": 2.0837448190567245, "learning_rate": 1.042128603104213e-06, "loss": 0.2293, "step": 47 }, { "epoch": 0.04262877442273535, "grad_norm": 1.7457513775154567, "learning_rate": 1.0643015521064302e-06, "loss": 0.2379, "step": 48 }, { "epoch": 0.043516873889875664, "grad_norm": 1.504770899895591, "learning_rate": 1.0864745011086475e-06, "loss": 0.2182, "step": 49 }, { "epoch": 0.04440497335701599, "grad_norm": 1.650451976656802, "learning_rate": 1.1086474501108648e-06, "loss": 0.185, "step": 50 }, { "epoch": 0.0452930728241563, "grad_norm": 1.7889570187921355, "learning_rate": 1.130820399113082e-06, "loss": 0.2132, "step": 51 }, { "epoch": 0.046181172291296625, "grad_norm": 1.6099856493045095, "learning_rate": 1.1529933481152994e-06, "loss": 0.1983, "step": 52 }, { "epoch": 0.04706927175843695, "grad_norm": 1.3281445457110155, "learning_rate": 1.1751662971175167e-06, "loss": 0.184, "step": 53 }, { "epoch": 0.047957371225577264, "grad_norm": 1.5880809634733573, "learning_rate": 1.197339246119734e-06, "loss": 0.2071, "step": 54 }, { "epoch": 0.04884547069271759, "grad_norm": 1.489451471752179, "learning_rate": 1.2195121951219514e-06, "loss": 0.2037, "step": 55 }, { "epoch": 0.0497335701598579, "grad_norm": 1.8245481090703408, "learning_rate": 1.2416851441241687e-06, "loss": 0.2003, "step": 56 }, { "epoch": 0.050621669626998225, "grad_norm": 1.7421793808491324, "learning_rate": 1.2638580931263858e-06, "loss": 0.2021, "step": 57 }, { "epoch": 0.05150976909413854, "grad_norm": 1.478392790648627, "learning_rate": 1.2860310421286031e-06, "loss": 0.194, "step": 58 }, { "epoch": 0.052397868561278864, "grad_norm": 1.1458896328589037, "learning_rate": 1.3082039911308206e-06, "loss": 0.1863, "step": 59 }, { "epoch": 0.05328596802841918, "grad_norm": 1.343280607139686, "learning_rate": 1.3303769401330377e-06, "loss": 0.1902, "step": 60 }, { "epoch": 0.0541740674955595, "grad_norm": 1.3741303683100614, "learning_rate": 1.352549889135255e-06, "loss": 0.2023, "step": 61 }, { "epoch": 0.055062166962699825, "grad_norm": 1.258896205353153, "learning_rate": 1.3747228381374724e-06, "loss": 0.2016, "step": 62 }, { "epoch": 0.05595026642984014, "grad_norm": 1.3943019068049274, "learning_rate": 1.3968957871396897e-06, "loss": 0.2135, "step": 63 }, { "epoch": 0.056838365896980464, "grad_norm": 1.3471146628155388, "learning_rate": 1.419068736141907e-06, "loss": 0.1809, "step": 64 }, { "epoch": 0.05772646536412078, "grad_norm": 1.5701464198094832, "learning_rate": 1.4412416851441243e-06, "loss": 0.1786, "step": 65 }, { "epoch": 0.0586145648312611, "grad_norm": 1.371471043542019, "learning_rate": 1.4634146341463414e-06, "loss": 0.1986, "step": 66 }, { "epoch": 0.05950266429840142, "grad_norm": 1.488399056738155, "learning_rate": 1.485587583148559e-06, "loss": 0.2069, "step": 67 }, { "epoch": 0.06039076376554174, "grad_norm": 1.1836349492010712, "learning_rate": 1.5077605321507763e-06, "loss": 0.1928, "step": 68 }, { "epoch": 0.06127886323268206, "grad_norm": 1.5478134200389975, "learning_rate": 1.5299334811529934e-06, "loss": 0.183, "step": 69 }, { "epoch": 0.06216696269982238, "grad_norm": 1.2814053714377254, "learning_rate": 1.5521064301552107e-06, "loss": 0.2091, "step": 70 }, { "epoch": 0.0630550621669627, "grad_norm": 1.1829355995137947, "learning_rate": 1.5742793791574282e-06, "loss": 0.1475, "step": 71 }, { "epoch": 0.06394316163410302, "grad_norm": 1.3941994415209034, "learning_rate": 1.5964523281596453e-06, "loss": 0.1432, "step": 72 }, { "epoch": 0.06483126110124333, "grad_norm": 1.2121269627847069, "learning_rate": 1.6186252771618627e-06, "loss": 0.1789, "step": 73 }, { "epoch": 0.06571936056838366, "grad_norm": 1.3375032889817995, "learning_rate": 1.6407982261640802e-06, "loss": 0.1879, "step": 74 }, { "epoch": 0.06660746003552398, "grad_norm": 1.1029253885450885, "learning_rate": 1.6629711751662973e-06, "loss": 0.1255, "step": 75 }, { "epoch": 0.0674955595026643, "grad_norm": 1.3094518409858786, "learning_rate": 1.6851441241685146e-06, "loss": 0.1831, "step": 76 }, { "epoch": 0.06838365896980461, "grad_norm": 1.5769224156692587, "learning_rate": 1.707317073170732e-06, "loss": 0.1672, "step": 77 }, { "epoch": 0.06927175843694494, "grad_norm": 1.2775816415734151, "learning_rate": 1.729490022172949e-06, "loss": 0.1735, "step": 78 }, { "epoch": 0.07015985790408526, "grad_norm": 0.9763817638147934, "learning_rate": 1.7516629711751666e-06, "loss": 0.1344, "step": 79 }, { "epoch": 0.07104795737122557, "grad_norm": 1.201528948127603, "learning_rate": 1.7738359201773839e-06, "loss": 0.1485, "step": 80 }, { "epoch": 0.0719360568383659, "grad_norm": 1.1447121851778421, "learning_rate": 1.796008869179601e-06, "loss": 0.1699, "step": 81 }, { "epoch": 0.07282415630550622, "grad_norm": 1.05293863502435, "learning_rate": 1.8181818181818183e-06, "loss": 0.1395, "step": 82 }, { "epoch": 0.07371225577264653, "grad_norm": 1.3517411214685857, "learning_rate": 1.8403547671840354e-06, "loss": 0.1844, "step": 83 }, { "epoch": 0.07460035523978685, "grad_norm": 1.7577841297827297, "learning_rate": 1.862527716186253e-06, "loss": 0.1779, "step": 84 }, { "epoch": 0.07548845470692718, "grad_norm": 1.1023377599109887, "learning_rate": 1.8847006651884702e-06, "loss": 0.1633, "step": 85 }, { "epoch": 0.0763765541740675, "grad_norm": 1.1472821024712776, "learning_rate": 1.9068736141906873e-06, "loss": 0.1413, "step": 86 }, { "epoch": 0.07726465364120781, "grad_norm": 1.1520998196670071, "learning_rate": 1.9290465631929047e-06, "loss": 0.1801, "step": 87 }, { "epoch": 0.07815275310834814, "grad_norm": 1.2441514098841544, "learning_rate": 1.951219512195122e-06, "loss": 0.16, "step": 88 }, { "epoch": 0.07904085257548846, "grad_norm": 1.475416128749588, "learning_rate": 1.9733924611973393e-06, "loss": 0.1493, "step": 89 }, { "epoch": 0.07992895204262877, "grad_norm": 1.5111225554836951, "learning_rate": 1.995565410199557e-06, "loss": 0.1733, "step": 90 }, { "epoch": 0.08081705150976909, "grad_norm": 1.3681513326820232, "learning_rate": 2.017738359201774e-06, "loss": 0.1695, "step": 91 }, { "epoch": 0.08170515097690942, "grad_norm": 1.4294052270061006, "learning_rate": 2.039911308203991e-06, "loss": 0.1545, "step": 92 }, { "epoch": 0.08259325044404973, "grad_norm": 1.1951778622057059, "learning_rate": 2.0620842572062086e-06, "loss": 0.1517, "step": 93 }, { "epoch": 0.08348134991119005, "grad_norm": 1.4048018125143116, "learning_rate": 2.084257206208426e-06, "loss": 0.1379, "step": 94 }, { "epoch": 0.08436944937833037, "grad_norm": 1.2064909876304688, "learning_rate": 2.106430155210643e-06, "loss": 0.1539, "step": 95 }, { "epoch": 0.0852575488454707, "grad_norm": 1.2323744596339212, "learning_rate": 2.1286031042128603e-06, "loss": 0.1438, "step": 96 }, { "epoch": 0.08614564831261101, "grad_norm": 1.1549888394827348, "learning_rate": 2.150776053215078e-06, "loss": 0.1554, "step": 97 }, { "epoch": 0.08703374777975133, "grad_norm": 1.2476661230334434, "learning_rate": 2.172949002217295e-06, "loss": 0.1574, "step": 98 }, { "epoch": 0.08792184724689166, "grad_norm": 1.5260144602597139, "learning_rate": 2.1951219512195125e-06, "loss": 0.1605, "step": 99 }, { "epoch": 0.08880994671403197, "grad_norm": 1.347374395733047, "learning_rate": 2.2172949002217296e-06, "loss": 0.155, "step": 100 }, { "epoch": 0.08969804618117229, "grad_norm": 1.0555188775346487, "learning_rate": 2.2394678492239467e-06, "loss": 0.139, "step": 101 }, { "epoch": 0.0905861456483126, "grad_norm": 1.367331017367306, "learning_rate": 2.261640798226164e-06, "loss": 0.1449, "step": 102 }, { "epoch": 0.09147424511545293, "grad_norm": 1.3152545619381775, "learning_rate": 2.2838137472283817e-06, "loss": 0.1585, "step": 103 }, { "epoch": 0.09236234458259325, "grad_norm": 1.0647690513712063, "learning_rate": 2.305986696230599e-06, "loss": 0.1325, "step": 104 }, { "epoch": 0.09325044404973357, "grad_norm": 1.5394023192423911, "learning_rate": 2.3281596452328164e-06, "loss": 0.1351, "step": 105 }, { "epoch": 0.0941385435168739, "grad_norm": 1.4071751363116185, "learning_rate": 2.3503325942350335e-06, "loss": 0.1368, "step": 106 }, { "epoch": 0.09502664298401421, "grad_norm": 1.0560128882683737, "learning_rate": 2.3725055432372506e-06, "loss": 0.1371, "step": 107 }, { "epoch": 0.09591474245115453, "grad_norm": 1.11181193435597, "learning_rate": 2.394678492239468e-06, "loss": 0.1649, "step": 108 }, { "epoch": 0.09680284191829484, "grad_norm": 1.241176938087842, "learning_rate": 2.4168514412416856e-06, "loss": 0.1645, "step": 109 }, { "epoch": 0.09769094138543517, "grad_norm": 1.40617304043457, "learning_rate": 2.4390243902439027e-06, "loss": 0.1468, "step": 110 }, { "epoch": 0.09857904085257549, "grad_norm": 1.034869455847088, "learning_rate": 2.46119733924612e-06, "loss": 0.1333, "step": 111 }, { "epoch": 0.0994671403197158, "grad_norm": 1.1404231888742273, "learning_rate": 2.4833702882483374e-06, "loss": 0.1436, "step": 112 }, { "epoch": 0.10035523978685613, "grad_norm": 1.0366648839999737, "learning_rate": 2.5055432372505545e-06, "loss": 0.1557, "step": 113 }, { "epoch": 0.10124333925399645, "grad_norm": 1.4059313118048324, "learning_rate": 2.5277161862527716e-06, "loss": 0.1499, "step": 114 }, { "epoch": 0.10213143872113677, "grad_norm": 2.3968727543849626, "learning_rate": 2.549889135254989e-06, "loss": 0.1517, "step": 115 }, { "epoch": 0.10301953818827708, "grad_norm": 1.339768764551887, "learning_rate": 2.5720620842572062e-06, "loss": 0.1321, "step": 116 }, { "epoch": 0.10390763765541741, "grad_norm": 1.4530407107366679, "learning_rate": 2.5942350332594233e-06, "loss": 0.1197, "step": 117 }, { "epoch": 0.10479573712255773, "grad_norm": 1.5225116947078698, "learning_rate": 2.6164079822616413e-06, "loss": 0.133, "step": 118 }, { "epoch": 0.10568383658969804, "grad_norm": 1.1686399652945325, "learning_rate": 2.6385809312638584e-06, "loss": 0.1354, "step": 119 }, { "epoch": 0.10657193605683836, "grad_norm": 1.6020618834435372, "learning_rate": 2.6607538802660755e-06, "loss": 0.1367, "step": 120 }, { "epoch": 0.10746003552397869, "grad_norm": 1.1988715520142583, "learning_rate": 2.682926829268293e-06, "loss": 0.1403, "step": 121 }, { "epoch": 0.108348134991119, "grad_norm": 1.0364273812267792, "learning_rate": 2.70509977827051e-06, "loss": 0.1318, "step": 122 }, { "epoch": 0.10923623445825932, "grad_norm": 1.1057236878883572, "learning_rate": 2.7272727272727272e-06, "loss": 0.1392, "step": 123 }, { "epoch": 0.11012433392539965, "grad_norm": 1.1277933112704228, "learning_rate": 2.7494456762749448e-06, "loss": 0.1256, "step": 124 }, { "epoch": 0.11101243339253997, "grad_norm": 1.0659897014429838, "learning_rate": 2.7716186252771623e-06, "loss": 0.1248, "step": 125 }, { "epoch": 0.11190053285968028, "grad_norm": 1.186164779320272, "learning_rate": 2.7937915742793794e-06, "loss": 0.116, "step": 126 }, { "epoch": 0.1127886323268206, "grad_norm": 1.6226791741262894, "learning_rate": 2.815964523281597e-06, "loss": 0.126, "step": 127 }, { "epoch": 0.11367673179396093, "grad_norm": 1.0482165891480628, "learning_rate": 2.838137472283814e-06, "loss": 0.1207, "step": 128 }, { "epoch": 0.11456483126110124, "grad_norm": 1.358096964021043, "learning_rate": 2.860310421286031e-06, "loss": 0.1558, "step": 129 }, { "epoch": 0.11545293072824156, "grad_norm": 1.9653651902270786, "learning_rate": 2.8824833702882487e-06, "loss": 0.1464, "step": 130 }, { "epoch": 0.11634103019538189, "grad_norm": 0.8888278900485693, "learning_rate": 2.9046563192904658e-06, "loss": 0.114, "step": 131 }, { "epoch": 0.1172291296625222, "grad_norm": 1.0600150312970615, "learning_rate": 2.926829268292683e-06, "loss": 0.1442, "step": 132 }, { "epoch": 0.11811722912966252, "grad_norm": 2.3980686456454072, "learning_rate": 2.949002217294901e-06, "loss": 0.1531, "step": 133 }, { "epoch": 0.11900532859680284, "grad_norm": 1.537750142292643, "learning_rate": 2.971175166297118e-06, "loss": 0.136, "step": 134 }, { "epoch": 0.11989342806394317, "grad_norm": 1.1173719039790706, "learning_rate": 2.993348115299335e-06, "loss": 0.1227, "step": 135 }, { "epoch": 0.12078152753108348, "grad_norm": 1.2564664316562757, "learning_rate": 3.0155210643015526e-06, "loss": 0.1103, "step": 136 }, { "epoch": 0.1216696269982238, "grad_norm": 1.015330802209704, "learning_rate": 3.0376940133037697e-06, "loss": 0.1249, "step": 137 }, { "epoch": 0.12255772646536411, "grad_norm": 0.9052921105923755, "learning_rate": 3.0598669623059868e-06, "loss": 0.1065, "step": 138 }, { "epoch": 0.12344582593250444, "grad_norm": 1.554369007870476, "learning_rate": 3.0820399113082043e-06, "loss": 0.1253, "step": 139 }, { "epoch": 0.12433392539964476, "grad_norm": 1.0994838247729177, "learning_rate": 3.1042128603104214e-06, "loss": 0.1268, "step": 140 }, { "epoch": 0.12522202486678508, "grad_norm": 1.289812156765464, "learning_rate": 3.1263858093126385e-06, "loss": 0.1316, "step": 141 }, { "epoch": 0.1261101243339254, "grad_norm": 1.117530196743946, "learning_rate": 3.1485587583148565e-06, "loss": 0.1205, "step": 142 }, { "epoch": 0.1269982238010657, "grad_norm": 1.4963202257356378, "learning_rate": 3.1707317073170736e-06, "loss": 0.1282, "step": 143 }, { "epoch": 0.12788632326820604, "grad_norm": 1.2143329167513737, "learning_rate": 3.1929046563192907e-06, "loss": 0.1268, "step": 144 }, { "epoch": 0.12877442273534637, "grad_norm": 1.0842233342893233, "learning_rate": 3.215077605321508e-06, "loss": 0.1186, "step": 145 }, { "epoch": 0.12966252220248667, "grad_norm": 1.7013265505553796, "learning_rate": 3.2372505543237253e-06, "loss": 0.1118, "step": 146 }, { "epoch": 0.130550621669627, "grad_norm": 1.6188620303249415, "learning_rate": 3.2594235033259424e-06, "loss": 0.1589, "step": 147 }, { "epoch": 0.13143872113676733, "grad_norm": 1.0009419982608598, "learning_rate": 3.2815964523281604e-06, "loss": 0.1173, "step": 148 }, { "epoch": 0.13232682060390763, "grad_norm": 1.02275400737328, "learning_rate": 3.3037694013303775e-06, "loss": 0.1191, "step": 149 }, { "epoch": 0.13321492007104796, "grad_norm": 1.0442081136592782, "learning_rate": 3.3259423503325946e-06, "loss": 0.1425, "step": 150 }, { "epoch": 0.1341030195381883, "grad_norm": 1.075564124443683, "learning_rate": 3.348115299334812e-06, "loss": 0.0982, "step": 151 }, { "epoch": 0.1349911190053286, "grad_norm": 1.064389736484248, "learning_rate": 3.370288248337029e-06, "loss": 0.1211, "step": 152 }, { "epoch": 0.13587921847246892, "grad_norm": 1.0766274819279984, "learning_rate": 3.3924611973392463e-06, "loss": 0.1267, "step": 153 }, { "epoch": 0.13676731793960922, "grad_norm": 0.9372935900244141, "learning_rate": 3.414634146341464e-06, "loss": 0.1287, "step": 154 }, { "epoch": 0.13765541740674955, "grad_norm": 0.9947383850408118, "learning_rate": 3.436807095343681e-06, "loss": 0.1192, "step": 155 }, { "epoch": 0.13854351687388988, "grad_norm": 1.1107344059739117, "learning_rate": 3.458980044345898e-06, "loss": 0.1169, "step": 156 }, { "epoch": 0.13943161634103018, "grad_norm": 1.029883982289853, "learning_rate": 3.481152993348116e-06, "loss": 0.1256, "step": 157 }, { "epoch": 0.14031971580817051, "grad_norm": 1.1727622617313658, "learning_rate": 3.503325942350333e-06, "loss": 0.136, "step": 158 }, { "epoch": 0.14120781527531084, "grad_norm": 0.8288001336590112, "learning_rate": 3.52549889135255e-06, "loss": 0.1064, "step": 159 }, { "epoch": 0.14209591474245115, "grad_norm": 1.210344473508419, "learning_rate": 3.5476718403547677e-06, "loss": 0.1321, "step": 160 }, { "epoch": 0.14298401420959148, "grad_norm": 1.0298292706998382, "learning_rate": 3.569844789356985e-06, "loss": 0.1063, "step": 161 }, { "epoch": 0.1438721136767318, "grad_norm": 1.1291599864754656, "learning_rate": 3.592017738359202e-06, "loss": 0.1247, "step": 162 }, { "epoch": 0.1447602131438721, "grad_norm": 1.1019933938699389, "learning_rate": 3.614190687361419e-06, "loss": 0.1377, "step": 163 }, { "epoch": 0.14564831261101244, "grad_norm": 0.8189641520937786, "learning_rate": 3.6363636363636366e-06, "loss": 0.0991, "step": 164 }, { "epoch": 0.14653641207815277, "grad_norm": 1.5124026105918429, "learning_rate": 3.6585365853658537e-06, "loss": 0.1348, "step": 165 }, { "epoch": 0.14742451154529307, "grad_norm": 1.2711556676869127, "learning_rate": 3.680709534368071e-06, "loss": 0.1206, "step": 166 }, { "epoch": 0.1483126110124334, "grad_norm": 0.7604195531895337, "learning_rate": 3.7028824833702887e-06, "loss": 0.0998, "step": 167 }, { "epoch": 0.1492007104795737, "grad_norm": 0.901217436396349, "learning_rate": 3.725055432372506e-06, "loss": 0.1047, "step": 168 }, { "epoch": 0.15008880994671403, "grad_norm": 1.3122925202426214, "learning_rate": 3.747228381374723e-06, "loss": 0.1368, "step": 169 }, { "epoch": 0.15097690941385436, "grad_norm": 1.1806821018774798, "learning_rate": 3.7694013303769405e-06, "loss": 0.111, "step": 170 }, { "epoch": 0.15186500888099466, "grad_norm": 1.220431749996127, "learning_rate": 3.7915742793791576e-06, "loss": 0.1558, "step": 171 }, { "epoch": 0.152753108348135, "grad_norm": 1.027739869330392, "learning_rate": 3.8137472283813747e-06, "loss": 0.1261, "step": 172 }, { "epoch": 0.15364120781527532, "grad_norm": 0.9958834179246624, "learning_rate": 3.835920177383592e-06, "loss": 0.1034, "step": 173 }, { "epoch": 0.15452930728241562, "grad_norm": 0.9734421309270681, "learning_rate": 3.858093126385809e-06, "loss": 0.1164, "step": 174 }, { "epoch": 0.15541740674955595, "grad_norm": 1.1663068744058451, "learning_rate": 3.8802660753880264e-06, "loss": 0.1232, "step": 175 }, { "epoch": 0.15630550621669628, "grad_norm": 1.2152791890173018, "learning_rate": 3.902439024390244e-06, "loss": 0.1307, "step": 176 }, { "epoch": 0.15719360568383658, "grad_norm": 1.1930092764069682, "learning_rate": 3.9246119733924615e-06, "loss": 0.12, "step": 177 }, { "epoch": 0.15808170515097691, "grad_norm": 1.4229478314887962, "learning_rate": 3.946784922394679e-06, "loss": 0.1139, "step": 178 }, { "epoch": 0.15896980461811722, "grad_norm": 1.1541778733556443, "learning_rate": 3.9689578713968965e-06, "loss": 0.1493, "step": 179 }, { "epoch": 0.15985790408525755, "grad_norm": 0.8900810526022614, "learning_rate": 3.991130820399114e-06, "loss": 0.0989, "step": 180 }, { "epoch": 0.16074600355239788, "grad_norm": 0.9862154082139323, "learning_rate": 4.013303769401331e-06, "loss": 0.1066, "step": 181 }, { "epoch": 0.16163410301953818, "grad_norm": 1.0129924983516876, "learning_rate": 4.035476718403548e-06, "loss": 0.1246, "step": 182 }, { "epoch": 0.1625222024866785, "grad_norm": 0.8334997069061231, "learning_rate": 4.057649667405765e-06, "loss": 0.1113, "step": 183 }, { "epoch": 0.16341030195381884, "grad_norm": 1.3871494257343306, "learning_rate": 4.079822616407982e-06, "loss": 0.1127, "step": 184 }, { "epoch": 0.16429840142095914, "grad_norm": 1.3181794682195818, "learning_rate": 4.1019955654102e-06, "loss": 0.1345, "step": 185 }, { "epoch": 0.16518650088809947, "grad_norm": 0.986271734827267, "learning_rate": 4.124168514412417e-06, "loss": 0.1154, "step": 186 }, { "epoch": 0.1660746003552398, "grad_norm": 0.896943232410562, "learning_rate": 4.146341463414634e-06, "loss": 0.1107, "step": 187 }, { "epoch": 0.1669626998223801, "grad_norm": 1.2040214431740082, "learning_rate": 4.168514412416852e-06, "loss": 0.121, "step": 188 }, { "epoch": 0.16785079928952043, "grad_norm": 1.1911353300667678, "learning_rate": 4.190687361419069e-06, "loss": 0.1208, "step": 189 }, { "epoch": 0.16873889875666073, "grad_norm": 1.2508431970298757, "learning_rate": 4.212860310421286e-06, "loss": 0.1024, "step": 190 }, { "epoch": 0.16962699822380106, "grad_norm": 1.6842128262347582, "learning_rate": 4.2350332594235035e-06, "loss": 0.1081, "step": 191 }, { "epoch": 0.1705150976909414, "grad_norm": 1.0369558664708232, "learning_rate": 4.257206208425721e-06, "loss": 0.1111, "step": 192 }, { "epoch": 0.1714031971580817, "grad_norm": 0.8454133452905808, "learning_rate": 4.279379157427938e-06, "loss": 0.1006, "step": 193 }, { "epoch": 0.17229129662522202, "grad_norm": 1.12897982508959, "learning_rate": 4.301552106430156e-06, "loss": 0.1133, "step": 194 }, { "epoch": 0.17317939609236235, "grad_norm": 0.9532666539200393, "learning_rate": 4.323725055432373e-06, "loss": 0.1032, "step": 195 }, { "epoch": 0.17406749555950266, "grad_norm": 1.0733733404707433, "learning_rate": 4.34589800443459e-06, "loss": 0.123, "step": 196 }, { "epoch": 0.17495559502664298, "grad_norm": 0.7809602870024568, "learning_rate": 4.368070953436808e-06, "loss": 0.0967, "step": 197 }, { "epoch": 0.17584369449378331, "grad_norm": 1.2871583905784865, "learning_rate": 4.390243902439025e-06, "loss": 0.1414, "step": 198 }, { "epoch": 0.17673179396092362, "grad_norm": 0.8487919353091347, "learning_rate": 4.412416851441242e-06, "loss": 0.099, "step": 199 }, { "epoch": 0.17761989342806395, "grad_norm": 0.8803605777922451, "learning_rate": 4.434589800443459e-06, "loss": 0.1001, "step": 200 }, { "epoch": 0.17850799289520428, "grad_norm": 0.8101636456213316, "learning_rate": 4.456762749445676e-06, "loss": 0.1035, "step": 201 }, { "epoch": 0.17939609236234458, "grad_norm": 0.9967375332978891, "learning_rate": 4.478935698447893e-06, "loss": 0.1132, "step": 202 }, { "epoch": 0.1802841918294849, "grad_norm": 0.8752029350579277, "learning_rate": 4.501108647450111e-06, "loss": 0.1073, "step": 203 }, { "epoch": 0.1811722912966252, "grad_norm": 0.9312003004475127, "learning_rate": 4.523281596452328e-06, "loss": 0.1437, "step": 204 }, { "epoch": 0.18206039076376554, "grad_norm": 0.9249575582110223, "learning_rate": 4.5454545454545455e-06, "loss": 0.1321, "step": 205 }, { "epoch": 0.18294849023090587, "grad_norm": 0.9302778787096789, "learning_rate": 4.5676274944567635e-06, "loss": 0.1106, "step": 206 }, { "epoch": 0.18383658969804617, "grad_norm": 0.997721919364688, "learning_rate": 4.5898004434589806e-06, "loss": 0.1197, "step": 207 }, { "epoch": 0.1847246891651865, "grad_norm": 0.8646784783181414, "learning_rate": 4.611973392461198e-06, "loss": 0.107, "step": 208 }, { "epoch": 0.18561278863232683, "grad_norm": 0.878322984730142, "learning_rate": 4.634146341463416e-06, "loss": 0.1009, "step": 209 }, { "epoch": 0.18650088809946713, "grad_norm": 1.0258913988437643, "learning_rate": 4.656319290465633e-06, "loss": 0.1204, "step": 210 }, { "epoch": 0.18738898756660746, "grad_norm": 0.7285340355199612, "learning_rate": 4.67849223946785e-06, "loss": 0.1026, "step": 211 }, { "epoch": 0.1882770870337478, "grad_norm": 1.1552769161453678, "learning_rate": 4.700665188470067e-06, "loss": 0.1099, "step": 212 }, { "epoch": 0.1891651865008881, "grad_norm": 0.9048845643790788, "learning_rate": 4.722838137472284e-06, "loss": 0.1118, "step": 213 }, { "epoch": 0.19005328596802842, "grad_norm": 0.6551802366319263, "learning_rate": 4.745011086474501e-06, "loss": 0.0927, "step": 214 }, { "epoch": 0.19094138543516873, "grad_norm": 1.1989568948073477, "learning_rate": 4.767184035476719e-06, "loss": 0.1316, "step": 215 }, { "epoch": 0.19182948490230906, "grad_norm": 1.3977062892280108, "learning_rate": 4.789356984478936e-06, "loss": 0.1504, "step": 216 }, { "epoch": 0.19271758436944939, "grad_norm": 1.0870233932912872, "learning_rate": 4.811529933481153e-06, "loss": 0.1156, "step": 217 }, { "epoch": 0.1936056838365897, "grad_norm": 0.8488857557091133, "learning_rate": 4.833702882483371e-06, "loss": 0.1092, "step": 218 }, { "epoch": 0.19449378330373002, "grad_norm": 0.9902013418159533, "learning_rate": 4.855875831485588e-06, "loss": 0.1165, "step": 219 }, { "epoch": 0.19538188277087035, "grad_norm": 1.0948237904417308, "learning_rate": 4.8780487804878055e-06, "loss": 0.1153, "step": 220 }, { "epoch": 0.19626998223801065, "grad_norm": 1.191423921883629, "learning_rate": 4.900221729490023e-06, "loss": 0.1291, "step": 221 }, { "epoch": 0.19715808170515098, "grad_norm": 0.7823243340205541, "learning_rate": 4.92239467849224e-06, "loss": 0.0987, "step": 222 }, { "epoch": 0.1980461811722913, "grad_norm": 1.1222921015784384, "learning_rate": 4.944567627494457e-06, "loss": 0.1162, "step": 223 }, { "epoch": 0.1989342806394316, "grad_norm": 1.0874257819130313, "learning_rate": 4.966740576496675e-06, "loss": 0.1197, "step": 224 }, { "epoch": 0.19982238010657194, "grad_norm": 1.0791997810485823, "learning_rate": 4.988913525498892e-06, "loss": 0.1133, "step": 225 }, { "epoch": 0.20071047957371227, "grad_norm": 1.329628162108887, "learning_rate": 5.011086474501109e-06, "loss": 0.1158, "step": 226 }, { "epoch": 0.20159857904085257, "grad_norm": 0.7819188176330516, "learning_rate": 5.033259423503326e-06, "loss": 0.0937, "step": 227 }, { "epoch": 0.2024866785079929, "grad_norm": 0.7711644719096397, "learning_rate": 5.055432372505543e-06, "loss": 0.101, "step": 228 }, { "epoch": 0.2033747779751332, "grad_norm": 1.2128139947821643, "learning_rate": 5.077605321507761e-06, "loss": 0.1025, "step": 229 }, { "epoch": 0.20426287744227353, "grad_norm": 1.0640410043385014, "learning_rate": 5.099778270509978e-06, "loss": 0.1111, "step": 230 }, { "epoch": 0.20515097690941386, "grad_norm": 1.0170744010005723, "learning_rate": 5.121951219512195e-06, "loss": 0.1211, "step": 231 }, { "epoch": 0.20603907637655416, "grad_norm": 1.109251802320127, "learning_rate": 5.1441241685144124e-06, "loss": 0.1427, "step": 232 }, { "epoch": 0.2069271758436945, "grad_norm": 0.9589611675842314, "learning_rate": 5.1662971175166295e-06, "loss": 0.1183, "step": 233 }, { "epoch": 0.20781527531083482, "grad_norm": 1.109878614980834, "learning_rate": 5.188470066518847e-06, "loss": 0.1069, "step": 234 }, { "epoch": 0.20870337477797513, "grad_norm": 0.9525740331445137, "learning_rate": 5.2106430155210654e-06, "loss": 0.0877, "step": 235 }, { "epoch": 0.20959147424511546, "grad_norm": 1.1332712198064505, "learning_rate": 5.2328159645232826e-06, "loss": 0.1179, "step": 236 }, { "epoch": 0.21047957371225579, "grad_norm": 1.165084609634557, "learning_rate": 5.2549889135255e-06, "loss": 0.1158, "step": 237 }, { "epoch": 0.2113676731793961, "grad_norm": 0.9551403627646942, "learning_rate": 5.277161862527717e-06, "loss": 0.1202, "step": 238 }, { "epoch": 0.21225577264653642, "grad_norm": 1.3132605084853501, "learning_rate": 5.299334811529934e-06, "loss": 0.1023, "step": 239 }, { "epoch": 0.21314387211367672, "grad_norm": 1.1918356065854254, "learning_rate": 5.321507760532151e-06, "loss": 0.13, "step": 240 }, { "epoch": 0.21403197158081705, "grad_norm": 0.8732190922428062, "learning_rate": 5.343680709534369e-06, "loss": 0.0939, "step": 241 }, { "epoch": 0.21492007104795738, "grad_norm": 0.7943260488358588, "learning_rate": 5.365853658536586e-06, "loss": 0.0971, "step": 242 }, { "epoch": 0.21580817051509768, "grad_norm": 1.1992570718913784, "learning_rate": 5.388026607538803e-06, "loss": 0.1154, "step": 243 }, { "epoch": 0.216696269982238, "grad_norm": 1.2103166864927497, "learning_rate": 5.41019955654102e-06, "loss": 0.1279, "step": 244 }, { "epoch": 0.21758436944937834, "grad_norm": 0.9755034574356769, "learning_rate": 5.432372505543237e-06, "loss": 0.1115, "step": 245 }, { "epoch": 0.21847246891651864, "grad_norm": 1.1826367678301064, "learning_rate": 5.4545454545454545e-06, "loss": 0.1186, "step": 246 }, { "epoch": 0.21936056838365897, "grad_norm": 1.0911959641988822, "learning_rate": 5.476718403547672e-06, "loss": 0.1067, "step": 247 }, { "epoch": 0.2202486678507993, "grad_norm": 0.8868011155114296, "learning_rate": 5.4988913525498895e-06, "loss": 0.1179, "step": 248 }, { "epoch": 0.2211367673179396, "grad_norm": 1.2833847038326471, "learning_rate": 5.5210643015521075e-06, "loss": 0.129, "step": 249 }, { "epoch": 0.22202486678507993, "grad_norm": 1.3643340180187968, "learning_rate": 5.5432372505543246e-06, "loss": 0.1212, "step": 250 }, { "epoch": 0.22291296625222023, "grad_norm": 1.2843488941841317, "learning_rate": 5.565410199556542e-06, "loss": 0.1265, "step": 251 }, { "epoch": 0.22380106571936056, "grad_norm": 0.812674418729087, "learning_rate": 5.587583148558759e-06, "loss": 0.0932, "step": 252 }, { "epoch": 0.2246891651865009, "grad_norm": 0.9792111463263631, "learning_rate": 5.609756097560977e-06, "loss": 0.1004, "step": 253 }, { "epoch": 0.2255772646536412, "grad_norm": 1.8989961235893313, "learning_rate": 5.631929046563194e-06, "loss": 0.1256, "step": 254 }, { "epoch": 0.22646536412078153, "grad_norm": 1.1308233607195404, "learning_rate": 5.654101995565411e-06, "loss": 0.1318, "step": 255 }, { "epoch": 0.22735346358792186, "grad_norm": 1.675009909952044, "learning_rate": 5.676274944567628e-06, "loss": 0.1031, "step": 256 }, { "epoch": 0.22824156305506216, "grad_norm": 0.8060616010785078, "learning_rate": 5.698447893569845e-06, "loss": 0.0893, "step": 257 }, { "epoch": 0.2291296625222025, "grad_norm": 1.1704861373713222, "learning_rate": 5.720620842572062e-06, "loss": 0.1351, "step": 258 }, { "epoch": 0.23001776198934282, "grad_norm": 0.8444420732939596, "learning_rate": 5.742793791574279e-06, "loss": 0.0969, "step": 259 }, { "epoch": 0.23090586145648312, "grad_norm": 1.2877473673171203, "learning_rate": 5.764966740576497e-06, "loss": 0.1346, "step": 260 }, { "epoch": 0.23179396092362345, "grad_norm": 0.8334646425787372, "learning_rate": 5.787139689578714e-06, "loss": 0.0983, "step": 261 }, { "epoch": 0.23268206039076378, "grad_norm": 0.9171036171575526, "learning_rate": 5.8093126385809315e-06, "loss": 0.0939, "step": 262 }, { "epoch": 0.23357015985790408, "grad_norm": 0.8826773379519395, "learning_rate": 5.831485587583149e-06, "loss": 0.1108, "step": 263 }, { "epoch": 0.2344582593250444, "grad_norm": 0.8803411228533244, "learning_rate": 5.853658536585366e-06, "loss": 0.0892, "step": 264 }, { "epoch": 0.2353463587921847, "grad_norm": 0.9593996610025081, "learning_rate": 5.875831485587583e-06, "loss": 0.1114, "step": 265 }, { "epoch": 0.23623445825932504, "grad_norm": 0.8473811030250791, "learning_rate": 5.898004434589802e-06, "loss": 0.0983, "step": 266 }, { "epoch": 0.23712255772646537, "grad_norm": 0.8900843974703818, "learning_rate": 5.920177383592019e-06, "loss": 0.1167, "step": 267 }, { "epoch": 0.23801065719360567, "grad_norm": 0.9318764854171055, "learning_rate": 5.942350332594236e-06, "loss": 0.1007, "step": 268 }, { "epoch": 0.238898756660746, "grad_norm": 0.9402921543885013, "learning_rate": 5.964523281596453e-06, "loss": 0.1084, "step": 269 }, { "epoch": 0.23978685612788633, "grad_norm": 1.5655719582905685, "learning_rate": 5.98669623059867e-06, "loss": 0.1403, "step": 270 }, { "epoch": 0.24067495559502664, "grad_norm": 1.0285971062943506, "learning_rate": 6.008869179600887e-06, "loss": 0.1148, "step": 271 }, { "epoch": 0.24156305506216696, "grad_norm": 0.88569050368874, "learning_rate": 6.031042128603105e-06, "loss": 0.1122, "step": 272 }, { "epoch": 0.2424511545293073, "grad_norm": 0.9349873568776979, "learning_rate": 6.053215077605322e-06, "loss": 0.0991, "step": 273 }, { "epoch": 0.2433392539964476, "grad_norm": 0.9162971549498059, "learning_rate": 6.075388026607539e-06, "loss": 0.101, "step": 274 }, { "epoch": 0.24422735346358793, "grad_norm": 0.8286328678796847, "learning_rate": 6.0975609756097564e-06, "loss": 0.0943, "step": 275 }, { "epoch": 0.24511545293072823, "grad_norm": 1.1987117793960447, "learning_rate": 6.1197339246119735e-06, "loss": 0.1305, "step": 276 }, { "epoch": 0.24600355239786856, "grad_norm": 1.0475168234127854, "learning_rate": 6.141906873614191e-06, "loss": 0.1118, "step": 277 }, { "epoch": 0.2468916518650089, "grad_norm": 0.9193825808839732, "learning_rate": 6.164079822616409e-06, "loss": 0.1094, "step": 278 }, { "epoch": 0.2477797513321492, "grad_norm": 0.8372396630709162, "learning_rate": 6.186252771618626e-06, "loss": 0.1011, "step": 279 }, { "epoch": 0.24866785079928952, "grad_norm": 1.2776492189182604, "learning_rate": 6.208425720620843e-06, "loss": 0.1091, "step": 280 }, { "epoch": 0.24955595026642985, "grad_norm": 0.9827831306483911, "learning_rate": 6.23059866962306e-06, "loss": 0.1033, "step": 281 }, { "epoch": 0.25044404973357015, "grad_norm": 0.8117968866243302, "learning_rate": 6.252771618625277e-06, "loss": 0.1083, "step": 282 }, { "epoch": 0.25133214920071045, "grad_norm": 0.9259173802915021, "learning_rate": 6.274944567627494e-06, "loss": 0.1191, "step": 283 }, { "epoch": 0.2522202486678508, "grad_norm": 0.9560506019339821, "learning_rate": 6.297117516629713e-06, "loss": 0.1127, "step": 284 }, { "epoch": 0.2531083481349911, "grad_norm": 1.0450882846146754, "learning_rate": 6.31929046563193e-06, "loss": 0.1047, "step": 285 }, { "epoch": 0.2539964476021314, "grad_norm": 1.1396885068276017, "learning_rate": 6.341463414634147e-06, "loss": 0.1066, "step": 286 }, { "epoch": 0.25488454706927177, "grad_norm": 0.9982877841646272, "learning_rate": 6.363636363636364e-06, "loss": 0.1052, "step": 287 }, { "epoch": 0.2557726465364121, "grad_norm": 1.0559192079092554, "learning_rate": 6.385809312638581e-06, "loss": 0.1125, "step": 288 }, { "epoch": 0.2566607460035524, "grad_norm": 0.8756361747433646, "learning_rate": 6.4079822616407984e-06, "loss": 0.0906, "step": 289 }, { "epoch": 0.25754884547069273, "grad_norm": 1.8607634927392362, "learning_rate": 6.430155210643016e-06, "loss": 0.1199, "step": 290 }, { "epoch": 0.25843694493783304, "grad_norm": 1.1933913566515744, "learning_rate": 6.4523281596452335e-06, "loss": 0.0967, "step": 291 }, { "epoch": 0.25932504440497334, "grad_norm": 1.0732491347021387, "learning_rate": 6.474501108647451e-06, "loss": 0.1172, "step": 292 }, { "epoch": 0.2602131438721137, "grad_norm": 1.0464473664142486, "learning_rate": 6.496674057649668e-06, "loss": 0.1058, "step": 293 }, { "epoch": 0.261101243339254, "grad_norm": 1.390606848296661, "learning_rate": 6.518847006651885e-06, "loss": 0.1161, "step": 294 }, { "epoch": 0.2619893428063943, "grad_norm": 0.8469819515334254, "learning_rate": 6.541019955654102e-06, "loss": 0.0845, "step": 295 }, { "epoch": 0.26287744227353466, "grad_norm": 0.852351395715418, "learning_rate": 6.563192904656321e-06, "loss": 0.1043, "step": 296 }, { "epoch": 0.26376554174067496, "grad_norm": 0.6890587841071393, "learning_rate": 6.585365853658538e-06, "loss": 0.0844, "step": 297 }, { "epoch": 0.26465364120781526, "grad_norm": 0.9415217807109038, "learning_rate": 6.607538802660755e-06, "loss": 0.0876, "step": 298 }, { "epoch": 0.2655417406749556, "grad_norm": 1.0001337064125344, "learning_rate": 6.629711751662972e-06, "loss": 0.0905, "step": 299 }, { "epoch": 0.2664298401420959, "grad_norm": 0.9285830454903911, "learning_rate": 6.651884700665189e-06, "loss": 0.1065, "step": 300 }, { "epoch": 0.2673179396092362, "grad_norm": 0.7175570567997788, "learning_rate": 6.674057649667406e-06, "loss": 0.0941, "step": 301 }, { "epoch": 0.2682060390763766, "grad_norm": 0.8639952187699179, "learning_rate": 6.696230598669624e-06, "loss": 0.1102, "step": 302 }, { "epoch": 0.2690941385435169, "grad_norm": 0.8941541155162261, "learning_rate": 6.718403547671841e-06, "loss": 0.1015, "step": 303 }, { "epoch": 0.2699822380106572, "grad_norm": 0.8487330548553126, "learning_rate": 6.740576496674058e-06, "loss": 0.1147, "step": 304 }, { "epoch": 0.27087033747779754, "grad_norm": 1.0437605088989104, "learning_rate": 6.7627494456762755e-06, "loss": 0.1006, "step": 305 }, { "epoch": 0.27175843694493784, "grad_norm": 0.9407640851480744, "learning_rate": 6.784922394678493e-06, "loss": 0.0969, "step": 306 }, { "epoch": 0.27264653641207814, "grad_norm": 0.8117833769010849, "learning_rate": 6.80709534368071e-06, "loss": 0.1014, "step": 307 }, { "epoch": 0.27353463587921845, "grad_norm": 0.8114003548176898, "learning_rate": 6.829268292682928e-06, "loss": 0.0979, "step": 308 }, { "epoch": 0.2744227353463588, "grad_norm": 0.7633066126158762, "learning_rate": 6.851441241685145e-06, "loss": 0.1041, "step": 309 }, { "epoch": 0.2753108348134991, "grad_norm": 1.0186060599613544, "learning_rate": 6.873614190687362e-06, "loss": 0.1101, "step": 310 }, { "epoch": 0.2761989342806394, "grad_norm": 0.8807977402444054, "learning_rate": 6.895787139689579e-06, "loss": 0.1067, "step": 311 }, { "epoch": 0.27708703374777977, "grad_norm": 0.7810626374595981, "learning_rate": 6.917960088691796e-06, "loss": 0.099, "step": 312 }, { "epoch": 0.27797513321492007, "grad_norm": 0.9333340452246128, "learning_rate": 6.940133037694013e-06, "loss": 0.0972, "step": 313 }, { "epoch": 0.27886323268206037, "grad_norm": 0.9632806942955823, "learning_rate": 6.962305986696232e-06, "loss": 0.1069, "step": 314 }, { "epoch": 0.2797513321492007, "grad_norm": 0.8661746123894044, "learning_rate": 6.984478935698449e-06, "loss": 0.1104, "step": 315 }, { "epoch": 0.28063943161634103, "grad_norm": 0.9726637372731115, "learning_rate": 7.006651884700666e-06, "loss": 0.1063, "step": 316 }, { "epoch": 0.28152753108348133, "grad_norm": 0.8979446464223533, "learning_rate": 7.028824833702883e-06, "loss": 0.0886, "step": 317 }, { "epoch": 0.2824156305506217, "grad_norm": 0.9287233565372144, "learning_rate": 7.0509977827051e-06, "loss": 0.0927, "step": 318 }, { "epoch": 0.283303730017762, "grad_norm": 0.9525737964024173, "learning_rate": 7.0731707317073175e-06, "loss": 0.097, "step": 319 }, { "epoch": 0.2841918294849023, "grad_norm": 0.7230385948585585, "learning_rate": 7.0953436807095355e-06, "loss": 0.0989, "step": 320 }, { "epoch": 0.28507992895204265, "grad_norm": 0.7968698079710659, "learning_rate": 7.117516629711753e-06, "loss": 0.0837, "step": 321 }, { "epoch": 0.28596802841918295, "grad_norm": 0.8034475120364327, "learning_rate": 7.13968957871397e-06, "loss": 0.0896, "step": 322 }, { "epoch": 0.28685612788632325, "grad_norm": 1.321430422974807, "learning_rate": 7.161862527716187e-06, "loss": 0.1078, "step": 323 }, { "epoch": 0.2877442273534636, "grad_norm": 0.9085910562132562, "learning_rate": 7.184035476718404e-06, "loss": 0.098, "step": 324 }, { "epoch": 0.2886323268206039, "grad_norm": 0.7201916573679772, "learning_rate": 7.206208425720621e-06, "loss": 0.102, "step": 325 }, { "epoch": 0.2895204262877442, "grad_norm": 0.9432494926172132, "learning_rate": 7.228381374722838e-06, "loss": 0.1075, "step": 326 }, { "epoch": 0.29040852575488457, "grad_norm": 0.9498069827378723, "learning_rate": 7.250554323725056e-06, "loss": 0.0906, "step": 327 }, { "epoch": 0.2912966252220249, "grad_norm": 0.9154528005863768, "learning_rate": 7.272727272727273e-06, "loss": 0.1153, "step": 328 }, { "epoch": 0.2921847246891652, "grad_norm": 0.7264447802913219, "learning_rate": 7.29490022172949e-06, "loss": 0.09, "step": 329 }, { "epoch": 0.29307282415630553, "grad_norm": 0.9685882300583614, "learning_rate": 7.317073170731707e-06, "loss": 0.1109, "step": 330 }, { "epoch": 0.29396092362344584, "grad_norm": 1.1890607203846189, "learning_rate": 7.3392461197339245e-06, "loss": 0.1229, "step": 331 }, { "epoch": 0.29484902309058614, "grad_norm": 0.7286638790755859, "learning_rate": 7.361419068736142e-06, "loss": 0.0779, "step": 332 }, { "epoch": 0.29573712255772644, "grad_norm": 0.8339317001253415, "learning_rate": 7.38359201773836e-06, "loss": 0.1076, "step": 333 }, { "epoch": 0.2966252220248668, "grad_norm": 1.0598424258062524, "learning_rate": 7.4057649667405775e-06, "loss": 0.1043, "step": 334 }, { "epoch": 0.2975133214920071, "grad_norm": 0.8775990639677548, "learning_rate": 7.427937915742795e-06, "loss": 0.1021, "step": 335 }, { "epoch": 0.2984014209591474, "grad_norm": 0.8202595963041531, "learning_rate": 7.450110864745012e-06, "loss": 0.0938, "step": 336 }, { "epoch": 0.29928952042628776, "grad_norm": 0.7157189268067363, "learning_rate": 7.472283813747229e-06, "loss": 0.0954, "step": 337 }, { "epoch": 0.30017761989342806, "grad_norm": 0.9611095780525852, "learning_rate": 7.494456762749446e-06, "loss": 0.115, "step": 338 }, { "epoch": 0.30106571936056836, "grad_norm": 0.7193806879098371, "learning_rate": 7.516629711751664e-06, "loss": 0.0838, "step": 339 }, { "epoch": 0.3019538188277087, "grad_norm": 1.2379064911042004, "learning_rate": 7.538802660753881e-06, "loss": 0.1038, "step": 340 }, { "epoch": 0.302841918294849, "grad_norm": 0.8380165411759865, "learning_rate": 7.560975609756098e-06, "loss": 0.096, "step": 341 }, { "epoch": 0.3037300177619893, "grad_norm": 0.7842272699282425, "learning_rate": 7.583148558758315e-06, "loss": 0.1068, "step": 342 }, { "epoch": 0.3046181172291297, "grad_norm": 0.7903903611927442, "learning_rate": 7.605321507760532e-06, "loss": 0.0989, "step": 343 }, { "epoch": 0.30550621669627, "grad_norm": 0.8711590419288197, "learning_rate": 7.627494456762749e-06, "loss": 0.1033, "step": 344 }, { "epoch": 0.3063943161634103, "grad_norm": 0.7914571220374275, "learning_rate": 7.649667405764967e-06, "loss": 0.0882, "step": 345 }, { "epoch": 0.30728241563055064, "grad_norm": 0.9047258915871558, "learning_rate": 7.671840354767184e-06, "loss": 0.1079, "step": 346 }, { "epoch": 0.30817051509769094, "grad_norm": 0.6793865515086529, "learning_rate": 7.694013303769402e-06, "loss": 0.0827, "step": 347 }, { "epoch": 0.30905861456483125, "grad_norm": 0.9312853306792186, "learning_rate": 7.716186252771619e-06, "loss": 0.101, "step": 348 }, { "epoch": 0.3099467140319716, "grad_norm": 0.8024490916477357, "learning_rate": 7.738359201773836e-06, "loss": 0.1081, "step": 349 }, { "epoch": 0.3108348134991119, "grad_norm": 0.5541290987926398, "learning_rate": 7.760532150776053e-06, "loss": 0.0715, "step": 350 }, { "epoch": 0.3117229129662522, "grad_norm": 0.8662311135063834, "learning_rate": 7.782705099778272e-06, "loss": 0.1014, "step": 351 }, { "epoch": 0.31261101243339257, "grad_norm": 0.6050107863388078, "learning_rate": 7.804878048780489e-06, "loss": 0.0768, "step": 352 }, { "epoch": 0.31349911190053287, "grad_norm": 1.0089660444158453, "learning_rate": 7.827050997782706e-06, "loss": 0.1169, "step": 353 }, { "epoch": 0.31438721136767317, "grad_norm": 1.0024341093771925, "learning_rate": 7.849223946784923e-06, "loss": 0.1027, "step": 354 }, { "epoch": 0.31527531083481347, "grad_norm": 0.8423589628695387, "learning_rate": 7.87139689578714e-06, "loss": 0.0979, "step": 355 }, { "epoch": 0.31616341030195383, "grad_norm": 0.9539703915364602, "learning_rate": 7.893569844789357e-06, "loss": 0.089, "step": 356 }, { "epoch": 0.31705150976909413, "grad_norm": 1.3130291273077888, "learning_rate": 7.915742793791576e-06, "loss": 0.1095, "step": 357 }, { "epoch": 0.31793960923623443, "grad_norm": 0.8961830406796122, "learning_rate": 7.937915742793793e-06, "loss": 0.109, "step": 358 }, { "epoch": 0.3188277087033748, "grad_norm": 0.9541141598854677, "learning_rate": 7.96008869179601e-06, "loss": 0.0992, "step": 359 }, { "epoch": 0.3197158081705151, "grad_norm": 1.139854942538658, "learning_rate": 7.982261640798227e-06, "loss": 0.0879, "step": 360 }, { "epoch": 0.3206039076376554, "grad_norm": 1.0321508216179291, "learning_rate": 8.004434589800444e-06, "loss": 0.0945, "step": 361 }, { "epoch": 0.32149200710479575, "grad_norm": 0.8779246767220663, "learning_rate": 8.026607538802662e-06, "loss": 0.0941, "step": 362 }, { "epoch": 0.32238010657193605, "grad_norm": 1.0379702354670175, "learning_rate": 8.048780487804879e-06, "loss": 0.109, "step": 363 }, { "epoch": 0.32326820603907636, "grad_norm": 0.954746767681055, "learning_rate": 8.070953436807096e-06, "loss": 0.1049, "step": 364 }, { "epoch": 0.3241563055062167, "grad_norm": 0.895515034198231, "learning_rate": 8.093126385809313e-06, "loss": 0.0818, "step": 365 }, { "epoch": 0.325044404973357, "grad_norm": 0.9922258085230082, "learning_rate": 8.11529933481153e-06, "loss": 0.1029, "step": 366 }, { "epoch": 0.3259325044404973, "grad_norm": 0.9009685079488943, "learning_rate": 8.137472283813747e-06, "loss": 0.095, "step": 367 }, { "epoch": 0.3268206039076377, "grad_norm": 1.3774247173074172, "learning_rate": 8.159645232815964e-06, "loss": 0.1122, "step": 368 }, { "epoch": 0.327708703374778, "grad_norm": 0.8714121817372873, "learning_rate": 8.181818181818183e-06, "loss": 0.0723, "step": 369 }, { "epoch": 0.3285968028419183, "grad_norm": 0.7075961840882904, "learning_rate": 8.2039911308204e-06, "loss": 0.0776, "step": 370 }, { "epoch": 0.32948490230905864, "grad_norm": 0.6315560397637983, "learning_rate": 8.226164079822617e-06, "loss": 0.0834, "step": 371 }, { "epoch": 0.33037300177619894, "grad_norm": 0.8952416568044784, "learning_rate": 8.248337028824834e-06, "loss": 0.1081, "step": 372 }, { "epoch": 0.33126110124333924, "grad_norm": 0.9750507657786389, "learning_rate": 8.270509977827051e-06, "loss": 0.1021, "step": 373 }, { "epoch": 0.3321492007104796, "grad_norm": 0.6597951127128676, "learning_rate": 8.292682926829268e-06, "loss": 0.0772, "step": 374 }, { "epoch": 0.3330373001776199, "grad_norm": 1.0302393327528312, "learning_rate": 8.314855875831487e-06, "loss": 0.0928, "step": 375 }, { "epoch": 0.3339253996447602, "grad_norm": 0.714526637794305, "learning_rate": 8.337028824833704e-06, "loss": 0.0904, "step": 376 }, { "epoch": 0.33481349911190056, "grad_norm": 1.16627438770377, "learning_rate": 8.359201773835921e-06, "loss": 0.1028, "step": 377 }, { "epoch": 0.33570159857904086, "grad_norm": 0.7022637891988187, "learning_rate": 8.381374722838139e-06, "loss": 0.0864, "step": 378 }, { "epoch": 0.33658969804618116, "grad_norm": 0.9803104096407456, "learning_rate": 8.403547671840356e-06, "loss": 0.1045, "step": 379 }, { "epoch": 0.33747779751332146, "grad_norm": 0.8890791556325177, "learning_rate": 8.425720620842573e-06, "loss": 0.0928, "step": 380 }, { "epoch": 0.3383658969804618, "grad_norm": 0.8699816063506878, "learning_rate": 8.44789356984479e-06, "loss": 0.1072, "step": 381 }, { "epoch": 0.3392539964476021, "grad_norm": 1.3626912807800942, "learning_rate": 8.470066518847007e-06, "loss": 0.113, "step": 382 }, { "epoch": 0.3401420959147424, "grad_norm": 0.796850832011232, "learning_rate": 8.492239467849224e-06, "loss": 0.0911, "step": 383 }, { "epoch": 0.3410301953818828, "grad_norm": 0.9340650220252851, "learning_rate": 8.514412416851441e-06, "loss": 0.0919, "step": 384 }, { "epoch": 0.3419182948490231, "grad_norm": 0.6680290038713156, "learning_rate": 8.536585365853658e-06, "loss": 0.0865, "step": 385 }, { "epoch": 0.3428063943161634, "grad_norm": 1.0089022028279644, "learning_rate": 8.558758314855875e-06, "loss": 0.0971, "step": 386 }, { "epoch": 0.34369449378330375, "grad_norm": 0.7667339002753097, "learning_rate": 8.580931263858093e-06, "loss": 0.0974, "step": 387 }, { "epoch": 0.34458259325044405, "grad_norm": 1.1397829731941995, "learning_rate": 8.603104212860311e-06, "loss": 0.0959, "step": 388 }, { "epoch": 0.34547069271758435, "grad_norm": 0.830151012174381, "learning_rate": 8.625277161862528e-06, "loss": 0.0742, "step": 389 }, { "epoch": 0.3463587921847247, "grad_norm": 1.0064411564475326, "learning_rate": 8.647450110864746e-06, "loss": 0.1114, "step": 390 }, { "epoch": 0.347246891651865, "grad_norm": 0.9085306016933736, "learning_rate": 8.669623059866963e-06, "loss": 0.0977, "step": 391 }, { "epoch": 0.3481349911190053, "grad_norm": 0.8979237778722817, "learning_rate": 8.69179600886918e-06, "loss": 0.1125, "step": 392 }, { "epoch": 0.34902309058614567, "grad_norm": 1.101428725819367, "learning_rate": 8.713968957871397e-06, "loss": 0.1224, "step": 393 }, { "epoch": 0.34991119005328597, "grad_norm": 0.7879624686592628, "learning_rate": 8.736141906873616e-06, "loss": 0.1049, "step": 394 }, { "epoch": 0.35079928952042627, "grad_norm": 0.6892827477349056, "learning_rate": 8.758314855875833e-06, "loss": 0.0864, "step": 395 }, { "epoch": 0.35168738898756663, "grad_norm": 0.8862673113489934, "learning_rate": 8.78048780487805e-06, "loss": 0.0853, "step": 396 }, { "epoch": 0.35257548845470693, "grad_norm": 0.8648624534546676, "learning_rate": 8.802660753880267e-06, "loss": 0.0985, "step": 397 }, { "epoch": 0.35346358792184723, "grad_norm": 0.7441627676016863, "learning_rate": 8.824833702882484e-06, "loss": 0.0965, "step": 398 }, { "epoch": 0.3543516873889876, "grad_norm": 0.9456738596772384, "learning_rate": 8.847006651884701e-06, "loss": 0.1117, "step": 399 }, { "epoch": 0.3552397868561279, "grad_norm": 0.677577657857005, "learning_rate": 8.869179600886918e-06, "loss": 0.0689, "step": 400 }, { "epoch": 0.3561278863232682, "grad_norm": 0.7861698907374766, "learning_rate": 8.891352549889135e-06, "loss": 0.0894, "step": 401 }, { "epoch": 0.35701598579040855, "grad_norm": 0.9404806421842811, "learning_rate": 8.913525498891353e-06, "loss": 0.0966, "step": 402 }, { "epoch": 0.35790408525754885, "grad_norm": 0.9318679815099119, "learning_rate": 8.93569844789357e-06, "loss": 0.0911, "step": 403 }, { "epoch": 0.35879218472468916, "grad_norm": 0.5912566380660288, "learning_rate": 8.957871396895787e-06, "loss": 0.0838, "step": 404 }, { "epoch": 0.35968028419182946, "grad_norm": 0.9873336619294049, "learning_rate": 8.980044345898006e-06, "loss": 0.1021, "step": 405 }, { "epoch": 0.3605683836589698, "grad_norm": 0.9805051718815688, "learning_rate": 9.002217294900223e-06, "loss": 0.1159, "step": 406 }, { "epoch": 0.3614564831261101, "grad_norm": 0.7038213966191287, "learning_rate": 9.02439024390244e-06, "loss": 0.0887, "step": 407 }, { "epoch": 0.3623445825932504, "grad_norm": 0.7583067296422552, "learning_rate": 9.046563192904657e-06, "loss": 0.0984, "step": 408 }, { "epoch": 0.3632326820603908, "grad_norm": 0.7349365833394198, "learning_rate": 9.068736141906874e-06, "loss": 0.0868, "step": 409 }, { "epoch": 0.3641207815275311, "grad_norm": 0.74961148956145, "learning_rate": 9.090909090909091e-06, "loss": 0.0913, "step": 410 }, { "epoch": 0.3650088809946714, "grad_norm": 0.6621419287165237, "learning_rate": 9.113082039911308e-06, "loss": 0.0869, "step": 411 }, { "epoch": 0.36589698046181174, "grad_norm": 0.6809137513161271, "learning_rate": 9.135254988913527e-06, "loss": 0.0867, "step": 412 }, { "epoch": 0.36678507992895204, "grad_norm": 0.7615144304741818, "learning_rate": 9.157427937915744e-06, "loss": 0.0999, "step": 413 }, { "epoch": 0.36767317939609234, "grad_norm": 0.6837257281417417, "learning_rate": 9.179600886917961e-06, "loss": 0.0912, "step": 414 }, { "epoch": 0.3685612788632327, "grad_norm": 0.7497215509952461, "learning_rate": 9.201773835920178e-06, "loss": 0.1024, "step": 415 }, { "epoch": 0.369449378330373, "grad_norm": 0.8930543281812162, "learning_rate": 9.223946784922395e-06, "loss": 0.0907, "step": 416 }, { "epoch": 0.3703374777975133, "grad_norm": 0.65736242707052, "learning_rate": 9.246119733924612e-06, "loss": 0.087, "step": 417 }, { "epoch": 0.37122557726465366, "grad_norm": 0.7099665406423, "learning_rate": 9.268292682926831e-06, "loss": 0.0999, "step": 418 }, { "epoch": 0.37211367673179396, "grad_norm": 0.686234581166355, "learning_rate": 9.290465631929048e-06, "loss": 0.0898, "step": 419 }, { "epoch": 0.37300177619893427, "grad_norm": 0.6747224045550618, "learning_rate": 9.312638580931265e-06, "loss": 0.0771, "step": 420 }, { "epoch": 0.3738898756660746, "grad_norm": 0.6014800643412591, "learning_rate": 9.334811529933483e-06, "loss": 0.0811, "step": 421 }, { "epoch": 0.3747779751332149, "grad_norm": 0.7827277868490677, "learning_rate": 9.3569844789357e-06, "loss": 0.1097, "step": 422 }, { "epoch": 0.3756660746003552, "grad_norm": 0.7422145310046832, "learning_rate": 9.379157427937917e-06, "loss": 0.0797, "step": 423 }, { "epoch": 0.3765541740674956, "grad_norm": 0.8317608189040117, "learning_rate": 9.401330376940134e-06, "loss": 0.0802, "step": 424 }, { "epoch": 0.3774422735346359, "grad_norm": 0.8116023813284492, "learning_rate": 9.423503325942351e-06, "loss": 0.0777, "step": 425 }, { "epoch": 0.3783303730017762, "grad_norm": 0.8589221898382653, "learning_rate": 9.445676274944568e-06, "loss": 0.0873, "step": 426 }, { "epoch": 0.37921847246891655, "grad_norm": 0.9066303214042967, "learning_rate": 9.467849223946785e-06, "loss": 0.0965, "step": 427 }, { "epoch": 0.38010657193605685, "grad_norm": 1.13614698496287, "learning_rate": 9.490022172949002e-06, "loss": 0.0909, "step": 428 }, { "epoch": 0.38099467140319715, "grad_norm": 0.7486420726064874, "learning_rate": 9.51219512195122e-06, "loss": 0.1123, "step": 429 }, { "epoch": 0.38188277087033745, "grad_norm": 0.6648813988527883, "learning_rate": 9.534368070953438e-06, "loss": 0.0729, "step": 430 }, { "epoch": 0.3827708703374778, "grad_norm": 1.0031946962163094, "learning_rate": 9.556541019955655e-06, "loss": 0.098, "step": 431 }, { "epoch": 0.3836589698046181, "grad_norm": 0.7284073775670307, "learning_rate": 9.578713968957872e-06, "loss": 0.0922, "step": 432 }, { "epoch": 0.3845470692717584, "grad_norm": 0.8118365633131759, "learning_rate": 9.60088691796009e-06, "loss": 0.0969, "step": 433 }, { "epoch": 0.38543516873889877, "grad_norm": 0.6878668150492785, "learning_rate": 9.623059866962307e-06, "loss": 0.0864, "step": 434 }, { "epoch": 0.38632326820603907, "grad_norm": 0.8369281571161575, "learning_rate": 9.645232815964524e-06, "loss": 0.0905, "step": 435 }, { "epoch": 0.3872113676731794, "grad_norm": 0.7848911958376642, "learning_rate": 9.667405764966743e-06, "loss": 0.0783, "step": 436 }, { "epoch": 0.38809946714031973, "grad_norm": 0.7169928750946686, "learning_rate": 9.68957871396896e-06, "loss": 0.0809, "step": 437 }, { "epoch": 0.38898756660746003, "grad_norm": 0.8595443954435857, "learning_rate": 9.711751662971177e-06, "loss": 0.0865, "step": 438 }, { "epoch": 0.38987566607460034, "grad_norm": 0.8347165131840893, "learning_rate": 9.733924611973394e-06, "loss": 0.0981, "step": 439 }, { "epoch": 0.3907637655417407, "grad_norm": 0.7491623779751316, "learning_rate": 9.756097560975611e-06, "loss": 0.0752, "step": 440 }, { "epoch": 0.391651865008881, "grad_norm": 0.5719404469911055, "learning_rate": 9.778270509977828e-06, "loss": 0.09, "step": 441 }, { "epoch": 0.3925399644760213, "grad_norm": 1.0813454316817122, "learning_rate": 9.800443458980045e-06, "loss": 0.0761, "step": 442 }, { "epoch": 0.39342806394316165, "grad_norm": 0.7034557816282575, "learning_rate": 9.822616407982262e-06, "loss": 0.0788, "step": 443 }, { "epoch": 0.39431616341030196, "grad_norm": 0.6503808869091405, "learning_rate": 9.84478935698448e-06, "loss": 0.0735, "step": 444 }, { "epoch": 0.39520426287744226, "grad_norm": 0.8255682094309857, "learning_rate": 9.866962305986696e-06, "loss": 0.0745, "step": 445 }, { "epoch": 0.3960923623445826, "grad_norm": 0.623187790209699, "learning_rate": 9.889135254988914e-06, "loss": 0.0652, "step": 446 }, { "epoch": 0.3969804618117229, "grad_norm": 1.0289119443547698, "learning_rate": 9.91130820399113e-06, "loss": 0.0912, "step": 447 }, { "epoch": 0.3978685612788632, "grad_norm": 0.7159877064925825, "learning_rate": 9.93348115299335e-06, "loss": 0.094, "step": 448 }, { "epoch": 0.3987566607460036, "grad_norm": 0.6928649401766506, "learning_rate": 9.955654101995567e-06, "loss": 0.0766, "step": 449 }, { "epoch": 0.3996447602131439, "grad_norm": 1.7774653552366473, "learning_rate": 9.977827050997784e-06, "loss": 0.0987, "step": 450 }, { "epoch": 0.4005328596802842, "grad_norm": 0.8510762775390016, "learning_rate": 1e-05, "loss": 0.0892, "step": 451 }, { "epoch": 0.40142095914742454, "grad_norm": 0.9978840214000684, "learning_rate": 9.999998497942616e-06, "loss": 0.0956, "step": 452 }, { "epoch": 0.40230905861456484, "grad_norm": 0.7864949846643574, "learning_rate": 9.999993991771364e-06, "loss": 0.0835, "step": 453 }, { "epoch": 0.40319715808170514, "grad_norm": 0.8536233008347437, "learning_rate": 9.999986481488953e-06, "loss": 0.0972, "step": 454 }, { "epoch": 0.40408525754884544, "grad_norm": 0.9969692666322935, "learning_rate": 9.999975967099894e-06, "loss": 0.0823, "step": 455 }, { "epoch": 0.4049733570159858, "grad_norm": 0.7039412746032423, "learning_rate": 9.999962448610504e-06, "loss": 0.0871, "step": 456 }, { "epoch": 0.4058614564831261, "grad_norm": 0.7008530432074359, "learning_rate": 9.999945926028907e-06, "loss": 0.0772, "step": 457 }, { "epoch": 0.4067495559502664, "grad_norm": 0.9569874508135047, "learning_rate": 9.99992639936503e-06, "loss": 0.1082, "step": 458 }, { "epoch": 0.40763765541740676, "grad_norm": 0.696882654288798, "learning_rate": 9.999903868630602e-06, "loss": 0.0896, "step": 459 }, { "epoch": 0.40852575488454707, "grad_norm": 0.6689970831734524, "learning_rate": 9.999878333839165e-06, "loss": 0.0828, "step": 460 }, { "epoch": 0.40941385435168737, "grad_norm": 0.9693346223383633, "learning_rate": 9.999849795006055e-06, "loss": 0.0935, "step": 461 }, { "epoch": 0.4103019538188277, "grad_norm": 0.9283219979748674, "learning_rate": 9.999818252148425e-06, "loss": 0.082, "step": 462 }, { "epoch": 0.411190053285968, "grad_norm": 0.6806422342640777, "learning_rate": 9.999783705285223e-06, "loss": 0.0763, "step": 463 }, { "epoch": 0.41207815275310833, "grad_norm": 0.8149452612499798, "learning_rate": 9.999746154437206e-06, "loss": 0.0974, "step": 464 }, { "epoch": 0.4129662522202487, "grad_norm": 0.9378706598051992, "learning_rate": 9.999705599626935e-06, "loss": 0.1065, "step": 465 }, { "epoch": 0.413854351687389, "grad_norm": 0.6190653939919241, "learning_rate": 9.999662040878779e-06, "loss": 0.072, "step": 466 }, { "epoch": 0.4147424511545293, "grad_norm": 1.2939605591798196, "learning_rate": 9.999615478218904e-06, "loss": 0.0886, "step": 467 }, { "epoch": 0.41563055062166965, "grad_norm": 0.909327391547214, "learning_rate": 9.99956591167529e-06, "loss": 0.0864, "step": 468 }, { "epoch": 0.41651865008880995, "grad_norm": 0.7099744376001921, "learning_rate": 9.999513341277718e-06, "loss": 0.0868, "step": 469 }, { "epoch": 0.41740674955595025, "grad_norm": 0.7836599608091721, "learning_rate": 9.99945776705777e-06, "loss": 0.0807, "step": 470 }, { "epoch": 0.4182948490230906, "grad_norm": 0.9494608810490509, "learning_rate": 9.99939918904884e-06, "loss": 0.0936, "step": 471 }, { "epoch": 0.4191829484902309, "grad_norm": 0.8376853679370798, "learning_rate": 9.99933760728612e-06, "loss": 0.0889, "step": 472 }, { "epoch": 0.4200710479573712, "grad_norm": 0.5773413335287602, "learning_rate": 9.999273021806613e-06, "loss": 0.0773, "step": 473 }, { "epoch": 0.42095914742451157, "grad_norm": 1.0064143721408958, "learning_rate": 9.99920543264912e-06, "loss": 0.1058, "step": 474 }, { "epoch": 0.4218472468916519, "grad_norm": 0.5748082696643984, "learning_rate": 9.999134839854252e-06, "loss": 0.0741, "step": 475 }, { "epoch": 0.4227353463587922, "grad_norm": 0.878238503380513, "learning_rate": 9.999061243464424e-06, "loss": 0.0955, "step": 476 }, { "epoch": 0.42362344582593253, "grad_norm": 0.9398055447662498, "learning_rate": 9.99898464352385e-06, "loss": 0.0935, "step": 477 }, { "epoch": 0.42451154529307283, "grad_norm": 0.8328901593585009, "learning_rate": 9.998905040078557e-06, "loss": 0.0806, "step": 478 }, { "epoch": 0.42539964476021314, "grad_norm": 0.7584912885126325, "learning_rate": 9.998822433176371e-06, "loss": 0.0899, "step": 479 }, { "epoch": 0.42628774422735344, "grad_norm": 0.9436305122294443, "learning_rate": 9.998736822866926e-06, "loss": 0.0907, "step": 480 }, { "epoch": 0.4271758436944938, "grad_norm": 0.7921723045923234, "learning_rate": 9.998648209201655e-06, "loss": 0.093, "step": 481 }, { "epoch": 0.4280639431616341, "grad_norm": 0.8406677566067806, "learning_rate": 9.998556592233803e-06, "loss": 0.1007, "step": 482 }, { "epoch": 0.4289520426287744, "grad_norm": 0.6616072134539779, "learning_rate": 9.998461972018414e-06, "loss": 0.0821, "step": 483 }, { "epoch": 0.42984014209591476, "grad_norm": 0.921893217099755, "learning_rate": 9.998364348612338e-06, "loss": 0.0673, "step": 484 }, { "epoch": 0.43072824156305506, "grad_norm": 0.7245471556467965, "learning_rate": 9.998263722074228e-06, "loss": 0.0726, "step": 485 }, { "epoch": 0.43161634103019536, "grad_norm": 0.7657455636678407, "learning_rate": 9.998160092464547e-06, "loss": 0.0849, "step": 486 }, { "epoch": 0.4325044404973357, "grad_norm": 0.7852019441805916, "learning_rate": 9.998053459845552e-06, "loss": 0.0778, "step": 487 }, { "epoch": 0.433392539964476, "grad_norm": 0.9016164990467911, "learning_rate": 9.997943824281313e-06, "loss": 0.0828, "step": 488 }, { "epoch": 0.4342806394316163, "grad_norm": 0.8165736472227916, "learning_rate": 9.997831185837705e-06, "loss": 0.0738, "step": 489 }, { "epoch": 0.4351687388987567, "grad_norm": 0.7094229690268178, "learning_rate": 9.997715544582398e-06, "loss": 0.0955, "step": 490 }, { "epoch": 0.436056838365897, "grad_norm": 0.8289115556465729, "learning_rate": 9.997596900584875e-06, "loss": 0.0721, "step": 491 }, { "epoch": 0.4369449378330373, "grad_norm": 0.8771832244496556, "learning_rate": 9.99747525391642e-06, "loss": 0.1127, "step": 492 }, { "epoch": 0.43783303730017764, "grad_norm": 0.6530335329715642, "learning_rate": 9.997350604650123e-06, "loss": 0.0788, "step": 493 }, { "epoch": 0.43872113676731794, "grad_norm": 0.8271275114471274, "learning_rate": 9.99722295286087e-06, "loss": 0.0888, "step": 494 }, { "epoch": 0.43960923623445824, "grad_norm": 0.580368488902037, "learning_rate": 9.997092298625365e-06, "loss": 0.0753, "step": 495 }, { "epoch": 0.4404973357015986, "grad_norm": 0.6770794650763458, "learning_rate": 9.996958642022101e-06, "loss": 0.0755, "step": 496 }, { "epoch": 0.4413854351687389, "grad_norm": 0.6100575589466122, "learning_rate": 9.996821983131385e-06, "loss": 0.0701, "step": 497 }, { "epoch": 0.4422735346358792, "grad_norm": 0.6053371540629051, "learning_rate": 9.996682322035328e-06, "loss": 0.0697, "step": 498 }, { "epoch": 0.44316163410301956, "grad_norm": 0.7941985186400321, "learning_rate": 9.996539658817835e-06, "loss": 0.0914, "step": 499 }, { "epoch": 0.44404973357015987, "grad_norm": 0.6539713119848678, "learning_rate": 9.996393993564626e-06, "loss": 0.0952, "step": 500 }, { "epoch": 0.44493783303730017, "grad_norm": 0.7945600797488689, "learning_rate": 9.996245326363218e-06, "loss": 0.0773, "step": 501 }, { "epoch": 0.44582593250444047, "grad_norm": 0.7157119118716787, "learning_rate": 9.996093657302937e-06, "loss": 0.0703, "step": 502 }, { "epoch": 0.4467140319715808, "grad_norm": 0.8554534407228245, "learning_rate": 9.995938986474905e-06, "loss": 0.0727, "step": 503 }, { "epoch": 0.44760213143872113, "grad_norm": 0.8600565466728031, "learning_rate": 9.995781313972054e-06, "loss": 0.0771, "step": 504 }, { "epoch": 0.44849023090586143, "grad_norm": 0.7743541986016661, "learning_rate": 9.995620639889117e-06, "loss": 0.1039, "step": 505 }, { "epoch": 0.4493783303730018, "grad_norm": 0.7509971028833511, "learning_rate": 9.99545696432263e-06, "loss": 0.0951, "step": 506 }, { "epoch": 0.4502664298401421, "grad_norm": 0.5619687860129344, "learning_rate": 9.995290287370933e-06, "loss": 0.0731, "step": 507 }, { "epoch": 0.4511545293072824, "grad_norm": 0.7054714896174964, "learning_rate": 9.99512060913417e-06, "loss": 0.0833, "step": 508 }, { "epoch": 0.45204262877442275, "grad_norm": 0.6964202593719336, "learning_rate": 9.994947929714288e-06, "loss": 0.0939, "step": 509 }, { "epoch": 0.45293072824156305, "grad_norm": 0.7197509181407117, "learning_rate": 9.994772249215036e-06, "loss": 0.0924, "step": 510 }, { "epoch": 0.45381882770870335, "grad_norm": 0.7115124670291502, "learning_rate": 9.994593567741966e-06, "loss": 0.0789, "step": 511 }, { "epoch": 0.4547069271758437, "grad_norm": 0.49087310212235263, "learning_rate": 9.994411885402437e-06, "loss": 0.0711, "step": 512 }, { "epoch": 0.455595026642984, "grad_norm": 0.8345608095728034, "learning_rate": 9.994227202305604e-06, "loss": 0.086, "step": 513 }, { "epoch": 0.4564831261101243, "grad_norm": 0.5517219664337538, "learning_rate": 9.994039518562433e-06, "loss": 0.0662, "step": 514 }, { "epoch": 0.4573712255772647, "grad_norm": 0.7304217736850901, "learning_rate": 9.993848834285685e-06, "loss": 0.0932, "step": 515 }, { "epoch": 0.458259325044405, "grad_norm": 0.7885844049103651, "learning_rate": 9.99365514958993e-06, "loss": 0.0913, "step": 516 }, { "epoch": 0.4591474245115453, "grad_norm": 0.7683444302338543, "learning_rate": 9.993458464591534e-06, "loss": 0.0954, "step": 517 }, { "epoch": 0.46003552397868563, "grad_norm": 0.8089449634286497, "learning_rate": 9.993258779408676e-06, "loss": 0.0772, "step": 518 }, { "epoch": 0.46092362344582594, "grad_norm": 0.6423905451403036, "learning_rate": 9.993056094161326e-06, "loss": 0.0784, "step": 519 }, { "epoch": 0.46181172291296624, "grad_norm": 0.4846261470710185, "learning_rate": 9.992850408971267e-06, "loss": 0.0795, "step": 520 }, { "epoch": 0.4626998223801066, "grad_norm": 0.6644049663648053, "learning_rate": 9.992641723962076e-06, "loss": 0.0747, "step": 521 }, { "epoch": 0.4635879218472469, "grad_norm": 0.6315516410011365, "learning_rate": 9.992430039259135e-06, "loss": 0.0772, "step": 522 }, { "epoch": 0.4644760213143872, "grad_norm": 1.2012473367972696, "learning_rate": 9.992215354989633e-06, "loss": 0.0963, "step": 523 }, { "epoch": 0.46536412078152756, "grad_norm": 0.6833716695406203, "learning_rate": 9.991997671282554e-06, "loss": 0.0752, "step": 524 }, { "epoch": 0.46625222024866786, "grad_norm": 0.5524496659287114, "learning_rate": 9.991776988268687e-06, "loss": 0.082, "step": 525 }, { "epoch": 0.46714031971580816, "grad_norm": 0.883266570689509, "learning_rate": 9.991553306080627e-06, "loss": 0.0835, "step": 526 }, { "epoch": 0.46802841918294846, "grad_norm": 0.540053134317723, "learning_rate": 9.991326624852763e-06, "loss": 0.0619, "step": 527 }, { "epoch": 0.4689165186500888, "grad_norm": 0.8470097790352878, "learning_rate": 9.991096944721292e-06, "loss": 0.0597, "step": 528 }, { "epoch": 0.4698046181172291, "grad_norm": 0.8245285829911205, "learning_rate": 9.990864265824212e-06, "loss": 0.0915, "step": 529 }, { "epoch": 0.4706927175843694, "grad_norm": 0.7529196897577872, "learning_rate": 9.990628588301321e-06, "loss": 0.0722, "step": 530 }, { "epoch": 0.4715808170515098, "grad_norm": 0.5935927793864627, "learning_rate": 9.99038991229422e-06, "loss": 0.0847, "step": 531 }, { "epoch": 0.4724689165186501, "grad_norm": 0.8460052408178522, "learning_rate": 9.99014823794631e-06, "loss": 0.0928, "step": 532 }, { "epoch": 0.4733570159857904, "grad_norm": 0.7733706990544129, "learning_rate": 9.989903565402794e-06, "loss": 0.0852, "step": 533 }, { "epoch": 0.47424511545293074, "grad_norm": 0.6112274916254049, "learning_rate": 9.98965589481068e-06, "loss": 0.0746, "step": 534 }, { "epoch": 0.47513321492007105, "grad_norm": 0.6754612378104622, "learning_rate": 9.989405226318772e-06, "loss": 0.0777, "step": 535 }, { "epoch": 0.47602131438721135, "grad_norm": 0.5649455662700742, "learning_rate": 9.989151560077678e-06, "loss": 0.0808, "step": 536 }, { "epoch": 0.4769094138543517, "grad_norm": 0.6626006114678178, "learning_rate": 9.988894896239806e-06, "loss": 0.0673, "step": 537 }, { "epoch": 0.477797513321492, "grad_norm": 0.7864510413434515, "learning_rate": 9.988635234959364e-06, "loss": 0.079, "step": 538 }, { "epoch": 0.4786856127886323, "grad_norm": 0.7460100071946164, "learning_rate": 9.988372576392366e-06, "loss": 0.0875, "step": 539 }, { "epoch": 0.47957371225577267, "grad_norm": 0.6818272861244811, "learning_rate": 9.988106920696621e-06, "loss": 0.0881, "step": 540 }, { "epoch": 0.48046181172291297, "grad_norm": 0.9752286673524108, "learning_rate": 9.98783826803174e-06, "loss": 0.1038, "step": 541 }, { "epoch": 0.48134991119005327, "grad_norm": 0.7800065206458783, "learning_rate": 9.987566618559138e-06, "loss": 0.0824, "step": 542 }, { "epoch": 0.4822380106571936, "grad_norm": 0.868202329884969, "learning_rate": 9.987291972442029e-06, "loss": 0.0969, "step": 543 }, { "epoch": 0.48312611012433393, "grad_norm": 0.6019530542460936, "learning_rate": 9.987014329845422e-06, "loss": 0.073, "step": 544 }, { "epoch": 0.48401420959147423, "grad_norm": 0.8756840734553802, "learning_rate": 9.986733690936136e-06, "loss": 0.0965, "step": 545 }, { "epoch": 0.4849023090586146, "grad_norm": 0.8319714919436229, "learning_rate": 9.986450055882782e-06, "loss": 0.0877, "step": 546 }, { "epoch": 0.4857904085257549, "grad_norm": 0.8583222321909063, "learning_rate": 9.986163424855777e-06, "loss": 0.0896, "step": 547 }, { "epoch": 0.4866785079928952, "grad_norm": 0.6901166240079477, "learning_rate": 9.985873798027334e-06, "loss": 0.108, "step": 548 }, { "epoch": 0.48756660746003555, "grad_norm": 0.7331274431745097, "learning_rate": 9.985581175571467e-06, "loss": 0.096, "step": 549 }, { "epoch": 0.48845470692717585, "grad_norm": 0.800265401337623, "learning_rate": 9.985285557663993e-06, "loss": 0.0967, "step": 550 }, { "epoch": 0.48934280639431615, "grad_norm": 0.8809592610672535, "learning_rate": 9.984986944482523e-06, "loss": 0.0762, "step": 551 }, { "epoch": 0.49023090586145646, "grad_norm": 0.5636426306677593, "learning_rate": 9.984685336206472e-06, "loss": 0.0648, "step": 552 }, { "epoch": 0.4911190053285968, "grad_norm": 0.678318134751214, "learning_rate": 9.984380733017052e-06, "loss": 0.0837, "step": 553 }, { "epoch": 0.4920071047957371, "grad_norm": 0.8204528753002966, "learning_rate": 9.984073135097277e-06, "loss": 0.1014, "step": 554 }, { "epoch": 0.4928952042628774, "grad_norm": 0.6085982428845589, "learning_rate": 9.983762542631958e-06, "loss": 0.0678, "step": 555 }, { "epoch": 0.4937833037300178, "grad_norm": 0.5812489068037968, "learning_rate": 9.983448955807708e-06, "loss": 0.0842, "step": 556 }, { "epoch": 0.4946714031971581, "grad_norm": 0.6518382690202529, "learning_rate": 9.983132374812933e-06, "loss": 0.0738, "step": 557 }, { "epoch": 0.4955595026642984, "grad_norm": 0.6633074008991864, "learning_rate": 9.982812799837848e-06, "loss": 0.0954, "step": 558 }, { "epoch": 0.49644760213143874, "grad_norm": 0.816328703944021, "learning_rate": 9.982490231074455e-06, "loss": 0.0954, "step": 559 }, { "epoch": 0.49733570159857904, "grad_norm": 0.6281501641841354, "learning_rate": 9.982164668716565e-06, "loss": 0.072, "step": 560 }, { "epoch": 0.49822380106571934, "grad_norm": 0.6602879174584156, "learning_rate": 9.981836112959782e-06, "loss": 0.0838, "step": 561 }, { "epoch": 0.4991119005328597, "grad_norm": 0.6560392310252683, "learning_rate": 9.98150456400151e-06, "loss": 0.0775, "step": 562 }, { "epoch": 0.5, "grad_norm": 0.512881207998907, "learning_rate": 9.981170022040949e-06, "loss": 0.0626, "step": 563 }, { "epoch": 0.5008880994671403, "grad_norm": 0.5481928107670866, "learning_rate": 9.980832487279102e-06, "loss": 0.0713, "step": 564 }, { "epoch": 0.5017761989342806, "grad_norm": 0.6264319183486977, "learning_rate": 9.980491959918767e-06, "loss": 0.075, "step": 565 }, { "epoch": 0.5026642984014209, "grad_norm": 0.5238738729951576, "learning_rate": 9.980148440164543e-06, "loss": 0.0698, "step": 566 }, { "epoch": 0.5035523978685613, "grad_norm": 0.8378395838320835, "learning_rate": 9.979801928222819e-06, "loss": 0.0967, "step": 567 }, { "epoch": 0.5044404973357016, "grad_norm": 0.564341382022705, "learning_rate": 9.979452424301792e-06, "loss": 0.0808, "step": 568 }, { "epoch": 0.5053285968028419, "grad_norm": 0.631173282575248, "learning_rate": 9.979099928611449e-06, "loss": 0.0706, "step": 569 }, { "epoch": 0.5062166962699822, "grad_norm": 0.5825124500655912, "learning_rate": 9.978744441363582e-06, "loss": 0.0727, "step": 570 }, { "epoch": 0.5071047957371225, "grad_norm": 0.5797415025353291, "learning_rate": 9.978385962771771e-06, "loss": 0.0704, "step": 571 }, { "epoch": 0.5079928952042628, "grad_norm": 0.8573349386910596, "learning_rate": 9.9780244930514e-06, "loss": 0.0871, "step": 572 }, { "epoch": 0.5088809946714032, "grad_norm": 0.7752539245927109, "learning_rate": 9.977660032419647e-06, "loss": 0.0808, "step": 573 }, { "epoch": 0.5097690941385435, "grad_norm": 0.5071737030464216, "learning_rate": 9.977292581095493e-06, "loss": 0.0735, "step": 574 }, { "epoch": 0.5106571936056838, "grad_norm": 0.5589986959748494, "learning_rate": 9.976922139299706e-06, "loss": 0.0799, "step": 575 }, { "epoch": 0.5115452930728241, "grad_norm": 0.5184552557706661, "learning_rate": 9.976548707254857e-06, "loss": 0.0758, "step": 576 }, { "epoch": 0.5124333925399644, "grad_norm": 0.7888563604598832, "learning_rate": 9.976172285185315e-06, "loss": 0.1095, "step": 577 }, { "epoch": 0.5133214920071048, "grad_norm": 0.5650867261613388, "learning_rate": 9.97579287331724e-06, "loss": 0.071, "step": 578 }, { "epoch": 0.5142095914742452, "grad_norm": 0.6700954589743635, "learning_rate": 9.975410471878592e-06, "loss": 0.0839, "step": 579 }, { "epoch": 0.5150976909413855, "grad_norm": 0.5813447738567292, "learning_rate": 9.97502508109913e-06, "loss": 0.0624, "step": 580 }, { "epoch": 0.5159857904085258, "grad_norm": 0.6158677363797734, "learning_rate": 9.974636701210402e-06, "loss": 0.0712, "step": 581 }, { "epoch": 0.5168738898756661, "grad_norm": 0.62582136981208, "learning_rate": 9.974245332445756e-06, "loss": 0.0848, "step": 582 }, { "epoch": 0.5177619893428064, "grad_norm": 0.5107303299624117, "learning_rate": 9.973850975040335e-06, "loss": 0.0654, "step": 583 }, { "epoch": 0.5186500888099467, "grad_norm": 0.7854970444895641, "learning_rate": 9.97345362923108e-06, "loss": 0.0861, "step": 584 }, { "epoch": 0.5195381882770871, "grad_norm": 0.6754830198888271, "learning_rate": 9.973053295256725e-06, "loss": 0.0894, "step": 585 }, { "epoch": 0.5204262877442274, "grad_norm": 0.6696301919339701, "learning_rate": 9.972649973357797e-06, "loss": 0.0745, "step": 586 }, { "epoch": 0.5213143872113677, "grad_norm": 0.7380562380998862, "learning_rate": 9.972243663776626e-06, "loss": 0.0833, "step": 587 }, { "epoch": 0.522202486678508, "grad_norm": 0.8204338382294417, "learning_rate": 9.971834366757327e-06, "loss": 0.0768, "step": 588 }, { "epoch": 0.5230905861456483, "grad_norm": 0.577684776094795, "learning_rate": 9.971422082545818e-06, "loss": 0.0847, "step": 589 }, { "epoch": 0.5239786856127886, "grad_norm": 0.5842234378048432, "learning_rate": 9.971006811389807e-06, "loss": 0.0693, "step": 590 }, { "epoch": 0.5248667850799289, "grad_norm": 0.6709627494905674, "learning_rate": 9.970588553538802e-06, "loss": 0.079, "step": 591 }, { "epoch": 0.5257548845470693, "grad_norm": 0.5657394188287586, "learning_rate": 9.970167309244097e-06, "loss": 0.0743, "step": 592 }, { "epoch": 0.5266429840142096, "grad_norm": 0.5208485085677795, "learning_rate": 9.969743078758788e-06, "loss": 0.0669, "step": 593 }, { "epoch": 0.5275310834813499, "grad_norm": 0.5040140180463282, "learning_rate": 9.969315862337764e-06, "loss": 0.0538, "step": 594 }, { "epoch": 0.5284191829484902, "grad_norm": 0.6053908206358658, "learning_rate": 9.968885660237704e-06, "loss": 0.0822, "step": 595 }, { "epoch": 0.5293072824156305, "grad_norm": 0.7398344551566501, "learning_rate": 9.968452472717084e-06, "loss": 0.0918, "step": 596 }, { "epoch": 0.5301953818827708, "grad_norm": 0.4916738909621316, "learning_rate": 9.968016300036172e-06, "loss": 0.073, "step": 597 }, { "epoch": 0.5310834813499112, "grad_norm": 0.5539634431634507, "learning_rate": 9.967577142457031e-06, "loss": 0.0647, "step": 598 }, { "epoch": 0.5319715808170515, "grad_norm": 0.6671542351197542, "learning_rate": 9.96713500024352e-06, "loss": 0.0858, "step": 599 }, { "epoch": 0.5328596802841918, "grad_norm": 0.4786344609288544, "learning_rate": 9.966689873661284e-06, "loss": 0.0811, "step": 600 }, { "epoch": 0.5337477797513321, "grad_norm": 0.4690021612325154, "learning_rate": 9.966241762977767e-06, "loss": 0.0639, "step": 601 }, { "epoch": 0.5346358792184724, "grad_norm": 0.6417197726302212, "learning_rate": 9.965790668462205e-06, "loss": 0.0753, "step": 602 }, { "epoch": 0.5355239786856127, "grad_norm": 0.5376695510800252, "learning_rate": 9.965336590385623e-06, "loss": 0.0833, "step": 603 }, { "epoch": 0.5364120781527532, "grad_norm": 0.5569038501440469, "learning_rate": 9.964879529020844e-06, "loss": 0.0697, "step": 604 }, { "epoch": 0.5373001776198935, "grad_norm": 0.5443343147501704, "learning_rate": 9.964419484642482e-06, "loss": 0.0761, "step": 605 }, { "epoch": 0.5381882770870338, "grad_norm": 0.5170897128943753, "learning_rate": 9.96395645752694e-06, "loss": 0.064, "step": 606 }, { "epoch": 0.5390763765541741, "grad_norm": 0.8056275163824247, "learning_rate": 9.963490447952415e-06, "loss": 0.092, "step": 607 }, { "epoch": 0.5399644760213144, "grad_norm": 0.5236969420959864, "learning_rate": 9.963021456198898e-06, "loss": 0.0742, "step": 608 }, { "epoch": 0.5408525754884547, "grad_norm": 0.4992794018935628, "learning_rate": 9.962549482548169e-06, "loss": 0.0771, "step": 609 }, { "epoch": 0.5417406749555951, "grad_norm": 0.5755486594928395, "learning_rate": 9.962074527283803e-06, "loss": 0.0718, "step": 610 }, { "epoch": 0.5426287744227354, "grad_norm": 0.5339655506708132, "learning_rate": 9.961596590691159e-06, "loss": 0.0717, "step": 611 }, { "epoch": 0.5435168738898757, "grad_norm": 0.6875341483650086, "learning_rate": 9.961115673057397e-06, "loss": 0.0826, "step": 612 }, { "epoch": 0.544404973357016, "grad_norm": 0.5518853021351647, "learning_rate": 9.96063177467146e-06, "loss": 0.0638, "step": 613 }, { "epoch": 0.5452930728241563, "grad_norm": 0.5250033178635135, "learning_rate": 9.960144895824088e-06, "loss": 0.0533, "step": 614 }, { "epoch": 0.5461811722912966, "grad_norm": 0.9091750188672192, "learning_rate": 9.959655036807807e-06, "loss": 0.0833, "step": 615 }, { "epoch": 0.5470692717584369, "grad_norm": 0.5806570052719987, "learning_rate": 9.959162197916938e-06, "loss": 0.0765, "step": 616 }, { "epoch": 0.5479573712255773, "grad_norm": 0.6555721441709699, "learning_rate": 9.958666379447588e-06, "loss": 0.0799, "step": 617 }, { "epoch": 0.5488454706927176, "grad_norm": 0.49620238314036563, "learning_rate": 9.958167581697656e-06, "loss": 0.0802, "step": 618 }, { "epoch": 0.5497335701598579, "grad_norm": 0.6095867263904436, "learning_rate": 9.95766580496683e-06, "loss": 0.0661, "step": 619 }, { "epoch": 0.5506216696269982, "grad_norm": 0.5227562383858685, "learning_rate": 9.957161049556591e-06, "loss": 0.0613, "step": 620 }, { "epoch": 0.5515097690941385, "grad_norm": 0.6208242296786288, "learning_rate": 9.956653315770209e-06, "loss": 0.075, "step": 621 }, { "epoch": 0.5523978685612788, "grad_norm": 0.6046805080838598, "learning_rate": 9.956142603912737e-06, "loss": 0.0797, "step": 622 }, { "epoch": 0.5532859680284192, "grad_norm": 0.6444562243401362, "learning_rate": 9.95562891429103e-06, "loss": 0.0755, "step": 623 }, { "epoch": 0.5541740674955595, "grad_norm": 0.5985558732707539, "learning_rate": 9.955112247213716e-06, "loss": 0.0678, "step": 624 }, { "epoch": 0.5550621669626998, "grad_norm": 0.6138532225423613, "learning_rate": 9.954592602991226e-06, "loss": 0.0854, "step": 625 }, { "epoch": 0.5559502664298401, "grad_norm": 0.48332537237986817, "learning_rate": 9.954069981935774e-06, "loss": 0.0636, "step": 626 }, { "epoch": 0.5568383658969804, "grad_norm": 0.5307808463292392, "learning_rate": 9.95354438436136e-06, "loss": 0.0675, "step": 627 }, { "epoch": 0.5577264653641207, "grad_norm": 0.5287375678675353, "learning_rate": 9.953015810583777e-06, "loss": 0.0728, "step": 628 }, { "epoch": 0.5586145648312612, "grad_norm": 0.6196806533003894, "learning_rate": 9.952484260920605e-06, "loss": 0.0845, "step": 629 }, { "epoch": 0.5595026642984015, "grad_norm": 0.45677564015470606, "learning_rate": 9.95194973569121e-06, "loss": 0.0714, "step": 630 }, { "epoch": 0.5603907637655418, "grad_norm": 0.4304045479722019, "learning_rate": 9.951412235216744e-06, "loss": 0.0592, "step": 631 }, { "epoch": 0.5612788632326821, "grad_norm": 0.3988334991399962, "learning_rate": 9.950871759820155e-06, "loss": 0.0626, "step": 632 }, { "epoch": 0.5621669626998224, "grad_norm": 0.5596339454540106, "learning_rate": 9.950328309826172e-06, "loss": 0.0797, "step": 633 }, { "epoch": 0.5630550621669627, "grad_norm": 0.6027269400477876, "learning_rate": 9.949781885561309e-06, "loss": 0.0772, "step": 634 }, { "epoch": 0.5639431616341031, "grad_norm": 0.5128227811773718, "learning_rate": 9.949232487353873e-06, "loss": 0.0666, "step": 635 }, { "epoch": 0.5648312611012434, "grad_norm": 0.6243259838812658, "learning_rate": 9.948680115533954e-06, "loss": 0.0737, "step": 636 }, { "epoch": 0.5657193605683837, "grad_norm": 0.3721747409496252, "learning_rate": 9.94812477043343e-06, "loss": 0.0565, "step": 637 }, { "epoch": 0.566607460035524, "grad_norm": 0.5603791331693248, "learning_rate": 9.947566452385967e-06, "loss": 0.0648, "step": 638 }, { "epoch": 0.5674955595026643, "grad_norm": 0.7236695232741394, "learning_rate": 9.94700516172701e-06, "loss": 0.1008, "step": 639 }, { "epoch": 0.5683836589698046, "grad_norm": 0.5727667142554005, "learning_rate": 9.9464408987938e-06, "loss": 0.0821, "step": 640 }, { "epoch": 0.5692717584369449, "grad_norm": 0.5637693338316091, "learning_rate": 9.945873663925358e-06, "loss": 0.0731, "step": 641 }, { "epoch": 0.5701598579040853, "grad_norm": 0.6065169982263043, "learning_rate": 9.945303457462492e-06, "loss": 0.0776, "step": 642 }, { "epoch": 0.5710479573712256, "grad_norm": 0.9659163460815292, "learning_rate": 9.944730279747795e-06, "loss": 0.0889, "step": 643 }, { "epoch": 0.5719360568383659, "grad_norm": 0.543692295516828, "learning_rate": 9.944154131125643e-06, "loss": 0.0698, "step": 644 }, { "epoch": 0.5728241563055062, "grad_norm": 0.7812550740351492, "learning_rate": 9.943575011942203e-06, "loss": 0.0794, "step": 645 }, { "epoch": 0.5737122557726465, "grad_norm": 0.7273669868701744, "learning_rate": 9.94299292254542e-06, "loss": 0.0935, "step": 646 }, { "epoch": 0.5746003552397868, "grad_norm": 0.6514262782582129, "learning_rate": 9.94240786328503e-06, "loss": 0.074, "step": 647 }, { "epoch": 0.5754884547069272, "grad_norm": 0.6704934102357321, "learning_rate": 9.941819834512547e-06, "loss": 0.0844, "step": 648 }, { "epoch": 0.5763765541740675, "grad_norm": 0.6232798380743291, "learning_rate": 9.941228836581273e-06, "loss": 0.0791, "step": 649 }, { "epoch": 0.5772646536412078, "grad_norm": 0.8328031251816983, "learning_rate": 9.940634869846293e-06, "loss": 0.0672, "step": 650 }, { "epoch": 0.5781527531083481, "grad_norm": 0.5491182560832706, "learning_rate": 9.940037934664476e-06, "loss": 0.075, "step": 651 }, { "epoch": 0.5790408525754884, "grad_norm": 0.6980263709487374, "learning_rate": 9.939438031394477e-06, "loss": 0.0854, "step": 652 }, { "epoch": 0.5799289520426287, "grad_norm": 0.762719484435552, "learning_rate": 9.938835160396727e-06, "loss": 0.0659, "step": 653 }, { "epoch": 0.5808170515097691, "grad_norm": 0.6497096424621961, "learning_rate": 9.938229322033448e-06, "loss": 0.0687, "step": 654 }, { "epoch": 0.5817051509769094, "grad_norm": 0.5232236761833539, "learning_rate": 9.93762051666864e-06, "loss": 0.0723, "step": 655 }, { "epoch": 0.5825932504440497, "grad_norm": 1.9373735232345077, "learning_rate": 9.937008744668089e-06, "loss": 0.0759, "step": 656 }, { "epoch": 0.58348134991119, "grad_norm": 0.6686288174204, "learning_rate": 9.93639400639936e-06, "loss": 0.0817, "step": 657 }, { "epoch": 0.5843694493783304, "grad_norm": 0.6311724633616251, "learning_rate": 9.935776302231801e-06, "loss": 0.0781, "step": 658 }, { "epoch": 0.5852575488454707, "grad_norm": 0.7500877855603757, "learning_rate": 9.935155632536544e-06, "loss": 0.082, "step": 659 }, { "epoch": 0.5861456483126111, "grad_norm": 0.726336288113051, "learning_rate": 9.934531997686503e-06, "loss": 0.0879, "step": 660 }, { "epoch": 0.5870337477797514, "grad_norm": 0.6749131743501463, "learning_rate": 9.933905398056371e-06, "loss": 0.0781, "step": 661 }, { "epoch": 0.5879218472468917, "grad_norm": 0.6732350602477372, "learning_rate": 9.933275834022623e-06, "loss": 0.0677, "step": 662 }, { "epoch": 0.588809946714032, "grad_norm": 0.6598835277716724, "learning_rate": 9.932643305963516e-06, "loss": 0.0652, "step": 663 }, { "epoch": 0.5896980461811723, "grad_norm": 0.6889485952784423, "learning_rate": 9.932007814259088e-06, "loss": 0.0643, "step": 664 }, { "epoch": 0.5905861456483126, "grad_norm": 0.609717080517239, "learning_rate": 9.931369359291154e-06, "loss": 0.0872, "step": 665 }, { "epoch": 0.5914742451154529, "grad_norm": 0.5862816597590875, "learning_rate": 9.930727941443316e-06, "loss": 0.0747, "step": 666 }, { "epoch": 0.5923623445825933, "grad_norm": 0.577048754067645, "learning_rate": 9.930083561100952e-06, "loss": 0.0719, "step": 667 }, { "epoch": 0.5932504440497336, "grad_norm": 0.6316269392471102, "learning_rate": 9.92943621865122e-06, "loss": 0.0755, "step": 668 }, { "epoch": 0.5941385435168739, "grad_norm": 0.7232924759518332, "learning_rate": 9.928785914483056e-06, "loss": 0.0886, "step": 669 }, { "epoch": 0.5950266429840142, "grad_norm": 0.7645669950177373, "learning_rate": 9.928132648987181e-06, "loss": 0.0938, "step": 670 }, { "epoch": 0.5959147424511545, "grad_norm": 0.6739484818221317, "learning_rate": 9.927476422556091e-06, "loss": 0.0711, "step": 671 }, { "epoch": 0.5968028419182948, "grad_norm": 0.5849557845201215, "learning_rate": 9.92681723558406e-06, "loss": 0.0771, "step": 672 }, { "epoch": 0.5976909413854352, "grad_norm": 0.6778196656517596, "learning_rate": 9.926155088467145e-06, "loss": 0.0937, "step": 673 }, { "epoch": 0.5985790408525755, "grad_norm": 0.6418708651856079, "learning_rate": 9.925489981603177e-06, "loss": 0.0687, "step": 674 }, { "epoch": 0.5994671403197158, "grad_norm": 0.5525985012114619, "learning_rate": 9.92482191539177e-06, "loss": 0.0695, "step": 675 }, { "epoch": 0.6003552397868561, "grad_norm": 0.6845649631504699, "learning_rate": 9.924150890234311e-06, "loss": 0.0694, "step": 676 }, { "epoch": 0.6012433392539964, "grad_norm": 0.5137621495039753, "learning_rate": 9.923476906533971e-06, "loss": 0.0667, "step": 677 }, { "epoch": 0.6021314387211367, "grad_norm": 0.6305697583430346, "learning_rate": 9.922799964695691e-06, "loss": 0.0814, "step": 678 }, { "epoch": 0.6030195381882771, "grad_norm": 0.7029756793327437, "learning_rate": 9.922120065126192e-06, "loss": 0.0859, "step": 679 }, { "epoch": 0.6039076376554174, "grad_norm": 0.4931592497555935, "learning_rate": 9.92143720823398e-06, "loss": 0.0632, "step": 680 }, { "epoch": 0.6047957371225577, "grad_norm": 0.7076085243788406, "learning_rate": 9.920751394429326e-06, "loss": 0.0652, "step": 681 }, { "epoch": 0.605683836589698, "grad_norm": 0.5869311269808133, "learning_rate": 9.920062624124282e-06, "loss": 0.0733, "step": 682 }, { "epoch": 0.6065719360568383, "grad_norm": 0.6789599846653736, "learning_rate": 9.91937089773268e-06, "loss": 0.0919, "step": 683 }, { "epoch": 0.6074600355239786, "grad_norm": 0.7106183658768801, "learning_rate": 9.918676215670123e-06, "loss": 0.0626, "step": 684 }, { "epoch": 0.6083481349911191, "grad_norm": 0.5307298829724526, "learning_rate": 9.917978578353993e-06, "loss": 0.0698, "step": 685 }, { "epoch": 0.6092362344582594, "grad_norm": 0.5130192562586083, "learning_rate": 9.917277986203447e-06, "loss": 0.0769, "step": 686 }, { "epoch": 0.6101243339253997, "grad_norm": 0.9975118284059644, "learning_rate": 9.916574439639415e-06, "loss": 0.0797, "step": 687 }, { "epoch": 0.61101243339254, "grad_norm": 0.4501001012025509, "learning_rate": 9.915867939084606e-06, "loss": 0.0578, "step": 688 }, { "epoch": 0.6119005328596803, "grad_norm": 0.9282646718419376, "learning_rate": 9.915158484963501e-06, "loss": 0.076, "step": 689 }, { "epoch": 0.6127886323268206, "grad_norm": 0.7685809779066356, "learning_rate": 9.914446077702354e-06, "loss": 0.0684, "step": 690 }, { "epoch": 0.6136767317939609, "grad_norm": 0.5523703432888231, "learning_rate": 9.9137307177292e-06, "loss": 0.0659, "step": 691 }, { "epoch": 0.6145648312611013, "grad_norm": 0.8742460294151082, "learning_rate": 9.91301240547384e-06, "loss": 0.0997, "step": 692 }, { "epoch": 0.6154529307282416, "grad_norm": 0.4410815923031126, "learning_rate": 9.912291141367853e-06, "loss": 0.0615, "step": 693 }, { "epoch": 0.6163410301953819, "grad_norm": 0.7411410182866369, "learning_rate": 9.911566925844593e-06, "loss": 0.0665, "step": 694 }, { "epoch": 0.6172291296625222, "grad_norm": 0.9083393270663752, "learning_rate": 9.910839759339186e-06, "loss": 0.0753, "step": 695 }, { "epoch": 0.6181172291296625, "grad_norm": 0.5717516177319388, "learning_rate": 9.910109642288526e-06, "loss": 0.0753, "step": 696 }, { "epoch": 0.6190053285968028, "grad_norm": 0.7803157417708638, "learning_rate": 9.909376575131288e-06, "loss": 0.0783, "step": 697 }, { "epoch": 0.6198934280639432, "grad_norm": 0.8487363706219908, "learning_rate": 9.908640558307915e-06, "loss": 0.0879, "step": 698 }, { "epoch": 0.6207815275310835, "grad_norm": 0.7202024790138503, "learning_rate": 9.907901592260619e-06, "loss": 0.0829, "step": 699 }, { "epoch": 0.6216696269982238, "grad_norm": 0.6796685076427839, "learning_rate": 9.907159677433394e-06, "loss": 0.0683, "step": 700 }, { "epoch": 0.6225577264653641, "grad_norm": 0.6907286160016336, "learning_rate": 9.906414814271994e-06, "loss": 0.0823, "step": 701 }, { "epoch": 0.6234458259325044, "grad_norm": 0.7955497425182856, "learning_rate": 9.905667003223952e-06, "loss": 0.0911, "step": 702 }, { "epoch": 0.6243339253996447, "grad_norm": 0.6476810022421644, "learning_rate": 9.904916244738572e-06, "loss": 0.1003, "step": 703 }, { "epoch": 0.6252220248667851, "grad_norm": 0.8772932993530401, "learning_rate": 9.904162539266924e-06, "loss": 0.0747, "step": 704 }, { "epoch": 0.6261101243339254, "grad_norm": 0.5356612214005964, "learning_rate": 9.903405887261852e-06, "loss": 0.0661, "step": 705 }, { "epoch": 0.6269982238010657, "grad_norm": 0.5612010565144157, "learning_rate": 9.902646289177971e-06, "loss": 0.0663, "step": 706 }, { "epoch": 0.627886323268206, "grad_norm": 1.1208224490222134, "learning_rate": 9.901883745471665e-06, "loss": 0.0875, "step": 707 }, { "epoch": 0.6287744227353463, "grad_norm": 0.6505646562126589, "learning_rate": 9.901118256601086e-06, "loss": 0.0801, "step": 708 }, { "epoch": 0.6296625222024866, "grad_norm": 0.8700126403755859, "learning_rate": 9.900349823026161e-06, "loss": 0.0817, "step": 709 }, { "epoch": 0.6305506216696269, "grad_norm": 0.6490556908485404, "learning_rate": 9.899578445208578e-06, "loss": 0.0808, "step": 710 }, { "epoch": 0.6314387211367674, "grad_norm": 0.5297936162626927, "learning_rate": 9.8988041236118e-06, "loss": 0.0602, "step": 711 }, { "epoch": 0.6323268206039077, "grad_norm": 0.6331575154891161, "learning_rate": 9.89802685870106e-06, "loss": 0.0743, "step": 712 }, { "epoch": 0.633214920071048, "grad_norm": 0.5801710341779982, "learning_rate": 9.897246650943353e-06, "loss": 0.0706, "step": 713 }, { "epoch": 0.6341030195381883, "grad_norm": 1.1369421097074788, "learning_rate": 9.896463500807447e-06, "loss": 0.1075, "step": 714 }, { "epoch": 0.6349911190053286, "grad_norm": 0.45981576282496633, "learning_rate": 9.895677408763878e-06, "loss": 0.0702, "step": 715 }, { "epoch": 0.6358792184724689, "grad_norm": 0.8763111120413932, "learning_rate": 9.894888375284945e-06, "loss": 0.0829, "step": 716 }, { "epoch": 0.6367673179396093, "grad_norm": 0.5055918300464575, "learning_rate": 9.89409640084472e-06, "loss": 0.0748, "step": 717 }, { "epoch": 0.6376554174067496, "grad_norm": 0.5977473500840715, "learning_rate": 9.893301485919038e-06, "loss": 0.0697, "step": 718 }, { "epoch": 0.6385435168738899, "grad_norm": 0.6766781995614883, "learning_rate": 9.892503630985504e-06, "loss": 0.0833, "step": 719 }, { "epoch": 0.6394316163410302, "grad_norm": 0.9713972859003385, "learning_rate": 9.891702836523485e-06, "loss": 0.0725, "step": 720 }, { "epoch": 0.6403197158081705, "grad_norm": 0.6557626661480346, "learning_rate": 9.89089910301412e-06, "loss": 0.0746, "step": 721 }, { "epoch": 0.6412078152753108, "grad_norm": 0.6767885337992217, "learning_rate": 9.890092430940308e-06, "loss": 0.073, "step": 722 }, { "epoch": 0.6420959147424512, "grad_norm": 0.7472719120491773, "learning_rate": 9.889282820786716e-06, "loss": 0.067, "step": 723 }, { "epoch": 0.6429840142095915, "grad_norm": 0.6639286593147884, "learning_rate": 9.888470273039776e-06, "loss": 0.0829, "step": 724 }, { "epoch": 0.6438721136767318, "grad_norm": 0.5982769355913197, "learning_rate": 9.887654788187689e-06, "loss": 0.0775, "step": 725 }, { "epoch": 0.6447602131438721, "grad_norm": 0.4855877184062672, "learning_rate": 9.88683636672041e-06, "loss": 0.0676, "step": 726 }, { "epoch": 0.6456483126110124, "grad_norm": 0.5340617829892079, "learning_rate": 9.886015009129674e-06, "loss": 0.0727, "step": 727 }, { "epoch": 0.6465364120781527, "grad_norm": 0.6639238564457132, "learning_rate": 9.885190715908966e-06, "loss": 0.0802, "step": 728 }, { "epoch": 0.6474245115452931, "grad_norm": 0.6110643471699649, "learning_rate": 9.884363487553542e-06, "loss": 0.0771, "step": 729 }, { "epoch": 0.6483126110124334, "grad_norm": 0.4621339230437264, "learning_rate": 9.88353332456042e-06, "loss": 0.0449, "step": 730 }, { "epoch": 0.6492007104795737, "grad_norm": 0.678858009404123, "learning_rate": 9.88270022742838e-06, "loss": 0.0877, "step": 731 }, { "epoch": 0.650088809946714, "grad_norm": 0.6529691011694954, "learning_rate": 9.881864196657965e-06, "loss": 0.0812, "step": 732 }, { "epoch": 0.6509769094138543, "grad_norm": 0.5748660355744435, "learning_rate": 9.881025232751482e-06, "loss": 0.068, "step": 733 }, { "epoch": 0.6518650088809946, "grad_norm": 0.6375177785671727, "learning_rate": 9.880183336213002e-06, "loss": 0.0732, "step": 734 }, { "epoch": 0.6527531083481349, "grad_norm": 0.6387740003981374, "learning_rate": 9.879338507548353e-06, "loss": 0.0812, "step": 735 }, { "epoch": 0.6536412078152753, "grad_norm": 0.7518231911966612, "learning_rate": 9.87849074726513e-06, "loss": 0.0804, "step": 736 }, { "epoch": 0.6545293072824157, "grad_norm": 0.6405075473759618, "learning_rate": 9.877640055872686e-06, "loss": 0.0743, "step": 737 }, { "epoch": 0.655417406749556, "grad_norm": 0.6561685164748122, "learning_rate": 9.876786433882134e-06, "loss": 0.0655, "step": 738 }, { "epoch": 0.6563055062166963, "grad_norm": 0.6639484183813833, "learning_rate": 9.875929881806353e-06, "loss": 0.08, "step": 739 }, { "epoch": 0.6571936056838366, "grad_norm": 0.5959227008610657, "learning_rate": 9.875070400159975e-06, "loss": 0.0671, "step": 740 }, { "epoch": 0.6580817051509769, "grad_norm": 0.49191753805945415, "learning_rate": 9.8742079894594e-06, "loss": 0.0545, "step": 741 }, { "epoch": 0.6589698046181173, "grad_norm": 0.6813282250979682, "learning_rate": 9.873342650222783e-06, "loss": 0.0852, "step": 742 }, { "epoch": 0.6598579040852576, "grad_norm": 0.9250391619311136, "learning_rate": 9.87247438297004e-06, "loss": 0.0565, "step": 743 }, { "epoch": 0.6607460035523979, "grad_norm": 0.537411527063087, "learning_rate": 9.871603188222842e-06, "loss": 0.0639, "step": 744 }, { "epoch": 0.6616341030195382, "grad_norm": 0.633528068688325, "learning_rate": 9.870729066504629e-06, "loss": 0.0761, "step": 745 }, { "epoch": 0.6625222024866785, "grad_norm": 0.6043145862673834, "learning_rate": 9.869852018340588e-06, "loss": 0.0781, "step": 746 }, { "epoch": 0.6634103019538188, "grad_norm": 0.6414104245118655, "learning_rate": 9.868972044257675e-06, "loss": 0.0624, "step": 747 }, { "epoch": 0.6642984014209592, "grad_norm": 0.5109239331287874, "learning_rate": 9.868089144784593e-06, "loss": 0.067, "step": 748 }, { "epoch": 0.6651865008880995, "grad_norm": 1.6962551913633659, "learning_rate": 9.867203320451814e-06, "loss": 0.0666, "step": 749 }, { "epoch": 0.6660746003552398, "grad_norm": 0.813934879598361, "learning_rate": 9.866314571791557e-06, "loss": 0.0893, "step": 750 }, { "epoch": 0.6669626998223801, "grad_norm": 0.5511951847347283, "learning_rate": 9.865422899337805e-06, "loss": 0.0629, "step": 751 }, { "epoch": 0.6678507992895204, "grad_norm": 0.5459582838219199, "learning_rate": 9.864528303626295e-06, "loss": 0.0785, "step": 752 }, { "epoch": 0.6687388987566607, "grad_norm": 0.7705512172036647, "learning_rate": 9.86363078519452e-06, "loss": 0.0775, "step": 753 }, { "epoch": 0.6696269982238011, "grad_norm": 0.619884058455073, "learning_rate": 9.862730344581727e-06, "loss": 0.0813, "step": 754 }, { "epoch": 0.6705150976909414, "grad_norm": 0.6226342312705461, "learning_rate": 9.861826982328927e-06, "loss": 0.071, "step": 755 }, { "epoch": 0.6714031971580817, "grad_norm": 0.6044498881029556, "learning_rate": 9.860920698978878e-06, "loss": 0.075, "step": 756 }, { "epoch": 0.672291296625222, "grad_norm": 0.7913251099265589, "learning_rate": 9.860011495076094e-06, "loss": 0.085, "step": 757 }, { "epoch": 0.6731793960923623, "grad_norm": 0.7381414512209249, "learning_rate": 9.85909937116685e-06, "loss": 0.0837, "step": 758 }, { "epoch": 0.6740674955595026, "grad_norm": 0.6530467510353278, "learning_rate": 9.858184327799167e-06, "loss": 0.0914, "step": 759 }, { "epoch": 0.6749555950266429, "grad_norm": 0.6030829518086117, "learning_rate": 9.857266365522825e-06, "loss": 0.0771, "step": 760 }, { "epoch": 0.6758436944937833, "grad_norm": 0.5395337483407944, "learning_rate": 9.856345484889358e-06, "loss": 0.0699, "step": 761 }, { "epoch": 0.6767317939609236, "grad_norm": 0.5560718019400371, "learning_rate": 9.855421686452051e-06, "loss": 0.0591, "step": 762 }, { "epoch": 0.677619893428064, "grad_norm": 0.4739087807150043, "learning_rate": 9.854494970765944e-06, "loss": 0.0591, "step": 763 }, { "epoch": 0.6785079928952042, "grad_norm": 0.5773819216810755, "learning_rate": 9.853565338387829e-06, "loss": 0.0705, "step": 764 }, { "epoch": 0.6793960923623446, "grad_norm": 0.5683151590028005, "learning_rate": 9.85263278987625e-06, "loss": 0.0605, "step": 765 }, { "epoch": 0.6802841918294849, "grad_norm": 0.5668668206312913, "learning_rate": 9.851697325791505e-06, "loss": 0.0889, "step": 766 }, { "epoch": 0.6811722912966253, "grad_norm": 0.4471977890212434, "learning_rate": 9.850758946695639e-06, "loss": 0.0541, "step": 767 }, { "epoch": 0.6820603907637656, "grad_norm": 0.5726134869433405, "learning_rate": 9.849817653152455e-06, "loss": 0.0761, "step": 768 }, { "epoch": 0.6829484902309059, "grad_norm": 0.40904293323042457, "learning_rate": 9.848873445727505e-06, "loss": 0.0525, "step": 769 }, { "epoch": 0.6838365896980462, "grad_norm": 0.5812966102900733, "learning_rate": 9.847926324988085e-06, "loss": 0.0682, "step": 770 }, { "epoch": 0.6847246891651865, "grad_norm": 0.6255213845313706, "learning_rate": 9.846976291503251e-06, "loss": 0.0989, "step": 771 }, { "epoch": 0.6856127886323268, "grad_norm": 0.5932962594157872, "learning_rate": 9.846023345843805e-06, "loss": 0.069, "step": 772 }, { "epoch": 0.6865008880994672, "grad_norm": 0.4606594098023636, "learning_rate": 9.845067488582297e-06, "loss": 0.0616, "step": 773 }, { "epoch": 0.6873889875666075, "grad_norm": 0.5887167134523461, "learning_rate": 9.844108720293027e-06, "loss": 0.0736, "step": 774 }, { "epoch": 0.6882770870337478, "grad_norm": 0.4610259409194756, "learning_rate": 9.843147041552047e-06, "loss": 0.064, "step": 775 }, { "epoch": 0.6891651865008881, "grad_norm": 0.5556439414364968, "learning_rate": 9.842182452937156e-06, "loss": 0.0601, "step": 776 }, { "epoch": 0.6900532859680284, "grad_norm": 0.4963423959312719, "learning_rate": 9.8412149550279e-06, "loss": 0.0567, "step": 777 }, { "epoch": 0.6909413854351687, "grad_norm": 0.4881477349084676, "learning_rate": 9.840244548405574e-06, "loss": 0.067, "step": 778 }, { "epoch": 0.6918294849023091, "grad_norm": 0.5927913645381122, "learning_rate": 9.83927123365322e-06, "loss": 0.0706, "step": 779 }, { "epoch": 0.6927175843694494, "grad_norm": 0.6246354132335293, "learning_rate": 9.83829501135563e-06, "loss": 0.0714, "step": 780 }, { "epoch": 0.6936056838365897, "grad_norm": 0.45067414718655535, "learning_rate": 9.837315882099338e-06, "loss": 0.0769, "step": 781 }, { "epoch": 0.69449378330373, "grad_norm": 0.46638603249090305, "learning_rate": 9.83633384647263e-06, "loss": 0.0651, "step": 782 }, { "epoch": 0.6953818827708703, "grad_norm": 0.44935838390777366, "learning_rate": 9.835348905065534e-06, "loss": 0.0637, "step": 783 }, { "epoch": 0.6962699822380106, "grad_norm": 0.40165159687498664, "learning_rate": 9.834361058469824e-06, "loss": 0.0617, "step": 784 }, { "epoch": 0.6971580817051509, "grad_norm": 0.5543025139993665, "learning_rate": 9.833370307279024e-06, "loss": 0.0761, "step": 785 }, { "epoch": 0.6980461811722913, "grad_norm": 0.616612079623427, "learning_rate": 9.832376652088398e-06, "loss": 0.0874, "step": 786 }, { "epoch": 0.6989342806394316, "grad_norm": 0.5630055764947307, "learning_rate": 9.831380093494957e-06, "loss": 0.0754, "step": 787 }, { "epoch": 0.6998223801065719, "grad_norm": 0.5215681170244125, "learning_rate": 9.830380632097458e-06, "loss": 0.0642, "step": 788 }, { "epoch": 0.7007104795737122, "grad_norm": 0.6656937827559947, "learning_rate": 9.8293782684964e-06, "loss": 0.07, "step": 789 }, { "epoch": 0.7015985790408525, "grad_norm": 0.6288561251516173, "learning_rate": 9.828373003294023e-06, "loss": 0.0914, "step": 790 }, { "epoch": 0.7024866785079928, "grad_norm": 0.5164186767628194, "learning_rate": 9.827364837094315e-06, "loss": 0.0684, "step": 791 }, { "epoch": 0.7033747779751333, "grad_norm": 0.5655308094806477, "learning_rate": 9.826353770503008e-06, "loss": 0.0722, "step": 792 }, { "epoch": 0.7042628774422736, "grad_norm": 0.553941699731642, "learning_rate": 9.82533980412757e-06, "loss": 0.0696, "step": 793 }, { "epoch": 0.7051509769094139, "grad_norm": 0.5697946394267486, "learning_rate": 9.824322938577217e-06, "loss": 0.0609, "step": 794 }, { "epoch": 0.7060390763765542, "grad_norm": 0.5673274646378402, "learning_rate": 9.823303174462908e-06, "loss": 0.0612, "step": 795 }, { "epoch": 0.7069271758436945, "grad_norm": 0.41718573178468515, "learning_rate": 9.822280512397336e-06, "loss": 0.055, "step": 796 }, { "epoch": 0.7078152753108348, "grad_norm": 0.5566723528174443, "learning_rate": 9.821254952994942e-06, "loss": 0.0623, "step": 797 }, { "epoch": 0.7087033747779752, "grad_norm": 0.5929826177517866, "learning_rate": 9.820226496871904e-06, "loss": 0.0623, "step": 798 }, { "epoch": 0.7095914742451155, "grad_norm": 0.5688187309896156, "learning_rate": 9.819195144646148e-06, "loss": 0.0883, "step": 799 }, { "epoch": 0.7104795737122558, "grad_norm": 0.49122317812039684, "learning_rate": 9.818160896937326e-06, "loss": 0.0678, "step": 800 }, { "epoch": 0.7113676731793961, "grad_norm": 0.6118298125130784, "learning_rate": 9.817123754366842e-06, "loss": 0.0768, "step": 801 }, { "epoch": 0.7122557726465364, "grad_norm": 0.46128906858077734, "learning_rate": 9.816083717557836e-06, "loss": 0.0685, "step": 802 }, { "epoch": 0.7131438721136767, "grad_norm": 0.437684287621171, "learning_rate": 9.815040787135185e-06, "loss": 0.0576, "step": 803 }, { "epoch": 0.7140319715808171, "grad_norm": 0.5342097826992428, "learning_rate": 9.813994963725502e-06, "loss": 0.0678, "step": 804 }, { "epoch": 0.7149200710479574, "grad_norm": 0.5669114346058003, "learning_rate": 9.812946247957145e-06, "loss": 0.062, "step": 805 }, { "epoch": 0.7158081705150977, "grad_norm": 0.4582139918634001, "learning_rate": 9.81189464046021e-06, "loss": 0.0655, "step": 806 }, { "epoch": 0.716696269982238, "grad_norm": 0.5870243211200192, "learning_rate": 9.810840141866518e-06, "loss": 0.0775, "step": 807 }, { "epoch": 0.7175843694493783, "grad_norm": 0.631756840637851, "learning_rate": 9.809782752809644e-06, "loss": 0.0736, "step": 808 }, { "epoch": 0.7184724689165186, "grad_norm": 0.6539120301427688, "learning_rate": 9.808722473924888e-06, "loss": 0.0959, "step": 809 }, { "epoch": 0.7193605683836589, "grad_norm": 0.5032683848281911, "learning_rate": 9.807659305849291e-06, "loss": 0.064, "step": 810 }, { "epoch": 0.7202486678507993, "grad_norm": 0.610360723221449, "learning_rate": 9.80659324922163e-06, "loss": 0.0597, "step": 811 }, { "epoch": 0.7211367673179396, "grad_norm": 0.6189388212782825, "learning_rate": 9.805524304682412e-06, "loss": 0.0613, "step": 812 }, { "epoch": 0.7220248667850799, "grad_norm": 0.5840790967615844, "learning_rate": 9.804452472873886e-06, "loss": 0.0848, "step": 813 }, { "epoch": 0.7229129662522202, "grad_norm": 0.6329622168240866, "learning_rate": 9.803377754440036e-06, "loss": 0.0825, "step": 814 }, { "epoch": 0.7238010657193605, "grad_norm": 0.619006500413132, "learning_rate": 9.802300150026573e-06, "loss": 0.0712, "step": 815 }, { "epoch": 0.7246891651865008, "grad_norm": 0.6106419407921674, "learning_rate": 9.801219660280946e-06, "loss": 0.0663, "step": 816 }, { "epoch": 0.7255772646536413, "grad_norm": 0.4891178874396289, "learning_rate": 9.800136285852344e-06, "loss": 0.0649, "step": 817 }, { "epoch": 0.7264653641207816, "grad_norm": 0.5451059121272476, "learning_rate": 9.799050027391679e-06, "loss": 0.0677, "step": 818 }, { "epoch": 0.7273534635879219, "grad_norm": 0.4238153011827328, "learning_rate": 9.7979608855516e-06, "loss": 0.0534, "step": 819 }, { "epoch": 0.7282415630550622, "grad_norm": 0.6330021046191763, "learning_rate": 9.796868860986489e-06, "loss": 0.0666, "step": 820 }, { "epoch": 0.7291296625222025, "grad_norm": 0.6045734694919165, "learning_rate": 9.795773954352459e-06, "loss": 0.0547, "step": 821 }, { "epoch": 0.7300177619893428, "grad_norm": 0.5198037170364895, "learning_rate": 9.794676166307354e-06, "loss": 0.0627, "step": 822 }, { "epoch": 0.7309058614564832, "grad_norm": 0.5554149391898202, "learning_rate": 9.793575497510753e-06, "loss": 0.0612, "step": 823 }, { "epoch": 0.7317939609236235, "grad_norm": 0.5343561182075582, "learning_rate": 9.792471948623964e-06, "loss": 0.0683, "step": 824 }, { "epoch": 0.7326820603907638, "grad_norm": 0.5790623408188066, "learning_rate": 9.791365520310019e-06, "loss": 0.0724, "step": 825 }, { "epoch": 0.7335701598579041, "grad_norm": 0.8344222032090851, "learning_rate": 9.790256213233688e-06, "loss": 0.0799, "step": 826 }, { "epoch": 0.7344582593250444, "grad_norm": 0.543863835656294, "learning_rate": 9.789144028061471e-06, "loss": 0.0665, "step": 827 }, { "epoch": 0.7353463587921847, "grad_norm": 0.6778994069764115, "learning_rate": 9.788028965461592e-06, "loss": 0.0736, "step": 828 }, { "epoch": 0.7362344582593251, "grad_norm": 0.5001185210304049, "learning_rate": 9.786911026104007e-06, "loss": 0.0814, "step": 829 }, { "epoch": 0.7371225577264654, "grad_norm": 0.4318114606035255, "learning_rate": 9.785790210660397e-06, "loss": 0.0457, "step": 830 }, { "epoch": 0.7380106571936057, "grad_norm": 0.5103918503485578, "learning_rate": 9.784666519804177e-06, "loss": 0.0584, "step": 831 }, { "epoch": 0.738898756660746, "grad_norm": 0.998343595704113, "learning_rate": 9.783539954210486e-06, "loss": 0.0803, "step": 832 }, { "epoch": 0.7397868561278863, "grad_norm": 0.6415964902419391, "learning_rate": 9.782410514556188e-06, "loss": 0.0678, "step": 833 }, { "epoch": 0.7406749555950266, "grad_norm": 0.6374999797886973, "learning_rate": 9.781278201519879e-06, "loss": 0.0674, "step": 834 }, { "epoch": 0.7415630550621669, "grad_norm": 0.6393001254832945, "learning_rate": 9.780143015781878e-06, "loss": 0.0677, "step": 835 }, { "epoch": 0.7424511545293073, "grad_norm": 0.5441955402643336, "learning_rate": 9.77900495802423e-06, "loss": 0.0703, "step": 836 }, { "epoch": 0.7433392539964476, "grad_norm": 0.5118800999904295, "learning_rate": 9.777864028930705e-06, "loss": 0.0646, "step": 837 }, { "epoch": 0.7442273534635879, "grad_norm": 0.4935923907638607, "learning_rate": 9.7767202291868e-06, "loss": 0.0512, "step": 838 }, { "epoch": 0.7451154529307282, "grad_norm": 0.5948126216510479, "learning_rate": 9.775573559479739e-06, "loss": 0.0698, "step": 839 }, { "epoch": 0.7460035523978685, "grad_norm": 0.4871301284729971, "learning_rate": 9.774424020498463e-06, "loss": 0.0668, "step": 840 }, { "epoch": 0.7468916518650088, "grad_norm": 0.47479989252048854, "learning_rate": 9.773271612933646e-06, "loss": 0.0643, "step": 841 }, { "epoch": 0.7477797513321492, "grad_norm": 0.4942090866097779, "learning_rate": 9.772116337477677e-06, "loss": 0.0797, "step": 842 }, { "epoch": 0.7486678507992895, "grad_norm": 0.5473659858639748, "learning_rate": 9.770958194824673e-06, "loss": 0.0797, "step": 843 }, { "epoch": 0.7495559502664298, "grad_norm": 0.424049921592322, "learning_rate": 9.769797185670475e-06, "loss": 0.0604, "step": 844 }, { "epoch": 0.7504440497335702, "grad_norm": 0.5564834103503165, "learning_rate": 9.768633310712643e-06, "loss": 0.0657, "step": 845 }, { "epoch": 0.7513321492007105, "grad_norm": 0.48763222699568676, "learning_rate": 9.767466570650457e-06, "loss": 0.0733, "step": 846 }, { "epoch": 0.7522202486678508, "grad_norm": 1.3304763250830134, "learning_rate": 9.766296966184925e-06, "loss": 0.0805, "step": 847 }, { "epoch": 0.7531083481349912, "grad_norm": 0.48145762725920477, "learning_rate": 9.765124498018769e-06, "loss": 0.0547, "step": 848 }, { "epoch": 0.7539964476021315, "grad_norm": 0.6854989871417136, "learning_rate": 9.763949166856436e-06, "loss": 0.0695, "step": 849 }, { "epoch": 0.7548845470692718, "grad_norm": 0.5439120991144568, "learning_rate": 9.762770973404094e-06, "loss": 0.0625, "step": 850 }, { "epoch": 0.7557726465364121, "grad_norm": 0.49542823833573935, "learning_rate": 9.761589918369626e-06, "loss": 0.0697, "step": 851 }, { "epoch": 0.7566607460035524, "grad_norm": 0.42224122911916856, "learning_rate": 9.760406002462639e-06, "loss": 0.0668, "step": 852 }, { "epoch": 0.7575488454706927, "grad_norm": 0.45058473601855664, "learning_rate": 9.759219226394455e-06, "loss": 0.0571, "step": 853 }, { "epoch": 0.7584369449378331, "grad_norm": 0.6448396174926396, "learning_rate": 9.758029590878115e-06, "loss": 0.0865, "step": 854 }, { "epoch": 0.7593250444049734, "grad_norm": 0.5077144999320161, "learning_rate": 9.756837096628384e-06, "loss": 0.0657, "step": 855 }, { "epoch": 0.7602131438721137, "grad_norm": 0.4917255888597828, "learning_rate": 9.755641744361736e-06, "loss": 0.066, "step": 856 }, { "epoch": 0.761101243339254, "grad_norm": 0.5412727578928164, "learning_rate": 9.754443534796368e-06, "loss": 0.0695, "step": 857 }, { "epoch": 0.7619893428063943, "grad_norm": 0.6417173521943089, "learning_rate": 9.753242468652191e-06, "loss": 0.0709, "step": 858 }, { "epoch": 0.7628774422735346, "grad_norm": 0.3981792957258406, "learning_rate": 9.752038546650832e-06, "loss": 0.0693, "step": 859 }, { "epoch": 0.7637655417406749, "grad_norm": 0.523667116627595, "learning_rate": 9.75083176951564e-06, "loss": 0.0559, "step": 860 }, { "epoch": 0.7646536412078153, "grad_norm": 0.5112962424889065, "learning_rate": 9.749622137971666e-06, "loss": 0.0671, "step": 861 }, { "epoch": 0.7655417406749556, "grad_norm": 0.4931265226964857, "learning_rate": 9.748409652745691e-06, "loss": 0.0626, "step": 862 }, { "epoch": 0.7664298401420959, "grad_norm": 0.5921485380445096, "learning_rate": 9.747194314566203e-06, "loss": 0.0698, "step": 863 }, { "epoch": 0.7673179396092362, "grad_norm": 0.47926368786238915, "learning_rate": 9.745976124163403e-06, "loss": 0.062, "step": 864 }, { "epoch": 0.7682060390763765, "grad_norm": 0.41810806221103497, "learning_rate": 9.744755082269209e-06, "loss": 0.0569, "step": 865 }, { "epoch": 0.7690941385435168, "grad_norm": 0.5000498796532304, "learning_rate": 9.74353118961725e-06, "loss": 0.0625, "step": 866 }, { "epoch": 0.7699822380106572, "grad_norm": 0.5931936446327853, "learning_rate": 9.74230444694287e-06, "loss": 0.0884, "step": 867 }, { "epoch": 0.7708703374777975, "grad_norm": 0.44084728448733057, "learning_rate": 9.741074854983123e-06, "loss": 0.0701, "step": 868 }, { "epoch": 0.7717584369449378, "grad_norm": 0.5016984411540918, "learning_rate": 9.739842414476778e-06, "loss": 0.06, "step": 869 }, { "epoch": 0.7726465364120781, "grad_norm": 0.5018618855198906, "learning_rate": 9.738607126164313e-06, "loss": 0.0628, "step": 870 }, { "epoch": 0.7735346358792184, "grad_norm": 0.5148250537317675, "learning_rate": 9.737368990787917e-06, "loss": 0.0599, "step": 871 }, { "epoch": 0.7744227353463587, "grad_norm": 0.47205325118182295, "learning_rate": 9.73612800909149e-06, "loss": 0.06, "step": 872 }, { "epoch": 0.7753108348134992, "grad_norm": 0.5422210072587286, "learning_rate": 9.73488418182064e-06, "loss": 0.0883, "step": 873 }, { "epoch": 0.7761989342806395, "grad_norm": 0.41956223545218563, "learning_rate": 9.733637509722693e-06, "loss": 0.069, "step": 874 }, { "epoch": 0.7770870337477798, "grad_norm": 0.5573260970497077, "learning_rate": 9.732387993546672e-06, "loss": 0.0729, "step": 875 }, { "epoch": 0.7779751332149201, "grad_norm": 0.4223713644069139, "learning_rate": 9.731135634043317e-06, "loss": 0.0561, "step": 876 }, { "epoch": 0.7788632326820604, "grad_norm": 0.42386931283969925, "learning_rate": 9.729880431965077e-06, "loss": 0.0512, "step": 877 }, { "epoch": 0.7797513321492007, "grad_norm": 0.562790329681012, "learning_rate": 9.728622388066104e-06, "loss": 0.0712, "step": 878 }, { "epoch": 0.7806394316163411, "grad_norm": 0.5223750865670449, "learning_rate": 9.727361503102259e-06, "loss": 0.0651, "step": 879 }, { "epoch": 0.7815275310834814, "grad_norm": 0.46319039337808027, "learning_rate": 9.726097777831113e-06, "loss": 0.0639, "step": 880 }, { "epoch": 0.7824156305506217, "grad_norm": 0.456317921792779, "learning_rate": 9.724831213011939e-06, "loss": 0.0621, "step": 881 }, { "epoch": 0.783303730017762, "grad_norm": 0.5425120205467575, "learning_rate": 9.723561809405717e-06, "loss": 0.0671, "step": 882 }, { "epoch": 0.7841918294849023, "grad_norm": 0.5740922445463003, "learning_rate": 9.722289567775138e-06, "loss": 0.0674, "step": 883 }, { "epoch": 0.7850799289520426, "grad_norm": 0.5297609824168771, "learning_rate": 9.72101448888459e-06, "loss": 0.0639, "step": 884 }, { "epoch": 0.7859680284191829, "grad_norm": 0.4811509902387377, "learning_rate": 9.719736573500172e-06, "loss": 0.0605, "step": 885 }, { "epoch": 0.7868561278863233, "grad_norm": 0.5922264117047007, "learning_rate": 9.718455822389683e-06, "loss": 0.0669, "step": 886 }, { "epoch": 0.7877442273534636, "grad_norm": 0.8913635608071487, "learning_rate": 9.71717223632263e-06, "loss": 0.083, "step": 887 }, { "epoch": 0.7886323268206039, "grad_norm": 0.5292427013291248, "learning_rate": 9.715885816070219e-06, "loss": 0.0748, "step": 888 }, { "epoch": 0.7895204262877442, "grad_norm": 0.5678193666157058, "learning_rate": 9.714596562405363e-06, "loss": 0.0602, "step": 889 }, { "epoch": 0.7904085257548845, "grad_norm": 0.5453521799618504, "learning_rate": 9.713304476102673e-06, "loss": 0.0668, "step": 890 }, { "epoch": 0.7912966252220248, "grad_norm": 0.7169261555524354, "learning_rate": 9.712009557938464e-06, "loss": 0.0645, "step": 891 }, { "epoch": 0.7921847246891652, "grad_norm": 0.8218667759261624, "learning_rate": 9.710711808690754e-06, "loss": 0.0845, "step": 892 }, { "epoch": 0.7930728241563055, "grad_norm": 0.5376254187160413, "learning_rate": 9.70941122913926e-06, "loss": 0.0823, "step": 893 }, { "epoch": 0.7939609236234458, "grad_norm": 0.5533386595522959, "learning_rate": 9.7081078200654e-06, "loss": 0.0726, "step": 894 }, { "epoch": 0.7948490230905861, "grad_norm": 0.5507402177237228, "learning_rate": 9.706801582252294e-06, "loss": 0.0569, "step": 895 }, { "epoch": 0.7957371225577264, "grad_norm": 0.7065760418179104, "learning_rate": 9.705492516484755e-06, "loss": 0.064, "step": 896 }, { "epoch": 0.7966252220248667, "grad_norm": 0.549266372129176, "learning_rate": 9.704180623549306e-06, "loss": 0.0748, "step": 897 }, { "epoch": 0.7975133214920072, "grad_norm": 0.5719391651082385, "learning_rate": 9.702865904234155e-06, "loss": 0.0624, "step": 898 }, { "epoch": 0.7984014209591475, "grad_norm": 0.6379308308407754, "learning_rate": 9.701548359329223e-06, "loss": 0.0719, "step": 899 }, { "epoch": 0.7992895204262878, "grad_norm": 0.5418127960029306, "learning_rate": 9.700227989626115e-06, "loss": 0.0623, "step": 900 }, { "epoch": 0.8001776198934281, "grad_norm": 0.6036706215166676, "learning_rate": 9.698904795918143e-06, "loss": 0.0607, "step": 901 }, { "epoch": 0.8010657193605684, "grad_norm": 0.5752493077654007, "learning_rate": 9.697578779000311e-06, "loss": 0.068, "step": 902 }, { "epoch": 0.8019538188277087, "grad_norm": 0.7663051795905208, "learning_rate": 9.696249939669322e-06, "loss": 0.0798, "step": 903 }, { "epoch": 0.8028419182948491, "grad_norm": 0.4723612141454474, "learning_rate": 9.694918278723573e-06, "loss": 0.0601, "step": 904 }, { "epoch": 0.8037300177619894, "grad_norm": 0.5656385272418112, "learning_rate": 9.693583796963153e-06, "loss": 0.0619, "step": 905 }, { "epoch": 0.8046181172291297, "grad_norm": 0.6699550446405373, "learning_rate": 9.692246495189851e-06, "loss": 0.0734, "step": 906 }, { "epoch": 0.80550621669627, "grad_norm": 0.5664891190443317, "learning_rate": 9.69090637420715e-06, "loss": 0.0708, "step": 907 }, { "epoch": 0.8063943161634103, "grad_norm": 0.6843597251058838, "learning_rate": 9.689563434820227e-06, "loss": 0.0702, "step": 908 }, { "epoch": 0.8072824156305506, "grad_norm": 0.683445022498873, "learning_rate": 9.688217677835946e-06, "loss": 0.0653, "step": 909 }, { "epoch": 0.8081705150976909, "grad_norm": 0.5060552768099038, "learning_rate": 9.68686910406287e-06, "loss": 0.0556, "step": 910 }, { "epoch": 0.8090586145648313, "grad_norm": 0.7986819442805254, "learning_rate": 9.685517714311257e-06, "loss": 0.0795, "step": 911 }, { "epoch": 0.8099467140319716, "grad_norm": 0.501035935929038, "learning_rate": 9.68416350939305e-06, "loss": 0.0666, "step": 912 }, { "epoch": 0.8108348134991119, "grad_norm": 0.5329884471999715, "learning_rate": 9.682806490121886e-06, "loss": 0.0775, "step": 913 }, { "epoch": 0.8117229129662522, "grad_norm": 0.54135648661835, "learning_rate": 9.681446657313093e-06, "loss": 0.0657, "step": 914 }, { "epoch": 0.8126110124333925, "grad_norm": 0.5134085253589913, "learning_rate": 9.68008401178369e-06, "loss": 0.0609, "step": 915 }, { "epoch": 0.8134991119005328, "grad_norm": 0.4434095387180611, "learning_rate": 9.678718554352388e-06, "loss": 0.0654, "step": 916 }, { "epoch": 0.8143872113676732, "grad_norm": 0.5236929010711758, "learning_rate": 9.677350285839584e-06, "loss": 0.0764, "step": 917 }, { "epoch": 0.8152753108348135, "grad_norm": 0.6994365075688941, "learning_rate": 9.675979207067366e-06, "loss": 0.0806, "step": 918 }, { "epoch": 0.8161634103019538, "grad_norm": 0.530891447866391, "learning_rate": 9.674605318859504e-06, "loss": 0.0634, "step": 919 }, { "epoch": 0.8170515097690941, "grad_norm": 0.5765555950626341, "learning_rate": 9.67322862204147e-06, "loss": 0.0725, "step": 920 }, { "epoch": 0.8179396092362344, "grad_norm": 0.5596260200652119, "learning_rate": 9.671849117440408e-06, "loss": 0.0714, "step": 921 }, { "epoch": 0.8188277087033747, "grad_norm": 0.41640827300153266, "learning_rate": 9.670466805885161e-06, "loss": 0.0628, "step": 922 }, { "epoch": 0.8197158081705151, "grad_norm": 0.5640736053598292, "learning_rate": 9.669081688206252e-06, "loss": 0.0856, "step": 923 }, { "epoch": 0.8206039076376554, "grad_norm": 0.8485734792235194, "learning_rate": 9.667693765235888e-06, "loss": 0.0747, "step": 924 }, { "epoch": 0.8214920071047958, "grad_norm": 0.5669402076142084, "learning_rate": 9.66630303780797e-06, "loss": 0.0686, "step": 925 }, { "epoch": 0.822380106571936, "grad_norm": 0.4740326434702954, "learning_rate": 9.664909506758078e-06, "loss": 0.0534, "step": 926 }, { "epoch": 0.8232682060390764, "grad_norm": 0.5194347072923514, "learning_rate": 9.663513172923472e-06, "loss": 0.069, "step": 927 }, { "epoch": 0.8241563055062167, "grad_norm": 0.6010424988290933, "learning_rate": 9.662114037143111e-06, "loss": 0.0738, "step": 928 }, { "epoch": 0.8250444049733571, "grad_norm": 0.6020466117067801, "learning_rate": 9.660712100257622e-06, "loss": 0.0737, "step": 929 }, { "epoch": 0.8259325044404974, "grad_norm": 0.5396443509643355, "learning_rate": 9.659307363109318e-06, "loss": 0.0801, "step": 930 }, { "epoch": 0.8268206039076377, "grad_norm": 0.5314961474658119, "learning_rate": 9.657899826542202e-06, "loss": 0.0517, "step": 931 }, { "epoch": 0.827708703374778, "grad_norm": 0.49738878540759734, "learning_rate": 9.656489491401954e-06, "loss": 0.064, "step": 932 }, { "epoch": 0.8285968028419183, "grad_norm": 0.48381840119758085, "learning_rate": 9.655076358535936e-06, "loss": 0.0682, "step": 933 }, { "epoch": 0.8294849023090586, "grad_norm": 0.44272053171124653, "learning_rate": 9.653660428793188e-06, "loss": 0.0581, "step": 934 }, { "epoch": 0.8303730017761989, "grad_norm": 0.4691336287450691, "learning_rate": 9.652241703024433e-06, "loss": 0.0642, "step": 935 }, { "epoch": 0.8312611012433393, "grad_norm": 0.45885267998647256, "learning_rate": 9.650820182082079e-06, "loss": 0.0627, "step": 936 }, { "epoch": 0.8321492007104796, "grad_norm": 1.0219882751792455, "learning_rate": 9.649395866820204e-06, "loss": 0.0831, "step": 937 }, { "epoch": 0.8330373001776199, "grad_norm": 0.5784342450785018, "learning_rate": 9.64796875809457e-06, "loss": 0.077, "step": 938 }, { "epoch": 0.8339253996447602, "grad_norm": 0.49797883919222224, "learning_rate": 9.646538856762617e-06, "loss": 0.0716, "step": 939 }, { "epoch": 0.8348134991119005, "grad_norm": 0.5887418894719311, "learning_rate": 9.645106163683462e-06, "loss": 0.0714, "step": 940 }, { "epoch": 0.8357015985790408, "grad_norm": 0.7748244061996492, "learning_rate": 9.6436706797179e-06, "loss": 0.0704, "step": 941 }, { "epoch": 0.8365896980461812, "grad_norm": 0.6855832463744983, "learning_rate": 9.642232405728405e-06, "loss": 0.0904, "step": 942 }, { "epoch": 0.8374777975133215, "grad_norm": 0.4877569922133212, "learning_rate": 9.640791342579123e-06, "loss": 0.0624, "step": 943 }, { "epoch": 0.8383658969804618, "grad_norm": 0.5089143536321346, "learning_rate": 9.639347491135877e-06, "loss": 0.0683, "step": 944 }, { "epoch": 0.8392539964476021, "grad_norm": 0.518918007394712, "learning_rate": 9.637900852266167e-06, "loss": 0.0616, "step": 945 }, { "epoch": 0.8401420959147424, "grad_norm": 0.5840044393755744, "learning_rate": 9.636451426839168e-06, "loss": 0.0659, "step": 946 }, { "epoch": 0.8410301953818827, "grad_norm": 0.45605601768162696, "learning_rate": 9.634999215725726e-06, "loss": 0.0668, "step": 947 }, { "epoch": 0.8419182948490231, "grad_norm": 0.4868724376811619, "learning_rate": 9.633544219798364e-06, "loss": 0.0609, "step": 948 }, { "epoch": 0.8428063943161634, "grad_norm": 0.4212420772051227, "learning_rate": 9.632086439931276e-06, "loss": 0.0537, "step": 949 }, { "epoch": 0.8436944937833037, "grad_norm": 0.5078653076714996, "learning_rate": 9.630625877000333e-06, "loss": 0.0719, "step": 950 }, { "epoch": 0.844582593250444, "grad_norm": 0.6428732000326673, "learning_rate": 9.629162531883069e-06, "loss": 0.0592, "step": 951 }, { "epoch": 0.8454706927175843, "grad_norm": 0.7805808347561376, "learning_rate": 9.6276964054587e-06, "loss": 0.0793, "step": 952 }, { "epoch": 0.8463587921847247, "grad_norm": 0.48470957339414594, "learning_rate": 9.626227498608106e-06, "loss": 0.0759, "step": 953 }, { "epoch": 0.8472468916518651, "grad_norm": 0.5914749471229294, "learning_rate": 9.624755812213842e-06, "loss": 0.0731, "step": 954 }, { "epoch": 0.8481349911190054, "grad_norm": 0.6355778200039421, "learning_rate": 9.623281347160129e-06, "loss": 0.0596, "step": 955 }, { "epoch": 0.8490230905861457, "grad_norm": 0.6496047766540088, "learning_rate": 9.621804104332859e-06, "loss": 0.0866, "step": 956 }, { "epoch": 0.849911190053286, "grad_norm": 0.6056478307229467, "learning_rate": 9.620324084619597e-06, "loss": 0.0613, "step": 957 }, { "epoch": 0.8507992895204263, "grad_norm": 0.546281173408732, "learning_rate": 9.618841288909568e-06, "loss": 0.0635, "step": 958 }, { "epoch": 0.8516873889875666, "grad_norm": 0.5559378548585396, "learning_rate": 9.617355718093673e-06, "loss": 0.0707, "step": 959 }, { "epoch": 0.8525754884547069, "grad_norm": 0.5567536912334861, "learning_rate": 9.615867373064476e-06, "loss": 0.0826, "step": 960 }, { "epoch": 0.8534635879218473, "grad_norm": 0.42827668716372647, "learning_rate": 9.614376254716209e-06, "loss": 0.0551, "step": 961 }, { "epoch": 0.8543516873889876, "grad_norm": 0.6093863639678927, "learning_rate": 9.612882363944772e-06, "loss": 0.0747, "step": 962 }, { "epoch": 0.8552397868561279, "grad_norm": 0.6073144538624808, "learning_rate": 9.611385701647724e-06, "loss": 0.0874, "step": 963 }, { "epoch": 0.8561278863232682, "grad_norm": 0.44196140894399816, "learning_rate": 9.609886268724299e-06, "loss": 0.0635, "step": 964 }, { "epoch": 0.8570159857904085, "grad_norm": 0.5774340908065915, "learning_rate": 9.608384066075387e-06, "loss": 0.0656, "step": 965 }, { "epoch": 0.8579040852575488, "grad_norm": 0.44605257552608013, "learning_rate": 9.60687909460355e-06, "loss": 0.0652, "step": 966 }, { "epoch": 0.8587921847246892, "grad_norm": 0.4880651495248036, "learning_rate": 9.605371355213006e-06, "loss": 0.0644, "step": 967 }, { "epoch": 0.8596802841918295, "grad_norm": 0.5163769073738559, "learning_rate": 9.60386084880964e-06, "loss": 0.0651, "step": 968 }, { "epoch": 0.8605683836589698, "grad_norm": 0.484318416820972, "learning_rate": 9.602347576300998e-06, "loss": 0.0693, "step": 969 }, { "epoch": 0.8614564831261101, "grad_norm": 0.6745913675404646, "learning_rate": 9.600831538596294e-06, "loss": 0.069, "step": 970 }, { "epoch": 0.8623445825932504, "grad_norm": 0.5244770412855654, "learning_rate": 9.599312736606393e-06, "loss": 0.0686, "step": 971 }, { "epoch": 0.8632326820603907, "grad_norm": 0.5117748422489878, "learning_rate": 9.597791171243826e-06, "loss": 0.0695, "step": 972 }, { "epoch": 0.8641207815275311, "grad_norm": 0.4807594270645188, "learning_rate": 9.596266843422786e-06, "loss": 0.0655, "step": 973 }, { "epoch": 0.8650088809946714, "grad_norm": 0.5078261322695101, "learning_rate": 9.594739754059126e-06, "loss": 0.0725, "step": 974 }, { "epoch": 0.8658969804618117, "grad_norm": 0.5907284204895108, "learning_rate": 9.593209904070353e-06, "loss": 0.0641, "step": 975 }, { "epoch": 0.866785079928952, "grad_norm": 0.6051899041617059, "learning_rate": 9.591677294375637e-06, "loss": 0.0765, "step": 976 }, { "epoch": 0.8676731793960923, "grad_norm": 0.48009032801043344, "learning_rate": 9.590141925895806e-06, "loss": 0.0616, "step": 977 }, { "epoch": 0.8685612788632326, "grad_norm": 0.6271830421948442, "learning_rate": 9.588603799553344e-06, "loss": 0.0668, "step": 978 }, { "epoch": 0.8694493783303731, "grad_norm": 1.143136210805576, "learning_rate": 9.587062916272395e-06, "loss": 0.0892, "step": 979 }, { "epoch": 0.8703374777975134, "grad_norm": 0.4894418333744091, "learning_rate": 9.585519276978753e-06, "loss": 0.067, "step": 980 }, { "epoch": 0.8712255772646537, "grad_norm": 0.5167641117377408, "learning_rate": 9.583972882599873e-06, "loss": 0.065, "step": 981 }, { "epoch": 0.872113676731794, "grad_norm": 0.5190912686806686, "learning_rate": 9.582423734064866e-06, "loss": 0.0746, "step": 982 }, { "epoch": 0.8730017761989343, "grad_norm": 0.6206325896144623, "learning_rate": 9.580871832304495e-06, "loss": 0.0567, "step": 983 }, { "epoch": 0.8738898756660746, "grad_norm": 0.5175071446032825, "learning_rate": 9.579317178251177e-06, "loss": 0.0632, "step": 984 }, { "epoch": 0.8747779751332149, "grad_norm": 0.5117114338677473, "learning_rate": 9.577759772838986e-06, "loss": 0.0606, "step": 985 }, { "epoch": 0.8756660746003553, "grad_norm": 0.45216716484221064, "learning_rate": 9.576199617003646e-06, "loss": 0.0559, "step": 986 }, { "epoch": 0.8765541740674956, "grad_norm": 0.5279616323617633, "learning_rate": 9.574636711682534e-06, "loss": 0.064, "step": 987 }, { "epoch": 0.8774422735346359, "grad_norm": 0.5771251219977132, "learning_rate": 9.57307105781468e-06, "loss": 0.0597, "step": 988 }, { "epoch": 0.8783303730017762, "grad_norm": 0.4623690839065104, "learning_rate": 9.571502656340766e-06, "loss": 0.0563, "step": 989 }, { "epoch": 0.8792184724689165, "grad_norm": 0.6946879392998936, "learning_rate": 9.569931508203119e-06, "loss": 0.0676, "step": 990 }, { "epoch": 0.8801065719360568, "grad_norm": 0.4585146234886463, "learning_rate": 9.568357614345726e-06, "loss": 0.0733, "step": 991 }, { "epoch": 0.8809946714031972, "grad_norm": 0.6145459902022079, "learning_rate": 9.566780975714216e-06, "loss": 0.0685, "step": 992 }, { "epoch": 0.8818827708703375, "grad_norm": 0.5042544370825427, "learning_rate": 9.565201593255871e-06, "loss": 0.0636, "step": 993 }, { "epoch": 0.8827708703374778, "grad_norm": 0.527895658494514, "learning_rate": 9.563619467919618e-06, "loss": 0.064, "step": 994 }, { "epoch": 0.8836589698046181, "grad_norm": 0.8562131843407302, "learning_rate": 9.562034600656037e-06, "loss": 0.0686, "step": 995 }, { "epoch": 0.8845470692717584, "grad_norm": 0.6663629069562428, "learning_rate": 9.560446992417352e-06, "loss": 0.0712, "step": 996 }, { "epoch": 0.8854351687388987, "grad_norm": 0.5187613898577973, "learning_rate": 9.558856644157432e-06, "loss": 0.0772, "step": 997 }, { "epoch": 0.8863232682060391, "grad_norm": 0.47580394813478066, "learning_rate": 9.557263556831797e-06, "loss": 0.0527, "step": 998 }, { "epoch": 0.8872113676731794, "grad_norm": 0.7369218631811545, "learning_rate": 9.55566773139761e-06, "loss": 0.0916, "step": 999 }, { "epoch": 0.8880994671403197, "grad_norm": 0.5202148270559931, "learning_rate": 9.55406916881368e-06, "loss": 0.0719, "step": 1000 }, { "epoch": 0.88898756660746, "grad_norm": 0.4451803025522105, "learning_rate": 9.55246787004046e-06, "loss": 0.0651, "step": 1001 }, { "epoch": 0.8898756660746003, "grad_norm": 0.6365441468045436, "learning_rate": 9.550863836040046e-06, "loss": 0.0791, "step": 1002 }, { "epoch": 0.8907637655417406, "grad_norm": 0.6417144379958832, "learning_rate": 9.549257067776179e-06, "loss": 0.0648, "step": 1003 }, { "epoch": 0.8916518650088809, "grad_norm": 0.5006334639118971, "learning_rate": 9.547647566214242e-06, "loss": 0.0782, "step": 1004 }, { "epoch": 0.8925399644760214, "grad_norm": 0.6095277759149481, "learning_rate": 9.546035332321261e-06, "loss": 0.063, "step": 1005 }, { "epoch": 0.8934280639431617, "grad_norm": 0.67774395122714, "learning_rate": 9.544420367065905e-06, "loss": 0.0716, "step": 1006 }, { "epoch": 0.894316163410302, "grad_norm": 0.580267254473296, "learning_rate": 9.542802671418479e-06, "loss": 0.078, "step": 1007 }, { "epoch": 0.8952042628774423, "grad_norm": 0.7461816578324983, "learning_rate": 9.54118224635093e-06, "loss": 0.0813, "step": 1008 }, { "epoch": 0.8960923623445826, "grad_norm": 0.5764943758208013, "learning_rate": 9.539559092836853e-06, "loss": 0.0668, "step": 1009 }, { "epoch": 0.8969804618117229, "grad_norm": 0.5226817090572494, "learning_rate": 9.53793321185147e-06, "loss": 0.0665, "step": 1010 }, { "epoch": 0.8978685612788633, "grad_norm": 0.671609926008543, "learning_rate": 9.536304604371653e-06, "loss": 0.0601, "step": 1011 }, { "epoch": 0.8987566607460036, "grad_norm": 0.5644239599358581, "learning_rate": 9.534673271375902e-06, "loss": 0.0831, "step": 1012 }, { "epoch": 0.8996447602131439, "grad_norm": 0.5245491734026599, "learning_rate": 9.53303921384436e-06, "loss": 0.0652, "step": 1013 }, { "epoch": 0.9005328596802842, "grad_norm": 0.735750041713909, "learning_rate": 9.53140243275881e-06, "loss": 0.0807, "step": 1014 }, { "epoch": 0.9014209591474245, "grad_norm": 0.6719349064728679, "learning_rate": 9.529762929102662e-06, "loss": 0.0673, "step": 1015 }, { "epoch": 0.9023090586145648, "grad_norm": 0.46147220043042025, "learning_rate": 9.528120703860971e-06, "loss": 0.0552, "step": 1016 }, { "epoch": 0.9031971580817052, "grad_norm": 0.6362605578497083, "learning_rate": 9.526475758020424e-06, "loss": 0.0662, "step": 1017 }, { "epoch": 0.9040852575488455, "grad_norm": 0.6092023673577541, "learning_rate": 9.52482809256934e-06, "loss": 0.0696, "step": 1018 }, { "epoch": 0.9049733570159858, "grad_norm": 0.5550600604822477, "learning_rate": 9.523177708497677e-06, "loss": 0.0674, "step": 1019 }, { "epoch": 0.9058614564831261, "grad_norm": 0.689868458127882, "learning_rate": 9.521524606797021e-06, "loss": 0.0672, "step": 1020 }, { "epoch": 0.9067495559502664, "grad_norm": 0.5355417556080468, "learning_rate": 9.519868788460594e-06, "loss": 0.0597, "step": 1021 }, { "epoch": 0.9076376554174067, "grad_norm": 0.4786559073365717, "learning_rate": 9.518210254483252e-06, "loss": 0.0608, "step": 1022 }, { "epoch": 0.9085257548845471, "grad_norm": 0.5638304194002016, "learning_rate": 9.516549005861477e-06, "loss": 0.0749, "step": 1023 }, { "epoch": 0.9094138543516874, "grad_norm": 0.5388374617346837, "learning_rate": 9.514885043593387e-06, "loss": 0.0718, "step": 1024 }, { "epoch": 0.9103019538188277, "grad_norm": 0.46101867467829, "learning_rate": 9.513218368678727e-06, "loss": 0.0646, "step": 1025 }, { "epoch": 0.911190053285968, "grad_norm": 0.5702452001729812, "learning_rate": 9.511548982118876e-06, "loss": 0.07, "step": 1026 }, { "epoch": 0.9120781527531083, "grad_norm": 0.5077496863804494, "learning_rate": 9.50987688491684e-06, "loss": 0.0631, "step": 1027 }, { "epoch": 0.9129662522202486, "grad_norm": 0.6669979779374497, "learning_rate": 9.508202078077253e-06, "loss": 0.0641, "step": 1028 }, { "epoch": 0.9138543516873889, "grad_norm": 0.5372617408127478, "learning_rate": 9.506524562606373e-06, "loss": 0.058, "step": 1029 }, { "epoch": 0.9147424511545293, "grad_norm": 0.5388729086198248, "learning_rate": 9.504844339512096e-06, "loss": 0.0726, "step": 1030 }, { "epoch": 0.9156305506216696, "grad_norm": 0.6114003454684426, "learning_rate": 9.503161409803936e-06, "loss": 0.0711, "step": 1031 }, { "epoch": 0.91651865008881, "grad_norm": 0.4485342434354438, "learning_rate": 9.501475774493034e-06, "loss": 0.0636, "step": 1032 }, { "epoch": 0.9174067495559503, "grad_norm": 0.6828271051823804, "learning_rate": 9.499787434592162e-06, "loss": 0.0732, "step": 1033 }, { "epoch": 0.9182948490230906, "grad_norm": 0.599305022824422, "learning_rate": 9.49809639111571e-06, "loss": 0.0727, "step": 1034 }, { "epoch": 0.9191829484902309, "grad_norm": 0.5383385780080625, "learning_rate": 9.4964026450797e-06, "loss": 0.0555, "step": 1035 }, { "epoch": 0.9200710479573713, "grad_norm": 0.47192921256501275, "learning_rate": 9.494706197501768e-06, "loss": 0.0596, "step": 1036 }, { "epoch": 0.9209591474245116, "grad_norm": 0.46543843473849966, "learning_rate": 9.493007049401182e-06, "loss": 0.065, "step": 1037 }, { "epoch": 0.9218472468916519, "grad_norm": 0.4809339614623937, "learning_rate": 9.491305201798829e-06, "loss": 0.0611, "step": 1038 }, { "epoch": 0.9227353463587922, "grad_norm": 0.5478524759971349, "learning_rate": 9.489600655717217e-06, "loss": 0.0664, "step": 1039 }, { "epoch": 0.9236234458259325, "grad_norm": 0.5590802808369877, "learning_rate": 9.487893412180478e-06, "loss": 0.0776, "step": 1040 }, { "epoch": 0.9245115452930728, "grad_norm": 0.5249819885631613, "learning_rate": 9.48618347221436e-06, "loss": 0.0654, "step": 1041 }, { "epoch": 0.9253996447602132, "grad_norm": 0.6157061724908073, "learning_rate": 9.48447083684624e-06, "loss": 0.0779, "step": 1042 }, { "epoch": 0.9262877442273535, "grad_norm": 0.5191446647526017, "learning_rate": 9.482755507105102e-06, "loss": 0.0693, "step": 1043 }, { "epoch": 0.9271758436944938, "grad_norm": 0.5290036118271478, "learning_rate": 9.48103748402156e-06, "loss": 0.0748, "step": 1044 }, { "epoch": 0.9280639431616341, "grad_norm": 0.4281662222413074, "learning_rate": 9.47931676862784e-06, "loss": 0.0578, "step": 1045 }, { "epoch": 0.9289520426287744, "grad_norm": 0.6704581542680114, "learning_rate": 9.477593361957786e-06, "loss": 0.0845, "step": 1046 }, { "epoch": 0.9298401420959147, "grad_norm": 0.5123744289336686, "learning_rate": 9.475867265046865e-06, "loss": 0.0716, "step": 1047 }, { "epoch": 0.9307282415630551, "grad_norm": 0.48433634155634114, "learning_rate": 9.47413847893215e-06, "loss": 0.0634, "step": 1048 }, { "epoch": 0.9316163410301954, "grad_norm": 0.728396879512879, "learning_rate": 9.472407004652337e-06, "loss": 0.0636, "step": 1049 }, { "epoch": 0.9325044404973357, "grad_norm": 0.41668367278311613, "learning_rate": 9.470672843247739e-06, "loss": 0.0514, "step": 1050 }, { "epoch": 0.933392539964476, "grad_norm": 0.48971811331996123, "learning_rate": 9.468935995760275e-06, "loss": 0.0639, "step": 1051 }, { "epoch": 0.9342806394316163, "grad_norm": 0.5546859724046497, "learning_rate": 9.467196463233488e-06, "loss": 0.0558, "step": 1052 }, { "epoch": 0.9351687388987566, "grad_norm": 0.658139536168307, "learning_rate": 9.465454246712524e-06, "loss": 0.0709, "step": 1053 }, { "epoch": 0.9360568383658969, "grad_norm": 0.623885610850787, "learning_rate": 9.463709347244147e-06, "loss": 0.0679, "step": 1054 }, { "epoch": 0.9369449378330373, "grad_norm": 0.46245415470167184, "learning_rate": 9.461961765876739e-06, "loss": 0.0532, "step": 1055 }, { "epoch": 0.9378330373001776, "grad_norm": 0.4949200069813079, "learning_rate": 9.46021150366028e-06, "loss": 0.0693, "step": 1056 }, { "epoch": 0.9387211367673179, "grad_norm": 0.4441815788870761, "learning_rate": 9.45845856164637e-06, "loss": 0.0584, "step": 1057 }, { "epoch": 0.9396092362344582, "grad_norm": 0.4435068922625452, "learning_rate": 9.456702940888219e-06, "loss": 0.0662, "step": 1058 }, { "epoch": 0.9404973357015985, "grad_norm": 0.43640362396963367, "learning_rate": 9.45494464244064e-06, "loss": 0.0557, "step": 1059 }, { "epoch": 0.9413854351687388, "grad_norm": 0.6458370718179861, "learning_rate": 9.453183667360062e-06, "loss": 0.0677, "step": 1060 }, { "epoch": 0.9422735346358793, "grad_norm": 0.5635397337457626, "learning_rate": 9.451420016704518e-06, "loss": 0.0619, "step": 1061 }, { "epoch": 0.9431616341030196, "grad_norm": 0.47572955056498234, "learning_rate": 9.449653691533651e-06, "loss": 0.0572, "step": 1062 }, { "epoch": 0.9440497335701599, "grad_norm": 0.6126902376241518, "learning_rate": 9.44788469290871e-06, "loss": 0.0594, "step": 1063 }, { "epoch": 0.9449378330373002, "grad_norm": 0.5635487410460365, "learning_rate": 9.446113021892549e-06, "loss": 0.0649, "step": 1064 }, { "epoch": 0.9458259325044405, "grad_norm": 0.577763532992864, "learning_rate": 9.444338679549627e-06, "loss": 0.0715, "step": 1065 }, { "epoch": 0.9467140319715808, "grad_norm": 0.7331856587215069, "learning_rate": 9.442561666946013e-06, "loss": 0.0591, "step": 1066 }, { "epoch": 0.9476021314387212, "grad_norm": 0.5643191285005896, "learning_rate": 9.440781985149375e-06, "loss": 0.0651, "step": 1067 }, { "epoch": 0.9484902309058615, "grad_norm": 0.5165665085008259, "learning_rate": 9.438999635228985e-06, "loss": 0.066, "step": 1068 }, { "epoch": 0.9493783303730018, "grad_norm": 0.6171838638410556, "learning_rate": 9.437214618255725e-06, "loss": 0.0637, "step": 1069 }, { "epoch": 0.9502664298401421, "grad_norm": 0.4693377019678983, "learning_rate": 9.435426935302068e-06, "loss": 0.0588, "step": 1070 }, { "epoch": 0.9511545293072824, "grad_norm": 0.620246955872732, "learning_rate": 9.4336365874421e-06, "loss": 0.0589, "step": 1071 }, { "epoch": 0.9520426287744227, "grad_norm": 0.5373583216159635, "learning_rate": 9.4318435757515e-06, "loss": 0.0735, "step": 1072 }, { "epoch": 0.9529307282415631, "grad_norm": 0.5173177306742143, "learning_rate": 9.43004790130755e-06, "loss": 0.0587, "step": 1073 }, { "epoch": 0.9538188277087034, "grad_norm": 0.5668213584644644, "learning_rate": 9.428249565189136e-06, "loss": 0.0715, "step": 1074 }, { "epoch": 0.9547069271758437, "grad_norm": 0.5034740577735468, "learning_rate": 9.426448568476736e-06, "loss": 0.0689, "step": 1075 }, { "epoch": 0.955595026642984, "grad_norm": 0.5051585288129055, "learning_rate": 9.424644912252434e-06, "loss": 0.0611, "step": 1076 }, { "epoch": 0.9564831261101243, "grad_norm": 0.3813464929806569, "learning_rate": 9.422838597599904e-06, "loss": 0.0503, "step": 1077 }, { "epoch": 0.9573712255772646, "grad_norm": 0.4715908502621322, "learning_rate": 9.421029625604423e-06, "loss": 0.0626, "step": 1078 }, { "epoch": 0.9582593250444049, "grad_norm": 0.5345699657625882, "learning_rate": 9.419217997352863e-06, "loss": 0.0597, "step": 1079 }, { "epoch": 0.9591474245115453, "grad_norm": 0.5450263154606977, "learning_rate": 9.417403713933692e-06, "loss": 0.0764, "step": 1080 }, { "epoch": 0.9600355239786856, "grad_norm": 0.5583774630425197, "learning_rate": 9.415586776436973e-06, "loss": 0.0706, "step": 1081 }, { "epoch": 0.9609236234458259, "grad_norm": 0.5041062859022016, "learning_rate": 9.413767185954365e-06, "loss": 0.0594, "step": 1082 }, { "epoch": 0.9618117229129662, "grad_norm": 0.46181940102981356, "learning_rate": 9.411944943579117e-06, "loss": 0.0726, "step": 1083 }, { "epoch": 0.9626998223801065, "grad_norm": 0.487345534214178, "learning_rate": 9.410120050406075e-06, "loss": 0.0635, "step": 1084 }, { "epoch": 0.9635879218472468, "grad_norm": 0.5441846328667687, "learning_rate": 9.408292507531679e-06, "loss": 0.0583, "step": 1085 }, { "epoch": 0.9644760213143873, "grad_norm": 0.5494892826459017, "learning_rate": 9.406462316053954e-06, "loss": 0.064, "step": 1086 }, { "epoch": 0.9653641207815276, "grad_norm": 0.43112610113029626, "learning_rate": 9.404629477072526e-06, "loss": 0.0498, "step": 1087 }, { "epoch": 0.9662522202486679, "grad_norm": 0.5123322765087007, "learning_rate": 9.402793991688604e-06, "loss": 0.0657, "step": 1088 }, { "epoch": 0.9671403197158082, "grad_norm": 0.4178380585471896, "learning_rate": 9.40095586100499e-06, "loss": 0.0599, "step": 1089 }, { "epoch": 0.9680284191829485, "grad_norm": 0.6153530714811697, "learning_rate": 9.399115086126077e-06, "loss": 0.0743, "step": 1090 }, { "epoch": 0.9689165186500888, "grad_norm": 0.4595165540363119, "learning_rate": 9.397271668157842e-06, "loss": 0.0556, "step": 1091 }, { "epoch": 0.9698046181172292, "grad_norm": 0.5700639774522798, "learning_rate": 9.395425608207854e-06, "loss": 0.0654, "step": 1092 }, { "epoch": 0.9706927175843695, "grad_norm": 0.5820496812169939, "learning_rate": 9.393576907385268e-06, "loss": 0.0561, "step": 1093 }, { "epoch": 0.9715808170515098, "grad_norm": 0.5104423509205583, "learning_rate": 9.391725566800828e-06, "loss": 0.0572, "step": 1094 }, { "epoch": 0.9724689165186501, "grad_norm": 0.6683733021598959, "learning_rate": 9.389871587566859e-06, "loss": 0.0721, "step": 1095 }, { "epoch": 0.9733570159857904, "grad_norm": 0.5392379578662706, "learning_rate": 9.388014970797275e-06, "loss": 0.0653, "step": 1096 }, { "epoch": 0.9742451154529307, "grad_norm": 0.5184363680081238, "learning_rate": 9.386155717607575e-06, "loss": 0.0709, "step": 1097 }, { "epoch": 0.9751332149200711, "grad_norm": 0.4985516172717562, "learning_rate": 9.384293829114841e-06, "loss": 0.0585, "step": 1098 }, { "epoch": 0.9760213143872114, "grad_norm": 0.48932133345360795, "learning_rate": 9.382429306437738e-06, "loss": 0.0528, "step": 1099 }, { "epoch": 0.9769094138543517, "grad_norm": 0.5141853488077307, "learning_rate": 9.38056215069651e-06, "loss": 0.0562, "step": 1100 }, { "epoch": 0.977797513321492, "grad_norm": 0.5853949439809513, "learning_rate": 9.378692363012995e-06, "loss": 0.07, "step": 1101 }, { "epoch": 0.9786856127886323, "grad_norm": 0.6129353009897632, "learning_rate": 9.376819944510598e-06, "loss": 0.0687, "step": 1102 }, { "epoch": 0.9795737122557726, "grad_norm": 0.5566429527473071, "learning_rate": 9.374944896314314e-06, "loss": 0.0601, "step": 1103 }, { "epoch": 0.9804618117229129, "grad_norm": 0.47189365987257625, "learning_rate": 9.373067219550713e-06, "loss": 0.0619, "step": 1104 }, { "epoch": 0.9813499111900533, "grad_norm": 0.6488893886846454, "learning_rate": 9.37118691534795e-06, "loss": 0.071, "step": 1105 }, { "epoch": 0.9822380106571936, "grad_norm": 0.3888036759640904, "learning_rate": 9.36930398483575e-06, "loss": 0.0549, "step": 1106 }, { "epoch": 0.9831261101243339, "grad_norm": 0.5445655296183051, "learning_rate": 9.367418429145423e-06, "loss": 0.0679, "step": 1107 }, { "epoch": 0.9840142095914742, "grad_norm": 0.3935911621353518, "learning_rate": 9.365530249409855e-06, "loss": 0.0549, "step": 1108 }, { "epoch": 0.9849023090586145, "grad_norm": 0.6394614626639835, "learning_rate": 9.363639446763508e-06, "loss": 0.0595, "step": 1109 }, { "epoch": 0.9857904085257548, "grad_norm": 0.4321186532161739, "learning_rate": 9.36174602234242e-06, "loss": 0.0534, "step": 1110 }, { "epoch": 0.9866785079928952, "grad_norm": 0.4425938193990536, "learning_rate": 9.359849977284199e-06, "loss": 0.0647, "step": 1111 }, { "epoch": 0.9875666074600356, "grad_norm": 0.4764434667907115, "learning_rate": 9.357951312728037e-06, "loss": 0.0636, "step": 1112 }, { "epoch": 0.9884547069271759, "grad_norm": 0.5878975417629865, "learning_rate": 9.356050029814696e-06, "loss": 0.0731, "step": 1113 }, { "epoch": 0.9893428063943162, "grad_norm": 0.3979984912788497, "learning_rate": 9.354146129686507e-06, "loss": 0.0661, "step": 1114 }, { "epoch": 0.9902309058614565, "grad_norm": 0.41184644729096215, "learning_rate": 9.35223961348738e-06, "loss": 0.0527, "step": 1115 }, { "epoch": 0.9911190053285968, "grad_norm": 0.46249426522235615, "learning_rate": 9.350330482362791e-06, "loss": 0.0623, "step": 1116 }, { "epoch": 0.9920071047957372, "grad_norm": 0.36724731132242894, "learning_rate": 9.34841873745979e-06, "loss": 0.0458, "step": 1117 }, { "epoch": 0.9928952042628775, "grad_norm": 0.4579120079080996, "learning_rate": 9.346504379927001e-06, "loss": 0.0658, "step": 1118 }, { "epoch": 0.9937833037300178, "grad_norm": 0.5436563697628319, "learning_rate": 9.34458741091461e-06, "loss": 0.0713, "step": 1119 }, { "epoch": 0.9946714031971581, "grad_norm": 0.4968627889604334, "learning_rate": 9.342667831574377e-06, "loss": 0.06, "step": 1120 }, { "epoch": 0.9955595026642984, "grad_norm": 0.37477046083848015, "learning_rate": 9.34074564305963e-06, "loss": 0.0514, "step": 1121 }, { "epoch": 0.9964476021314387, "grad_norm": 0.4385411957434755, "learning_rate": 9.338820846525262e-06, "loss": 0.0507, "step": 1122 }, { "epoch": 0.9973357015985791, "grad_norm": 0.5427375302508839, "learning_rate": 9.336893443127739e-06, "loss": 0.0667, "step": 1123 }, { "epoch": 0.9982238010657194, "grad_norm": 0.7999683100644676, "learning_rate": 9.334963434025085e-06, "loss": 0.0729, "step": 1124 }, { "epoch": 0.9991119005328597, "grad_norm": 0.3699119827224097, "learning_rate": 9.333030820376896e-06, "loss": 0.0487, "step": 1125 }, { "epoch": 1.0, "grad_norm": 0.508829292943249, "learning_rate": 9.331095603344329e-06, "loss": 0.0618, "step": 1126 }, { "epoch": 1.0, "eval_loss": 0.06568732112646103, "eval_runtime": 81.3182, "eval_samples_per_second": 186.502, "eval_steps_per_second": 2.914, "step": 1126 }, { "epoch": 1.0008880994671403, "grad_norm": 0.6480601826524256, "learning_rate": 9.329157784090108e-06, "loss": 0.0648, "step": 1127 }, { "epoch": 1.0017761989342806, "grad_norm": 0.4163022145377359, "learning_rate": 9.32721736377852e-06, "loss": 0.0537, "step": 1128 }, { "epoch": 1.002664298401421, "grad_norm": 0.5133094034883717, "learning_rate": 9.325274343575412e-06, "loss": 0.0515, "step": 1129 }, { "epoch": 1.0035523978685612, "grad_norm": 0.5658073414535006, "learning_rate": 9.323328724648197e-06, "loss": 0.0601, "step": 1130 }, { "epoch": 1.0044404973357015, "grad_norm": 0.6245709824367127, "learning_rate": 9.321380508165848e-06, "loss": 0.0604, "step": 1131 }, { "epoch": 1.0053285968028418, "grad_norm": 0.5594061879879259, "learning_rate": 9.319429695298896e-06, "loss": 0.0633, "step": 1132 }, { "epoch": 1.0062166962699823, "grad_norm": 0.6301457475894502, "learning_rate": 9.317476287219436e-06, "loss": 0.0629, "step": 1133 }, { "epoch": 1.0071047957371226, "grad_norm": 0.5679741582137742, "learning_rate": 9.315520285101118e-06, "loss": 0.0658, "step": 1134 }, { "epoch": 1.007992895204263, "grad_norm": 0.45827959597629336, "learning_rate": 9.313561690119157e-06, "loss": 0.0515, "step": 1135 }, { "epoch": 1.0088809946714032, "grad_norm": 0.6916115911467748, "learning_rate": 9.311600503450318e-06, "loss": 0.0691, "step": 1136 }, { "epoch": 1.0097690941385435, "grad_norm": 0.4706381914559495, "learning_rate": 9.309636726272929e-06, "loss": 0.0563, "step": 1137 }, { "epoch": 1.0106571936056838, "grad_norm": 0.4947347542946586, "learning_rate": 9.30767035976687e-06, "loss": 0.0565, "step": 1138 }, { "epoch": 1.0115452930728241, "grad_norm": 0.5068531468493555, "learning_rate": 9.305701405113582e-06, "loss": 0.0556, "step": 1139 }, { "epoch": 1.0124333925399644, "grad_norm": 0.6182707868884152, "learning_rate": 9.303729863496057e-06, "loss": 0.0636, "step": 1140 }, { "epoch": 1.0133214920071048, "grad_norm": 0.5329607011098876, "learning_rate": 9.301755736098843e-06, "loss": 0.0752, "step": 1141 }, { "epoch": 1.014209591474245, "grad_norm": 0.5715930875894688, "learning_rate": 9.29977902410804e-06, "loss": 0.0641, "step": 1142 }, { "epoch": 1.0150976909413854, "grad_norm": 0.591692404601877, "learning_rate": 9.297799728711303e-06, "loss": 0.0589, "step": 1143 }, { "epoch": 1.0159857904085257, "grad_norm": 0.4417333733225122, "learning_rate": 9.295817851097836e-06, "loss": 0.0598, "step": 1144 }, { "epoch": 1.0168738898756662, "grad_norm": 0.5009656149763125, "learning_rate": 9.2938333924584e-06, "loss": 0.0411, "step": 1145 }, { "epoch": 1.0177619893428065, "grad_norm": 0.4530511856701877, "learning_rate": 9.291846353985301e-06, "loss": 0.0454, "step": 1146 }, { "epoch": 1.0186500888099468, "grad_norm": 0.8092573734074219, "learning_rate": 9.289856736872398e-06, "loss": 0.0721, "step": 1147 }, { "epoch": 1.019538188277087, "grad_norm": 0.6030690101170599, "learning_rate": 9.287864542315099e-06, "loss": 0.0655, "step": 1148 }, { "epoch": 1.0204262877442274, "grad_norm": 1.060691763095867, "learning_rate": 9.285869771510359e-06, "loss": 0.0603, "step": 1149 }, { "epoch": 1.0213143872113677, "grad_norm": 0.7265882392430735, "learning_rate": 9.283872425656681e-06, "loss": 0.0803, "step": 1150 }, { "epoch": 1.022202486678508, "grad_norm": 0.47803413651821886, "learning_rate": 9.281872505954121e-06, "loss": 0.062, "step": 1151 }, { "epoch": 1.0230905861456483, "grad_norm": 0.6041289456455284, "learning_rate": 9.279870013604273e-06, "loss": 0.0554, "step": 1152 }, { "epoch": 1.0239786856127886, "grad_norm": 0.5229855759529586, "learning_rate": 9.27786494981028e-06, "loss": 0.0644, "step": 1153 }, { "epoch": 1.024866785079929, "grad_norm": 0.6121251944543281, "learning_rate": 9.27585731577683e-06, "loss": 0.0692, "step": 1154 }, { "epoch": 1.0257548845470692, "grad_norm": 0.5008161653178155, "learning_rate": 9.273847112710159e-06, "loss": 0.055, "step": 1155 }, { "epoch": 1.0266429840142095, "grad_norm": 1.3550527848348757, "learning_rate": 9.271834341818043e-06, "loss": 0.0626, "step": 1156 }, { "epoch": 1.0275310834813498, "grad_norm": 0.6086989420332563, "learning_rate": 9.269819004309796e-06, "loss": 0.0696, "step": 1157 }, { "epoch": 1.0284191829484903, "grad_norm": 0.4398267573695782, "learning_rate": 9.267801101396284e-06, "loss": 0.057, "step": 1158 }, { "epoch": 1.0293072824156306, "grad_norm": 0.5023864760744993, "learning_rate": 9.265780634289905e-06, "loss": 0.0522, "step": 1159 }, { "epoch": 1.030195381882771, "grad_norm": 0.5446662737483264, "learning_rate": 9.263757604204607e-06, "loss": 0.0785, "step": 1160 }, { "epoch": 1.0310834813499112, "grad_norm": 0.5002911503950366, "learning_rate": 9.261732012355868e-06, "loss": 0.0673, "step": 1161 }, { "epoch": 1.0319715808170515, "grad_norm": 0.534388338135266, "learning_rate": 9.259703859960713e-06, "loss": 0.0606, "step": 1162 }, { "epoch": 1.0328596802841918, "grad_norm": 0.45775255151671196, "learning_rate": 9.257673148237705e-06, "loss": 0.0513, "step": 1163 }, { "epoch": 1.0337477797513321, "grad_norm": 0.5085806099929371, "learning_rate": 9.255639878406937e-06, "loss": 0.0571, "step": 1164 }, { "epoch": 1.0346358792184724, "grad_norm": 0.5955705550431354, "learning_rate": 9.253604051690047e-06, "loss": 0.0709, "step": 1165 }, { "epoch": 1.0355239786856127, "grad_norm": 0.5178355763041769, "learning_rate": 9.251565669310204e-06, "loss": 0.0601, "step": 1166 }, { "epoch": 1.036412078152753, "grad_norm": 0.48472634314685387, "learning_rate": 9.249524732492118e-06, "loss": 0.0507, "step": 1167 }, { "epoch": 1.0373001776198933, "grad_norm": 0.5582223693364302, "learning_rate": 9.24748124246203e-06, "loss": 0.0582, "step": 1168 }, { "epoch": 1.0381882770870337, "grad_norm": 0.5548798266580908, "learning_rate": 9.245435200447715e-06, "loss": 0.0726, "step": 1169 }, { "epoch": 1.0390763765541742, "grad_norm": 0.5344192014597939, "learning_rate": 9.243386607678482e-06, "loss": 0.057, "step": 1170 }, { "epoch": 1.0399644760213145, "grad_norm": 0.5022659329476004, "learning_rate": 9.241335465385171e-06, "loss": 0.0602, "step": 1171 }, { "epoch": 1.0408525754884548, "grad_norm": 0.6086703551030485, "learning_rate": 9.239281774800159e-06, "loss": 0.0581, "step": 1172 }, { "epoch": 1.041740674955595, "grad_norm": 0.5527578198795071, "learning_rate": 9.23722553715735e-06, "loss": 0.0616, "step": 1173 }, { "epoch": 1.0426287744227354, "grad_norm": 0.5043530641652352, "learning_rate": 9.235166753692176e-06, "loss": 0.0641, "step": 1174 }, { "epoch": 1.0435168738898757, "grad_norm": 0.4534779142698993, "learning_rate": 9.233105425641601e-06, "loss": 0.048, "step": 1175 }, { "epoch": 1.044404973357016, "grad_norm": 0.45940582209571523, "learning_rate": 9.23104155424412e-06, "loss": 0.0547, "step": 1176 }, { "epoch": 1.0452930728241563, "grad_norm": 0.5998582500885319, "learning_rate": 9.228975140739756e-06, "loss": 0.0673, "step": 1177 }, { "epoch": 1.0461811722912966, "grad_norm": 0.42349229659058435, "learning_rate": 9.226906186370057e-06, "loss": 0.0529, "step": 1178 }, { "epoch": 1.047069271758437, "grad_norm": 0.5217090012758729, "learning_rate": 9.224834692378095e-06, "loss": 0.0613, "step": 1179 }, { "epoch": 1.0479573712255772, "grad_norm": 0.5357568673665954, "learning_rate": 9.222760660008474e-06, "loss": 0.0563, "step": 1180 }, { "epoch": 1.0488454706927175, "grad_norm": 0.48336047622715606, "learning_rate": 9.220684090507318e-06, "loss": 0.0512, "step": 1181 }, { "epoch": 1.0497335701598578, "grad_norm": 0.4624025252082009, "learning_rate": 9.218604985122282e-06, "loss": 0.0508, "step": 1182 }, { "epoch": 1.0506216696269983, "grad_norm": 0.46561654488313764, "learning_rate": 9.216523345102534e-06, "loss": 0.0517, "step": 1183 }, { "epoch": 1.0515097690941386, "grad_norm": 0.4648713622907487, "learning_rate": 9.214439171698777e-06, "loss": 0.0607, "step": 1184 }, { "epoch": 1.052397868561279, "grad_norm": 0.5092774520008562, "learning_rate": 9.212352466163226e-06, "loss": 0.0682, "step": 1185 }, { "epoch": 1.0532859680284192, "grad_norm": 0.5088605384399475, "learning_rate": 9.210263229749626e-06, "loss": 0.0638, "step": 1186 }, { "epoch": 1.0541740674955595, "grad_norm": 0.3941412719758881, "learning_rate": 9.208171463713234e-06, "loss": 0.0516, "step": 1187 }, { "epoch": 1.0550621669626998, "grad_norm": 0.5232270076417497, "learning_rate": 9.206077169310833e-06, "loss": 0.0754, "step": 1188 }, { "epoch": 1.0559502664298401, "grad_norm": 0.6567846124696248, "learning_rate": 9.203980347800721e-06, "loss": 0.0734, "step": 1189 }, { "epoch": 1.0568383658969804, "grad_norm": 0.5024428301117129, "learning_rate": 9.20188100044272e-06, "loss": 0.0683, "step": 1190 }, { "epoch": 1.0577264653641207, "grad_norm": 0.5645115632471462, "learning_rate": 9.199779128498163e-06, "loss": 0.0671, "step": 1191 }, { "epoch": 1.058614564831261, "grad_norm": 0.46395712255910054, "learning_rate": 9.197674733229904e-06, "loss": 0.0613, "step": 1192 }, { "epoch": 1.0595026642984013, "grad_norm": 0.5656733209301367, "learning_rate": 9.195567815902313e-06, "loss": 0.0697, "step": 1193 }, { "epoch": 1.0603907637655416, "grad_norm": 0.45851005309044807, "learning_rate": 9.193458377781273e-06, "loss": 0.0638, "step": 1194 }, { "epoch": 1.061278863232682, "grad_norm": 0.5478213904125951, "learning_rate": 9.191346420134183e-06, "loss": 0.0505, "step": 1195 }, { "epoch": 1.0621669626998225, "grad_norm": 0.5255656506155585, "learning_rate": 9.189231944229957e-06, "loss": 0.0658, "step": 1196 }, { "epoch": 1.0630550621669628, "grad_norm": 0.3767442674250263, "learning_rate": 9.187114951339021e-06, "loss": 0.0444, "step": 1197 }, { "epoch": 1.063943161634103, "grad_norm": 0.47321589853516277, "learning_rate": 9.184995442733309e-06, "loss": 0.0504, "step": 1198 }, { "epoch": 1.0648312611012434, "grad_norm": 0.48396521102034207, "learning_rate": 9.182873419686273e-06, "loss": 0.0501, "step": 1199 }, { "epoch": 1.0657193605683837, "grad_norm": 0.3783512928266631, "learning_rate": 9.180748883472874e-06, "loss": 0.0563, "step": 1200 }, { "epoch": 1.066607460035524, "grad_norm": 0.5583245420130393, "learning_rate": 9.178621835369581e-06, "loss": 0.0535, "step": 1201 }, { "epoch": 1.0674955595026643, "grad_norm": 0.4365276129838906, "learning_rate": 9.176492276654373e-06, "loss": 0.0491, "step": 1202 }, { "epoch": 1.0683836589698046, "grad_norm": 0.42841506319660755, "learning_rate": 9.17436020860674e-06, "loss": 0.053, "step": 1203 }, { "epoch": 1.0692717584369449, "grad_norm": 0.4419723943599086, "learning_rate": 9.172225632507674e-06, "loss": 0.0539, "step": 1204 }, { "epoch": 1.0701598579040852, "grad_norm": 0.5044189696920915, "learning_rate": 9.170088549639679e-06, "loss": 0.0582, "step": 1205 }, { "epoch": 1.0710479573712255, "grad_norm": 0.4251103284213423, "learning_rate": 9.167948961286764e-06, "loss": 0.0515, "step": 1206 }, { "epoch": 1.071936056838366, "grad_norm": 0.4946488504568838, "learning_rate": 9.165806868734444e-06, "loss": 0.062, "step": 1207 }, { "epoch": 1.0728241563055063, "grad_norm": 0.5234508822942274, "learning_rate": 9.163662273269733e-06, "loss": 0.0669, "step": 1208 }, { "epoch": 1.0737122557726466, "grad_norm": 0.44856465319675254, "learning_rate": 9.161515176181157e-06, "loss": 0.0568, "step": 1209 }, { "epoch": 1.074600355239787, "grad_norm": 0.5214790110595106, "learning_rate": 9.15936557875874e-06, "loss": 0.0607, "step": 1210 }, { "epoch": 1.0754884547069272, "grad_norm": 0.4276901711426315, "learning_rate": 9.157213482294009e-06, "loss": 0.0533, "step": 1211 }, { "epoch": 1.0763765541740675, "grad_norm": 0.4031156583282107, "learning_rate": 9.155058888079994e-06, "loss": 0.054, "step": 1212 }, { "epoch": 1.0772646536412078, "grad_norm": 0.4638513120983917, "learning_rate": 9.152901797411224e-06, "loss": 0.0593, "step": 1213 }, { "epoch": 1.0781527531083481, "grad_norm": 0.46551869913533556, "learning_rate": 9.150742211583728e-06, "loss": 0.0636, "step": 1214 }, { "epoch": 1.0790408525754884, "grad_norm": 0.4950805741312129, "learning_rate": 9.148580131895037e-06, "loss": 0.0593, "step": 1215 }, { "epoch": 1.0799289520426287, "grad_norm": 0.38004108965114014, "learning_rate": 9.146415559644176e-06, "loss": 0.052, "step": 1216 }, { "epoch": 1.080817051509769, "grad_norm": 0.44474473573525736, "learning_rate": 9.144248496131669e-06, "loss": 0.0636, "step": 1217 }, { "epoch": 1.0817051509769093, "grad_norm": 0.4312387573207775, "learning_rate": 9.14207894265954e-06, "loss": 0.0532, "step": 1218 }, { "epoch": 1.0825932504440496, "grad_norm": 0.3986077843447081, "learning_rate": 9.139906900531304e-06, "loss": 0.0623, "step": 1219 }, { "epoch": 1.0834813499111902, "grad_norm": 0.46785671093620884, "learning_rate": 9.137732371051978e-06, "loss": 0.0602, "step": 1220 }, { "epoch": 1.0843694493783305, "grad_norm": 0.6939585586944105, "learning_rate": 9.135555355528063e-06, "loss": 0.0531, "step": 1221 }, { "epoch": 1.0852575488454708, "grad_norm": 0.5159625584290726, "learning_rate": 9.133375855267566e-06, "loss": 0.0733, "step": 1222 }, { "epoch": 1.086145648312611, "grad_norm": 0.47928519418435783, "learning_rate": 9.131193871579975e-06, "loss": 0.0598, "step": 1223 }, { "epoch": 1.0870337477797514, "grad_norm": 0.7058879880927733, "learning_rate": 9.129009405776281e-06, "loss": 0.0567, "step": 1224 }, { "epoch": 1.0879218472468917, "grad_norm": 0.31967130011089134, "learning_rate": 9.126822459168958e-06, "loss": 0.0478, "step": 1225 }, { "epoch": 1.088809946714032, "grad_norm": 0.5247237546759348, "learning_rate": 9.124633033071974e-06, "loss": 0.0613, "step": 1226 }, { "epoch": 1.0896980461811723, "grad_norm": 0.43333651070647783, "learning_rate": 9.12244112880079e-06, "loss": 0.053, "step": 1227 }, { "epoch": 1.0905861456483126, "grad_norm": 0.6511917697060805, "learning_rate": 9.120246747672347e-06, "loss": 0.0712, "step": 1228 }, { "epoch": 1.0914742451154529, "grad_norm": 0.44421544969475796, "learning_rate": 9.118049891005083e-06, "loss": 0.0528, "step": 1229 }, { "epoch": 1.0923623445825932, "grad_norm": 0.5075003675357505, "learning_rate": 9.115850560118919e-06, "loss": 0.0621, "step": 1230 }, { "epoch": 1.0932504440497335, "grad_norm": 0.497956216214096, "learning_rate": 9.113648756335265e-06, "loss": 0.0594, "step": 1231 }, { "epoch": 1.0941385435168738, "grad_norm": 0.4603671054013783, "learning_rate": 9.111444480977011e-06, "loss": 0.0588, "step": 1232 }, { "epoch": 1.0950266429840143, "grad_norm": 0.4816947592901923, "learning_rate": 9.109237735368542e-06, "loss": 0.0687, "step": 1233 }, { "epoch": 1.0959147424511546, "grad_norm": 0.5551675769805149, "learning_rate": 9.107028520835717e-06, "loss": 0.0586, "step": 1234 }, { "epoch": 1.096802841918295, "grad_norm": 0.47960446136116963, "learning_rate": 9.104816838705883e-06, "loss": 0.0678, "step": 1235 }, { "epoch": 1.0976909413854352, "grad_norm": 0.42950340850108404, "learning_rate": 9.102602690307873e-06, "loss": 0.0509, "step": 1236 }, { "epoch": 1.0985790408525755, "grad_norm": 0.43620658315770067, "learning_rate": 9.100386076971995e-06, "loss": 0.062, "step": 1237 }, { "epoch": 1.0994671403197158, "grad_norm": 0.39365669512280954, "learning_rate": 9.098167000030041e-06, "loss": 0.0446, "step": 1238 }, { "epoch": 1.1003552397868561, "grad_norm": 0.5473374177006272, "learning_rate": 9.095945460815285e-06, "loss": 0.0591, "step": 1239 }, { "epoch": 1.1012433392539964, "grad_norm": 0.49635199912421746, "learning_rate": 9.09372146066248e-06, "loss": 0.0748, "step": 1240 }, { "epoch": 1.1021314387211367, "grad_norm": 0.4630743245346236, "learning_rate": 9.09149500090785e-06, "loss": 0.0705, "step": 1241 }, { "epoch": 1.103019538188277, "grad_norm": 0.5551642930392348, "learning_rate": 9.089266082889113e-06, "loss": 0.0641, "step": 1242 }, { "epoch": 1.1039076376554173, "grad_norm": 0.5741357075684433, "learning_rate": 9.087034707945444e-06, "loss": 0.0606, "step": 1243 }, { "epoch": 1.1047957371225576, "grad_norm": 0.5578487348310134, "learning_rate": 9.084800877417513e-06, "loss": 0.0605, "step": 1244 }, { "epoch": 1.105683836589698, "grad_norm": 0.5352395623472251, "learning_rate": 9.082564592647449e-06, "loss": 0.051, "step": 1245 }, { "epoch": 1.1065719360568385, "grad_norm": 0.6366168566998481, "learning_rate": 9.080325854978869e-06, "loss": 0.0703, "step": 1246 }, { "epoch": 1.1074600355239788, "grad_norm": 0.48856849823475945, "learning_rate": 9.078084665756856e-06, "loss": 0.0553, "step": 1247 }, { "epoch": 1.108348134991119, "grad_norm": 0.5363313823992678, "learning_rate": 9.075841026327967e-06, "loss": 0.0654, "step": 1248 }, { "epoch": 1.1092362344582594, "grad_norm": 0.6250870230475393, "learning_rate": 9.073594938040231e-06, "loss": 0.0719, "step": 1249 }, { "epoch": 1.1101243339253997, "grad_norm": 0.42805727224477097, "learning_rate": 9.071346402243155e-06, "loss": 0.06, "step": 1250 }, { "epoch": 1.11101243339254, "grad_norm": 0.564525499866512, "learning_rate": 9.069095420287705e-06, "loss": 0.0699, "step": 1251 }, { "epoch": 1.1119005328596803, "grad_norm": 0.42065248951773765, "learning_rate": 9.066841993526325e-06, "loss": 0.0596, "step": 1252 }, { "epoch": 1.1127886323268206, "grad_norm": 0.5328604744333013, "learning_rate": 9.064586123312926e-06, "loss": 0.0578, "step": 1253 }, { "epoch": 1.1136767317939609, "grad_norm": 0.35638540080650655, "learning_rate": 9.062327811002885e-06, "loss": 0.0544, "step": 1254 }, { "epoch": 1.1145648312611012, "grad_norm": 0.4377613323303497, "learning_rate": 9.060067057953049e-06, "loss": 0.0567, "step": 1255 }, { "epoch": 1.1154529307282415, "grad_norm": 0.5594310008177616, "learning_rate": 9.05780386552173e-06, "loss": 0.0705, "step": 1256 }, { "epoch": 1.116341030195382, "grad_norm": 0.5130264103556861, "learning_rate": 9.055538235068706e-06, "loss": 0.0437, "step": 1257 }, { "epoch": 1.1172291296625223, "grad_norm": 0.43041305807536706, "learning_rate": 9.05327016795522e-06, "loss": 0.0498, "step": 1258 }, { "epoch": 1.1181172291296626, "grad_norm": 0.45207274454326535, "learning_rate": 9.05099966554398e-06, "loss": 0.0462, "step": 1259 }, { "epoch": 1.119005328596803, "grad_norm": 0.41480525694898, "learning_rate": 9.048726729199153e-06, "loss": 0.0612, "step": 1260 }, { "epoch": 1.1198934280639432, "grad_norm": 0.36469040454064994, "learning_rate": 9.046451360286372e-06, "loss": 0.0455, "step": 1261 }, { "epoch": 1.1207815275310835, "grad_norm": 0.6199125426861651, "learning_rate": 9.044173560172734e-06, "loss": 0.0775, "step": 1262 }, { "epoch": 1.1216696269982238, "grad_norm": 0.5056571834693232, "learning_rate": 9.04189333022679e-06, "loss": 0.0571, "step": 1263 }, { "epoch": 1.1225577264653641, "grad_norm": 0.4087481618230986, "learning_rate": 9.039610671818557e-06, "loss": 0.0568, "step": 1264 }, { "epoch": 1.1234458259325044, "grad_norm": 0.3487200395719137, "learning_rate": 9.037325586319507e-06, "loss": 0.0527, "step": 1265 }, { "epoch": 1.1243339253996447, "grad_norm": 0.3772716895620028, "learning_rate": 9.03503807510257e-06, "loss": 0.0535, "step": 1266 }, { "epoch": 1.125222024866785, "grad_norm": 0.42781524365718604, "learning_rate": 9.032748139542143e-06, "loss": 0.0497, "step": 1267 }, { "epoch": 1.1261101243339253, "grad_norm": 0.40021805208371675, "learning_rate": 9.030455781014062e-06, "loss": 0.0566, "step": 1268 }, { "epoch": 1.1269982238010656, "grad_norm": 0.42462869175507223, "learning_rate": 9.028161000895633e-06, "loss": 0.0554, "step": 1269 }, { "epoch": 1.1278863232682061, "grad_norm": 0.7907863161810198, "learning_rate": 9.025863800565614e-06, "loss": 0.0632, "step": 1270 }, { "epoch": 1.1287744227353464, "grad_norm": 0.5600540481669183, "learning_rate": 9.023564181404214e-06, "loss": 0.0683, "step": 1271 }, { "epoch": 1.1296625222024868, "grad_norm": 0.46520643754289215, "learning_rate": 9.021262144793097e-06, "loss": 0.0556, "step": 1272 }, { "epoch": 1.130550621669627, "grad_norm": 0.3557023137760619, "learning_rate": 9.01895769211538e-06, "loss": 0.0433, "step": 1273 }, { "epoch": 1.1314387211367674, "grad_norm": 0.44882962300109963, "learning_rate": 9.016650824755631e-06, "loss": 0.0509, "step": 1274 }, { "epoch": 1.1323268206039077, "grad_norm": 0.4700799341239974, "learning_rate": 9.014341544099867e-06, "loss": 0.0548, "step": 1275 }, { "epoch": 1.133214920071048, "grad_norm": 0.6097884107024176, "learning_rate": 9.01202985153556e-06, "loss": 0.0604, "step": 1276 }, { "epoch": 1.1341030195381883, "grad_norm": 0.5302403802978701, "learning_rate": 9.009715748451625e-06, "loss": 0.0453, "step": 1277 }, { "epoch": 1.1349911190053286, "grad_norm": 0.4405789861583618, "learning_rate": 9.007399236238431e-06, "loss": 0.0526, "step": 1278 }, { "epoch": 1.1358792184724689, "grad_norm": 0.4426044799205729, "learning_rate": 9.00508031628779e-06, "loss": 0.0485, "step": 1279 }, { "epoch": 1.1367673179396092, "grad_norm": 0.41768050019952824, "learning_rate": 9.00275898999296e-06, "loss": 0.0467, "step": 1280 }, { "epoch": 1.1376554174067495, "grad_norm": 0.5378318629821901, "learning_rate": 9.000435258748654e-06, "loss": 0.0644, "step": 1281 }, { "epoch": 1.1385435168738898, "grad_norm": 0.40617647077937363, "learning_rate": 8.998109123951018e-06, "loss": 0.0518, "step": 1282 }, { "epoch": 1.1394316163410303, "grad_norm": 0.4721038219531133, "learning_rate": 8.995780586997647e-06, "loss": 0.0666, "step": 1283 }, { "epoch": 1.1403197158081706, "grad_norm": 0.5340266166884385, "learning_rate": 8.99344964928758e-06, "loss": 0.0523, "step": 1284 }, { "epoch": 1.141207815275311, "grad_norm": 0.3706596577324069, "learning_rate": 8.9911163122213e-06, "loss": 0.0502, "step": 1285 }, { "epoch": 1.1420959147424512, "grad_norm": 0.5716931124223276, "learning_rate": 8.988780577200725e-06, "loss": 0.0723, "step": 1286 }, { "epoch": 1.1429840142095915, "grad_norm": 0.4432282087167379, "learning_rate": 8.986442445629223e-06, "loss": 0.0564, "step": 1287 }, { "epoch": 1.1438721136767318, "grad_norm": 0.5446664959084349, "learning_rate": 8.984101918911596e-06, "loss": 0.0709, "step": 1288 }, { "epoch": 1.144760213143872, "grad_norm": 0.4609866760289082, "learning_rate": 8.981758998454084e-06, "loss": 0.068, "step": 1289 }, { "epoch": 1.1456483126110124, "grad_norm": 0.45249331321924663, "learning_rate": 8.979413685664368e-06, "loss": 0.0564, "step": 1290 }, { "epoch": 1.1465364120781527, "grad_norm": 0.3810603314693275, "learning_rate": 8.977065981951567e-06, "loss": 0.0476, "step": 1291 }, { "epoch": 1.147424511545293, "grad_norm": 0.44008903615910266, "learning_rate": 8.974715888726237e-06, "loss": 0.0671, "step": 1292 }, { "epoch": 1.1483126110124333, "grad_norm": 0.4057835303831334, "learning_rate": 8.972363407400363e-06, "loss": 0.0526, "step": 1293 }, { "epoch": 1.1492007104795736, "grad_norm": 0.42891192395424976, "learning_rate": 8.970008539387373e-06, "loss": 0.0502, "step": 1294 }, { "epoch": 1.150088809946714, "grad_norm": 0.48816395571984783, "learning_rate": 8.967651286102125e-06, "loss": 0.0717, "step": 1295 }, { "epoch": 1.1509769094138544, "grad_norm": 0.40035060600918576, "learning_rate": 8.965291648960914e-06, "loss": 0.0623, "step": 1296 }, { "epoch": 1.1518650088809947, "grad_norm": 0.3497708657532295, "learning_rate": 8.962929629381458e-06, "loss": 0.0395, "step": 1297 }, { "epoch": 1.152753108348135, "grad_norm": 0.6099565604093532, "learning_rate": 8.960565228782918e-06, "loss": 0.059, "step": 1298 }, { "epoch": 1.1536412078152753, "grad_norm": 0.38396000043396333, "learning_rate": 8.958198448585877e-06, "loss": 0.0494, "step": 1299 }, { "epoch": 1.1545293072824157, "grad_norm": 0.3689900063182137, "learning_rate": 8.955829290212352e-06, "loss": 0.0459, "step": 1300 }, { "epoch": 1.155417406749556, "grad_norm": 1.096067697838026, "learning_rate": 8.953457755085788e-06, "loss": 0.0606, "step": 1301 }, { "epoch": 1.1563055062166963, "grad_norm": 0.42149161110941696, "learning_rate": 8.951083844631056e-06, "loss": 0.0515, "step": 1302 }, { "epoch": 1.1571936056838366, "grad_norm": 0.4316581327135203, "learning_rate": 8.948707560274458e-06, "loss": 0.061, "step": 1303 }, { "epoch": 1.1580817051509769, "grad_norm": 0.6841774110734303, "learning_rate": 8.946328903443721e-06, "loss": 0.0646, "step": 1304 }, { "epoch": 1.1589698046181172, "grad_norm": 0.5965702797799218, "learning_rate": 8.943947875567993e-06, "loss": 0.0555, "step": 1305 }, { "epoch": 1.1598579040852575, "grad_norm": 1.0044954129750756, "learning_rate": 8.941564478077853e-06, "loss": 0.0632, "step": 1306 }, { "epoch": 1.160746003552398, "grad_norm": 0.6131401211707806, "learning_rate": 8.9391787124053e-06, "loss": 0.0677, "step": 1307 }, { "epoch": 1.161634103019538, "grad_norm": 0.48062753599742797, "learning_rate": 8.936790579983759e-06, "loss": 0.0529, "step": 1308 }, { "epoch": 1.1625222024866786, "grad_norm": 0.5714788031695348, "learning_rate": 8.93440008224807e-06, "loss": 0.054, "step": 1309 }, { "epoch": 1.163410301953819, "grad_norm": 0.5084421088016741, "learning_rate": 8.932007220634504e-06, "loss": 0.0657, "step": 1310 }, { "epoch": 1.1642984014209592, "grad_norm": 0.43120545860433124, "learning_rate": 8.929611996580743e-06, "loss": 0.0649, "step": 1311 }, { "epoch": 1.1651865008880995, "grad_norm": 0.6763349385926654, "learning_rate": 8.927214411525895e-06, "loss": 0.0591, "step": 1312 }, { "epoch": 1.1660746003552398, "grad_norm": 0.46637995940230653, "learning_rate": 8.924814466910483e-06, "loss": 0.0536, "step": 1313 }, { "epoch": 1.16696269982238, "grad_norm": 0.7188377730906739, "learning_rate": 8.922412164176451e-06, "loss": 0.062, "step": 1314 }, { "epoch": 1.1678507992895204, "grad_norm": 0.6445433237920782, "learning_rate": 8.920007504767154e-06, "loss": 0.0638, "step": 1315 }, { "epoch": 1.1687388987566607, "grad_norm": 0.6179906421694086, "learning_rate": 8.917600490127369e-06, "loss": 0.0652, "step": 1316 }, { "epoch": 1.169626998223801, "grad_norm": 0.46166867855276844, "learning_rate": 8.915191121703286e-06, "loss": 0.0619, "step": 1317 }, { "epoch": 1.1705150976909413, "grad_norm": 0.6636117139219818, "learning_rate": 8.912779400942508e-06, "loss": 0.0751, "step": 1318 }, { "epoch": 1.1714031971580816, "grad_norm": 0.5545918844554153, "learning_rate": 8.910365329294053e-06, "loss": 0.0522, "step": 1319 }, { "epoch": 1.1722912966252221, "grad_norm": 0.5529232213705042, "learning_rate": 8.907948908208348e-06, "loss": 0.0542, "step": 1320 }, { "epoch": 1.1731793960923624, "grad_norm": 0.5572245790921896, "learning_rate": 8.905530139137237e-06, "loss": 0.0563, "step": 1321 }, { "epoch": 1.1740674955595027, "grad_norm": 0.5234606984843095, "learning_rate": 8.903109023533973e-06, "loss": 0.0504, "step": 1322 }, { "epoch": 1.174955595026643, "grad_norm": 0.5065711780095631, "learning_rate": 8.900685562853214e-06, "loss": 0.056, "step": 1323 }, { "epoch": 1.1758436944937833, "grad_norm": 0.5187604950510653, "learning_rate": 8.898259758551034e-06, "loss": 0.0613, "step": 1324 }, { "epoch": 1.1767317939609236, "grad_norm": 0.5327828015491809, "learning_rate": 8.895831612084909e-06, "loss": 0.0678, "step": 1325 }, { "epoch": 1.177619893428064, "grad_norm": 0.39890964975590437, "learning_rate": 8.893401124913727e-06, "loss": 0.0495, "step": 1326 }, { "epoch": 1.1785079928952042, "grad_norm": 0.5199797120399963, "learning_rate": 8.890968298497783e-06, "loss": 0.0657, "step": 1327 }, { "epoch": 1.1793960923623446, "grad_norm": 0.5754922883095701, "learning_rate": 8.88853313429877e-06, "loss": 0.063, "step": 1328 }, { "epoch": 1.1802841918294849, "grad_norm": 0.4571192744782011, "learning_rate": 8.886095633779791e-06, "loss": 0.0461, "step": 1329 }, { "epoch": 1.1811722912966252, "grad_norm": 0.4329356837295055, "learning_rate": 8.883655798405358e-06, "loss": 0.0464, "step": 1330 }, { "epoch": 1.1820603907637655, "grad_norm": 0.44880379408517723, "learning_rate": 8.881213629641375e-06, "loss": 0.0498, "step": 1331 }, { "epoch": 1.1829484902309058, "grad_norm": 0.6826846866890449, "learning_rate": 8.878769128955153e-06, "loss": 0.0674, "step": 1332 }, { "epoch": 1.1838365896980463, "grad_norm": 0.47443992499617, "learning_rate": 8.876322297815406e-06, "loss": 0.0581, "step": 1333 }, { "epoch": 1.1847246891651866, "grad_norm": 0.4539546989361826, "learning_rate": 8.873873137692245e-06, "loss": 0.0563, "step": 1334 }, { "epoch": 1.1856127886323269, "grad_norm": 0.5998094035285374, "learning_rate": 8.871421650057184e-06, "loss": 0.0517, "step": 1335 }, { "epoch": 1.1865008880994672, "grad_norm": 0.49449962599266933, "learning_rate": 8.868967836383128e-06, "loss": 0.0562, "step": 1336 }, { "epoch": 1.1873889875666075, "grad_norm": 0.5202566548733357, "learning_rate": 8.866511698144392e-06, "loss": 0.0589, "step": 1337 }, { "epoch": 1.1882770870337478, "grad_norm": 0.4525532415700337, "learning_rate": 8.864053236816675e-06, "loss": 0.0513, "step": 1338 }, { "epoch": 1.189165186500888, "grad_norm": 0.49480944203176547, "learning_rate": 8.861592453877077e-06, "loss": 0.0522, "step": 1339 }, { "epoch": 1.1900532859680284, "grad_norm": 0.519556670613474, "learning_rate": 8.859129350804095e-06, "loss": 0.0628, "step": 1340 }, { "epoch": 1.1909413854351687, "grad_norm": 0.4581631626495746, "learning_rate": 8.856663929077615e-06, "loss": 0.0597, "step": 1341 }, { "epoch": 1.191829484902309, "grad_norm": 0.581342185965847, "learning_rate": 8.854196190178922e-06, "loss": 0.0512, "step": 1342 }, { "epoch": 1.1927175843694493, "grad_norm": 0.45221328804309713, "learning_rate": 8.85172613559069e-06, "loss": 0.0581, "step": 1343 }, { "epoch": 1.1936056838365896, "grad_norm": 0.45561203604976397, "learning_rate": 8.849253766796982e-06, "loss": 0.0539, "step": 1344 }, { "epoch": 1.19449378330373, "grad_norm": 0.3953246995448872, "learning_rate": 8.846779085283255e-06, "loss": 0.0554, "step": 1345 }, { "epoch": 1.1953818827708704, "grad_norm": 0.5479191632194884, "learning_rate": 8.844302092536357e-06, "loss": 0.0632, "step": 1346 }, { "epoch": 1.1962699822380107, "grad_norm": 0.5165362440829403, "learning_rate": 8.841822790044519e-06, "loss": 0.0738, "step": 1347 }, { "epoch": 1.197158081705151, "grad_norm": 0.45246299380103777, "learning_rate": 8.839341179297363e-06, "loss": 0.0509, "step": 1348 }, { "epoch": 1.1980461811722913, "grad_norm": 0.4520530861715212, "learning_rate": 8.8368572617859e-06, "loss": 0.0604, "step": 1349 }, { "epoch": 1.1989342806394316, "grad_norm": 0.4315426009039334, "learning_rate": 8.834371039002523e-06, "loss": 0.0583, "step": 1350 }, { "epoch": 1.199822380106572, "grad_norm": 0.58482645144901, "learning_rate": 8.83188251244101e-06, "loss": 0.061, "step": 1351 }, { "epoch": 1.2007104795737122, "grad_norm": 0.5362274737959619, "learning_rate": 8.829391683596528e-06, "loss": 0.0635, "step": 1352 }, { "epoch": 1.2015985790408525, "grad_norm": 0.3838263244907273, "learning_rate": 8.826898553965624e-06, "loss": 0.0539, "step": 1353 }, { "epoch": 1.2024866785079928, "grad_norm": 0.6316574808669704, "learning_rate": 8.824403125046225e-06, "loss": 0.0652, "step": 1354 }, { "epoch": 1.2033747779751331, "grad_norm": 0.43602766988187674, "learning_rate": 8.821905398337645e-06, "loss": 0.0528, "step": 1355 }, { "epoch": 1.2042628774422734, "grad_norm": 0.4289281757587253, "learning_rate": 8.819405375340573e-06, "loss": 0.0547, "step": 1356 }, { "epoch": 1.205150976909414, "grad_norm": 0.5202563473870233, "learning_rate": 8.81690305755708e-06, "loss": 0.0545, "step": 1357 }, { "epoch": 1.206039076376554, "grad_norm": 0.6122205843521804, "learning_rate": 8.814398446490619e-06, "loss": 0.0736, "step": 1358 }, { "epoch": 1.2069271758436946, "grad_norm": 0.4929556651394607, "learning_rate": 8.811891543646015e-06, "loss": 0.0549, "step": 1359 }, { "epoch": 1.2078152753108349, "grad_norm": 0.39153224898843647, "learning_rate": 8.809382350529474e-06, "loss": 0.0538, "step": 1360 }, { "epoch": 1.2087033747779752, "grad_norm": 0.5587191889924773, "learning_rate": 8.806870868648576e-06, "loss": 0.0791, "step": 1361 }, { "epoch": 1.2095914742451155, "grad_norm": 0.506286043805143, "learning_rate": 8.804357099512279e-06, "loss": 0.0526, "step": 1362 }, { "epoch": 1.2104795737122558, "grad_norm": 0.4383105018704721, "learning_rate": 8.80184104463091e-06, "loss": 0.0578, "step": 1363 }, { "epoch": 1.211367673179396, "grad_norm": 0.5667322270754526, "learning_rate": 8.799322705516174e-06, "loss": 0.0737, "step": 1364 }, { "epoch": 1.2122557726465364, "grad_norm": 0.5519873093279143, "learning_rate": 8.796802083681149e-06, "loss": 0.0542, "step": 1365 }, { "epoch": 1.2131438721136767, "grad_norm": 0.40449215131526517, "learning_rate": 8.79427918064028e-06, "loss": 0.0579, "step": 1366 }, { "epoch": 1.214031971580817, "grad_norm": 0.35529012811045607, "learning_rate": 8.791753997909384e-06, "loss": 0.0558, "step": 1367 }, { "epoch": 1.2149200710479573, "grad_norm": 0.47815233822568254, "learning_rate": 8.789226537005651e-06, "loss": 0.051, "step": 1368 }, { "epoch": 1.2158081705150976, "grad_norm": 0.4312255381241265, "learning_rate": 8.786696799447638e-06, "loss": 0.0486, "step": 1369 }, { "epoch": 1.2166962699822381, "grad_norm": 0.42906496410040296, "learning_rate": 8.784164786755268e-06, "loss": 0.0549, "step": 1370 }, { "epoch": 1.2175843694493784, "grad_norm": 0.5772074687235432, "learning_rate": 8.781630500449833e-06, "loss": 0.0528, "step": 1371 }, { "epoch": 1.2184724689165187, "grad_norm": 0.4262348332812471, "learning_rate": 8.779093942053989e-06, "loss": 0.0625, "step": 1372 }, { "epoch": 1.219360568383659, "grad_norm": 0.47326752992090304, "learning_rate": 8.77655511309176e-06, "loss": 0.0477, "step": 1373 }, { "epoch": 1.2202486678507993, "grad_norm": 0.6018578470735128, "learning_rate": 8.774014015088533e-06, "loss": 0.0688, "step": 1374 }, { "epoch": 1.2211367673179396, "grad_norm": 0.6063939648321814, "learning_rate": 8.771470649571056e-06, "loss": 0.0563, "step": 1375 }, { "epoch": 1.22202486678508, "grad_norm": 0.5629535965936708, "learning_rate": 8.768925018067445e-06, "loss": 0.051, "step": 1376 }, { "epoch": 1.2229129662522202, "grad_norm": 0.47636128599043764, "learning_rate": 8.76637712210717e-06, "loss": 0.0521, "step": 1377 }, { "epoch": 1.2238010657193605, "grad_norm": 0.5940845743589005, "learning_rate": 8.763826963221067e-06, "loss": 0.0544, "step": 1378 }, { "epoch": 1.2246891651865008, "grad_norm": 0.6114080307110875, "learning_rate": 8.761274542941329e-06, "loss": 0.0538, "step": 1379 }, { "epoch": 1.2255772646536411, "grad_norm": 0.5421878884736326, "learning_rate": 8.75871986280151e-06, "loss": 0.0597, "step": 1380 }, { "epoch": 1.2264653641207814, "grad_norm": 0.42808699209934453, "learning_rate": 8.756162924336522e-06, "loss": 0.0436, "step": 1381 }, { "epoch": 1.2273534635879217, "grad_norm": 0.5262320114348207, "learning_rate": 8.753603729082629e-06, "loss": 0.0598, "step": 1382 }, { "epoch": 1.2282415630550623, "grad_norm": 0.4774662835691713, "learning_rate": 8.751042278577455e-06, "loss": 0.0545, "step": 1383 }, { "epoch": 1.2291296625222026, "grad_norm": 0.45697823548933714, "learning_rate": 8.74847857435998e-06, "loss": 0.0577, "step": 1384 }, { "epoch": 1.2300177619893429, "grad_norm": 0.37780797322946186, "learning_rate": 8.745912617970534e-06, "loss": 0.043, "step": 1385 }, { "epoch": 1.2309058614564832, "grad_norm": 0.49860752947976433, "learning_rate": 8.743344410950804e-06, "loss": 0.0569, "step": 1386 }, { "epoch": 1.2317939609236235, "grad_norm": 0.4259825457515346, "learning_rate": 8.740773954843828e-06, "loss": 0.0541, "step": 1387 }, { "epoch": 1.2326820603907638, "grad_norm": 0.4399101307272299, "learning_rate": 8.738201251193993e-06, "loss": 0.055, "step": 1388 }, { "epoch": 1.233570159857904, "grad_norm": 0.4591677679252911, "learning_rate": 8.735626301547042e-06, "loss": 0.0558, "step": 1389 }, { "epoch": 1.2344582593250444, "grad_norm": 0.5111308336974323, "learning_rate": 8.733049107450063e-06, "loss": 0.0498, "step": 1390 }, { "epoch": 1.2353463587921847, "grad_norm": 0.5975734851527944, "learning_rate": 8.73046967045149e-06, "loss": 0.0576, "step": 1391 }, { "epoch": 1.236234458259325, "grad_norm": 0.45342634735673404, "learning_rate": 8.727887992101108e-06, "loss": 0.0489, "step": 1392 }, { "epoch": 1.2371225577264653, "grad_norm": 0.35032485349199205, "learning_rate": 8.725304073950054e-06, "loss": 0.0451, "step": 1393 }, { "epoch": 1.2380106571936056, "grad_norm": 0.4010843131050206, "learning_rate": 8.7227179175508e-06, "loss": 0.0466, "step": 1394 }, { "epoch": 1.238898756660746, "grad_norm": 0.4213918581386476, "learning_rate": 8.72012952445717e-06, "loss": 0.0541, "step": 1395 }, { "epoch": 1.2397868561278864, "grad_norm": 0.4706730549027241, "learning_rate": 8.717538896224333e-06, "loss": 0.0552, "step": 1396 }, { "epoch": 1.2406749555950267, "grad_norm": 0.35322390853742186, "learning_rate": 8.714946034408793e-06, "loss": 0.0486, "step": 1397 }, { "epoch": 1.241563055062167, "grad_norm": 0.415940827305024, "learning_rate": 8.712350940568403e-06, "loss": 0.0622, "step": 1398 }, { "epoch": 1.2424511545293073, "grad_norm": 0.36378994202820725, "learning_rate": 8.709753616262355e-06, "loss": 0.0648, "step": 1399 }, { "epoch": 1.2433392539964476, "grad_norm": 0.4597903087653849, "learning_rate": 8.70715406305118e-06, "loss": 0.0516, "step": 1400 }, { "epoch": 1.244227353463588, "grad_norm": 0.41001654230599194, "learning_rate": 8.704552282496752e-06, "loss": 0.0402, "step": 1401 }, { "epoch": 1.2451154529307282, "grad_norm": 0.43791431676495707, "learning_rate": 8.701948276162277e-06, "loss": 0.0523, "step": 1402 }, { "epoch": 1.2460035523978685, "grad_norm": 0.4875749596099277, "learning_rate": 8.699342045612304e-06, "loss": 0.0665, "step": 1403 }, { "epoch": 1.2468916518650088, "grad_norm": 0.5643094148020755, "learning_rate": 8.696733592412717e-06, "loss": 0.0532, "step": 1404 }, { "epoch": 1.2477797513321491, "grad_norm": 0.41507047223785865, "learning_rate": 8.694122918130732e-06, "loss": 0.0465, "step": 1405 }, { "epoch": 1.2486678507992894, "grad_norm": 0.41302290601101155, "learning_rate": 8.691510024334903e-06, "loss": 0.0482, "step": 1406 }, { "epoch": 1.24955595026643, "grad_norm": 0.5133229562261602, "learning_rate": 8.688894912595116e-06, "loss": 0.0618, "step": 1407 }, { "epoch": 1.25044404973357, "grad_norm": 0.3971774806483017, "learning_rate": 8.686277584482592e-06, "loss": 0.0417, "step": 1408 }, { "epoch": 1.2513321492007106, "grad_norm": 0.40694314461486464, "learning_rate": 8.68365804156988e-06, "loss": 0.0476, "step": 1409 }, { "epoch": 1.2522202486678509, "grad_norm": 0.5396595665280396, "learning_rate": 8.681036285430864e-06, "loss": 0.0603, "step": 1410 }, { "epoch": 1.2531083481349912, "grad_norm": 0.4952521043170604, "learning_rate": 8.678412317640753e-06, "loss": 0.055, "step": 1411 }, { "epoch": 1.2539964476021315, "grad_norm": 0.40182177205070296, "learning_rate": 8.675786139776085e-06, "loss": 0.0617, "step": 1412 }, { "epoch": 1.2548845470692718, "grad_norm": 0.43728542560992206, "learning_rate": 8.673157753414733e-06, "loss": 0.052, "step": 1413 }, { "epoch": 1.255772646536412, "grad_norm": 0.39273900995788635, "learning_rate": 8.670527160135888e-06, "loss": 0.0514, "step": 1414 }, { "epoch": 1.2566607460035524, "grad_norm": 0.42385266693033374, "learning_rate": 8.667894361520073e-06, "loss": 0.0621, "step": 1415 }, { "epoch": 1.2575488454706927, "grad_norm": 0.7726227937125522, "learning_rate": 8.665259359149132e-06, "loss": 0.0527, "step": 1416 }, { "epoch": 1.258436944937833, "grad_norm": 0.4025470687207859, "learning_rate": 8.662622154606238e-06, "loss": 0.0566, "step": 1417 }, { "epoch": 1.2593250444049733, "grad_norm": 0.37050503239021354, "learning_rate": 8.65998274947588e-06, "loss": 0.0549, "step": 1418 }, { "epoch": 1.2602131438721136, "grad_norm": 0.4780706131062481, "learning_rate": 8.657341145343876e-06, "loss": 0.056, "step": 1419 }, { "epoch": 1.261101243339254, "grad_norm": 0.6251755282173199, "learning_rate": 8.65469734379736e-06, "loss": 0.0709, "step": 1420 }, { "epoch": 1.2619893428063942, "grad_norm": 0.46926370918419985, "learning_rate": 8.652051346424792e-06, "loss": 0.0545, "step": 1421 }, { "epoch": 1.2628774422735347, "grad_norm": 0.6219857665197569, "learning_rate": 8.649403154815946e-06, "loss": 0.0632, "step": 1422 }, { "epoch": 1.263765541740675, "grad_norm": 0.3997124347103753, "learning_rate": 8.646752770561917e-06, "loss": 0.046, "step": 1423 }, { "epoch": 1.2646536412078153, "grad_norm": 0.4623517522205548, "learning_rate": 8.644100195255114e-06, "loss": 0.0544, "step": 1424 }, { "epoch": 1.2655417406749556, "grad_norm": 0.4577135539831789, "learning_rate": 8.64144543048927e-06, "loss": 0.0505, "step": 1425 }, { "epoch": 1.266429840142096, "grad_norm": 0.6400434262376272, "learning_rate": 8.638788477859424e-06, "loss": 0.0646, "step": 1426 }, { "epoch": 1.2673179396092362, "grad_norm": 0.3860193622343788, "learning_rate": 8.636129338961936e-06, "loss": 0.0426, "step": 1427 }, { "epoch": 1.2682060390763765, "grad_norm": 0.4060254048456331, "learning_rate": 8.633468015394479e-06, "loss": 0.048, "step": 1428 }, { "epoch": 1.2690941385435168, "grad_norm": 0.49627951706197526, "learning_rate": 8.630804508756035e-06, "loss": 0.0501, "step": 1429 }, { "epoch": 1.2699822380106571, "grad_norm": 0.5434086582577016, "learning_rate": 8.6281388206469e-06, "loss": 0.0463, "step": 1430 }, { "epoch": 1.2708703374777977, "grad_norm": 0.6291277946566355, "learning_rate": 8.625470952668685e-06, "loss": 0.0665, "step": 1431 }, { "epoch": 1.2717584369449377, "grad_norm": 0.4254592538836703, "learning_rate": 8.6228009064243e-06, "loss": 0.0467, "step": 1432 }, { "epoch": 1.2726465364120783, "grad_norm": 0.5303595485506196, "learning_rate": 8.620128683517972e-06, "loss": 0.0672, "step": 1433 }, { "epoch": 1.2735346358792183, "grad_norm": 0.7894791528828095, "learning_rate": 8.617454285555238e-06, "loss": 0.0761, "step": 1434 }, { "epoch": 1.2744227353463589, "grad_norm": 0.49384509382527925, "learning_rate": 8.614777714142932e-06, "loss": 0.0545, "step": 1435 }, { "epoch": 1.2753108348134992, "grad_norm": 0.5218293268530874, "learning_rate": 8.612098970889203e-06, "loss": 0.0576, "step": 1436 }, { "epoch": 1.2761989342806395, "grad_norm": 0.5905674492999149, "learning_rate": 8.609418057403501e-06, "loss": 0.0647, "step": 1437 }, { "epoch": 1.2770870337477798, "grad_norm": 0.4595219577988148, "learning_rate": 8.606734975296578e-06, "loss": 0.0523, "step": 1438 }, { "epoch": 1.27797513321492, "grad_norm": 0.505100325766144, "learning_rate": 8.604049726180494e-06, "loss": 0.0593, "step": 1439 }, { "epoch": 1.2788632326820604, "grad_norm": 0.3912563276111262, "learning_rate": 8.601362311668609e-06, "loss": 0.0604, "step": 1440 }, { "epoch": 1.2797513321492007, "grad_norm": 0.4489114067665813, "learning_rate": 8.598672733375579e-06, "loss": 0.0599, "step": 1441 }, { "epoch": 1.280639431616341, "grad_norm": 0.47885917161477914, "learning_rate": 8.59598099291737e-06, "loss": 0.0589, "step": 1442 }, { "epoch": 1.2815275310834813, "grad_norm": 0.5199443246043078, "learning_rate": 8.593287091911236e-06, "loss": 0.062, "step": 1443 }, { "epoch": 1.2824156305506218, "grad_norm": 0.49746222098434373, "learning_rate": 8.590591031975738e-06, "loss": 0.0582, "step": 1444 }, { "epoch": 1.2833037300177619, "grad_norm": 0.35960418251256066, "learning_rate": 8.58789281473073e-06, "loss": 0.0444, "step": 1445 }, { "epoch": 1.2841918294849024, "grad_norm": 0.48349444187022905, "learning_rate": 8.585192441797362e-06, "loss": 0.0536, "step": 1446 }, { "epoch": 1.2850799289520427, "grad_norm": 0.5204695091756877, "learning_rate": 8.582489914798083e-06, "loss": 0.0483, "step": 1447 }, { "epoch": 1.285968028419183, "grad_norm": 0.4179875305235931, "learning_rate": 8.579785235356628e-06, "loss": 0.049, "step": 1448 }, { "epoch": 1.2868561278863233, "grad_norm": 0.4445812605614399, "learning_rate": 8.577078405098036e-06, "loss": 0.0584, "step": 1449 }, { "epoch": 1.2877442273534636, "grad_norm": 0.65983909043929, "learning_rate": 8.574369425648629e-06, "loss": 0.0534, "step": 1450 }, { "epoch": 1.288632326820604, "grad_norm": 0.5371606702884453, "learning_rate": 8.571658298636026e-06, "loss": 0.0643, "step": 1451 }, { "epoch": 1.2895204262877442, "grad_norm": 0.47934231287327855, "learning_rate": 8.568945025689132e-06, "loss": 0.0597, "step": 1452 }, { "epoch": 1.2904085257548845, "grad_norm": 0.49682055572492245, "learning_rate": 8.566229608438146e-06, "loss": 0.0569, "step": 1453 }, { "epoch": 1.2912966252220248, "grad_norm": 0.45968426831417825, "learning_rate": 8.563512048514552e-06, "loss": 0.0581, "step": 1454 }, { "epoch": 1.2921847246891651, "grad_norm": 0.42824351622073825, "learning_rate": 8.560792347551124e-06, "loss": 0.06, "step": 1455 }, { "epoch": 1.2930728241563054, "grad_norm": 0.6305542843938704, "learning_rate": 8.558070507181919e-06, "loss": 0.0646, "step": 1456 }, { "epoch": 1.293960923623446, "grad_norm": 0.3462779083068886, "learning_rate": 8.555346529042281e-06, "loss": 0.0426, "step": 1457 }, { "epoch": 1.294849023090586, "grad_norm": 0.5127804931629231, "learning_rate": 8.552620414768839e-06, "loss": 0.0461, "step": 1458 }, { "epoch": 1.2957371225577266, "grad_norm": 0.46916166862406744, "learning_rate": 8.549892165999505e-06, "loss": 0.0537, "step": 1459 }, { "epoch": 1.2966252220248669, "grad_norm": 0.40942997066412934, "learning_rate": 8.547161784373475e-06, "loss": 0.0483, "step": 1460 }, { "epoch": 1.2975133214920072, "grad_norm": 0.6235498343590126, "learning_rate": 8.544429271531224e-06, "loss": 0.0631, "step": 1461 }, { "epoch": 1.2984014209591475, "grad_norm": 0.5393186646359277, "learning_rate": 8.541694629114506e-06, "loss": 0.0523, "step": 1462 }, { "epoch": 1.2992895204262878, "grad_norm": 0.38281650665148176, "learning_rate": 8.53895785876636e-06, "loss": 0.0438, "step": 1463 }, { "epoch": 1.300177619893428, "grad_norm": 0.4467187838121198, "learning_rate": 8.536218962131102e-06, "loss": 0.0532, "step": 1464 }, { "epoch": 1.3010657193605684, "grad_norm": 0.3937399574367662, "learning_rate": 8.533477940854317e-06, "loss": 0.0506, "step": 1465 }, { "epoch": 1.3019538188277087, "grad_norm": 0.4698156992764309, "learning_rate": 8.53073479658288e-06, "loss": 0.0488, "step": 1466 }, { "epoch": 1.302841918294849, "grad_norm": 0.43459002711500133, "learning_rate": 8.527989530964934e-06, "loss": 0.0533, "step": 1467 }, { "epoch": 1.3037300177619893, "grad_norm": 0.36028233828067874, "learning_rate": 8.525242145649895e-06, "loss": 0.0575, "step": 1468 }, { "epoch": 1.3046181172291296, "grad_norm": 0.6517374756236427, "learning_rate": 8.522492642288458e-06, "loss": 0.072, "step": 1469 }, { "epoch": 1.30550621669627, "grad_norm": 0.548678187698675, "learning_rate": 8.519741022532585e-06, "loss": 0.0628, "step": 1470 }, { "epoch": 1.3063943161634102, "grad_norm": 0.4037151468281861, "learning_rate": 8.516987288035515e-06, "loss": 0.0509, "step": 1471 }, { "epoch": 1.3072824156305507, "grad_norm": 0.44596859794092103, "learning_rate": 8.514231440451753e-06, "loss": 0.0438, "step": 1472 }, { "epoch": 1.308170515097691, "grad_norm": 0.5465102266648206, "learning_rate": 8.511473481437079e-06, "loss": 0.0481, "step": 1473 }, { "epoch": 1.3090586145648313, "grad_norm": 0.480704475035662, "learning_rate": 8.50871341264853e-06, "loss": 0.0451, "step": 1474 }, { "epoch": 1.3099467140319716, "grad_norm": 0.36863949736884805, "learning_rate": 8.505951235744427e-06, "loss": 0.0466, "step": 1475 }, { "epoch": 1.310834813499112, "grad_norm": 0.6238227028327276, "learning_rate": 8.503186952384346e-06, "loss": 0.0646, "step": 1476 }, { "epoch": 1.3117229129662522, "grad_norm": 0.8111377642642721, "learning_rate": 8.500420564229133e-06, "loss": 0.0526, "step": 1477 }, { "epoch": 1.3126110124333925, "grad_norm": 0.5640201873462473, "learning_rate": 8.497652072940896e-06, "loss": 0.0591, "step": 1478 }, { "epoch": 1.3134991119005328, "grad_norm": 0.4018870174647557, "learning_rate": 8.494881480183009e-06, "loss": 0.0573, "step": 1479 }, { "epoch": 1.3143872113676731, "grad_norm": 0.5311441613629344, "learning_rate": 8.492108787620106e-06, "loss": 0.0607, "step": 1480 }, { "epoch": 1.3152753108348134, "grad_norm": 1.155062480170173, "learning_rate": 8.489333996918087e-06, "loss": 0.0662, "step": 1481 }, { "epoch": 1.3161634103019537, "grad_norm": 0.45595005579384046, "learning_rate": 8.486557109744106e-06, "loss": 0.0617, "step": 1482 }, { "epoch": 1.3170515097690942, "grad_norm": 0.47490667013003973, "learning_rate": 8.483778127766587e-06, "loss": 0.0553, "step": 1483 }, { "epoch": 1.3179396092362343, "grad_norm": 0.792155455739816, "learning_rate": 8.480997052655203e-06, "loss": 0.0687, "step": 1484 }, { "epoch": 1.3188277087033748, "grad_norm": 0.4163969332342965, "learning_rate": 8.478213886080883e-06, "loss": 0.0541, "step": 1485 }, { "epoch": 1.3197158081705151, "grad_norm": 0.4473583978385226, "learning_rate": 8.475428629715825e-06, "loss": 0.0563, "step": 1486 }, { "epoch": 1.3206039076376554, "grad_norm": 0.4301317785599763, "learning_rate": 8.47264128523347e-06, "loss": 0.0545, "step": 1487 }, { "epoch": 1.3214920071047958, "grad_norm": 0.4595297952515238, "learning_rate": 8.469851854308522e-06, "loss": 0.0434, "step": 1488 }, { "epoch": 1.322380106571936, "grad_norm": 0.5166301986880361, "learning_rate": 8.467060338616931e-06, "loss": 0.0565, "step": 1489 }, { "epoch": 1.3232682060390764, "grad_norm": 0.4419532686675486, "learning_rate": 8.464266739835906e-06, "loss": 0.0503, "step": 1490 }, { "epoch": 1.3241563055062167, "grad_norm": 0.5410592441277755, "learning_rate": 8.461471059643907e-06, "loss": 0.048, "step": 1491 }, { "epoch": 1.325044404973357, "grad_norm": 0.4449123853152512, "learning_rate": 8.458673299720639e-06, "loss": 0.0435, "step": 1492 }, { "epoch": 1.3259325044404973, "grad_norm": 0.4790481415566278, "learning_rate": 8.455873461747064e-06, "loss": 0.0588, "step": 1493 }, { "epoch": 1.3268206039076378, "grad_norm": 0.5342704537961382, "learning_rate": 8.453071547405387e-06, "loss": 0.0634, "step": 1494 }, { "epoch": 1.3277087033747779, "grad_norm": 0.4170166866754002, "learning_rate": 8.450267558379063e-06, "loss": 0.0559, "step": 1495 }, { "epoch": 1.3285968028419184, "grad_norm": 0.5802552581740366, "learning_rate": 8.447461496352791e-06, "loss": 0.0547, "step": 1496 }, { "epoch": 1.3294849023090587, "grad_norm": 0.6286196611464454, "learning_rate": 8.444653363012521e-06, "loss": 0.0688, "step": 1497 }, { "epoch": 1.330373001776199, "grad_norm": 0.36039387770317033, "learning_rate": 8.441843160045443e-06, "loss": 0.0514, "step": 1498 }, { "epoch": 1.3312611012433393, "grad_norm": 0.5411986034980915, "learning_rate": 8.439030889139987e-06, "loss": 0.063, "step": 1499 }, { "epoch": 1.3321492007104796, "grad_norm": 0.49451007108822237, "learning_rate": 8.436216551985836e-06, "loss": 0.0603, "step": 1500 }, { "epoch": 1.33303730017762, "grad_norm": 0.6786698927795038, "learning_rate": 8.433400150273907e-06, "loss": 0.0584, "step": 1501 }, { "epoch": 1.3339253996447602, "grad_norm": 0.5372460305082452, "learning_rate": 8.430581685696356e-06, "loss": 0.0529, "step": 1502 }, { "epoch": 1.3348134991119005, "grad_norm": 0.44577665924664855, "learning_rate": 8.427761159946584e-06, "loss": 0.0467, "step": 1503 }, { "epoch": 1.3357015985790408, "grad_norm": 0.3425848176226401, "learning_rate": 8.424938574719224e-06, "loss": 0.0445, "step": 1504 }, { "epoch": 1.336589698046181, "grad_norm": 0.6350760384909353, "learning_rate": 8.422113931710156e-06, "loss": 0.063, "step": 1505 }, { "epoch": 1.3374777975133214, "grad_norm": 0.43821156257968646, "learning_rate": 8.419287232616484e-06, "loss": 0.0609, "step": 1506 }, { "epoch": 1.338365896980462, "grad_norm": 0.762015230863103, "learning_rate": 8.416458479136558e-06, "loss": 0.0539, "step": 1507 }, { "epoch": 1.339253996447602, "grad_norm": 0.39381448397625796, "learning_rate": 8.413627672969958e-06, "loss": 0.0501, "step": 1508 }, { "epoch": 1.3401420959147425, "grad_norm": 0.5136660363183541, "learning_rate": 8.410794815817494e-06, "loss": 0.0497, "step": 1509 }, { "epoch": 1.3410301953818828, "grad_norm": 0.46079688680757713, "learning_rate": 8.407959909381214e-06, "loss": 0.0535, "step": 1510 }, { "epoch": 1.3419182948490231, "grad_norm": 0.6640295890749003, "learning_rate": 8.405122955364394e-06, "loss": 0.0597, "step": 1511 }, { "epoch": 1.3428063943161634, "grad_norm": 0.4732830529420499, "learning_rate": 8.402283955471541e-06, "loss": 0.0441, "step": 1512 }, { "epoch": 1.3436944937833037, "grad_norm": 0.45163064734160446, "learning_rate": 8.399442911408393e-06, "loss": 0.0482, "step": 1513 }, { "epoch": 1.344582593250444, "grad_norm": 0.4004202627674794, "learning_rate": 8.396599824881913e-06, "loss": 0.0459, "step": 1514 }, { "epoch": 1.3454706927175843, "grad_norm": 0.5085992052846169, "learning_rate": 8.393754697600291e-06, "loss": 0.0573, "step": 1515 }, { "epoch": 1.3463587921847247, "grad_norm": 0.4924731869436927, "learning_rate": 8.390907531272951e-06, "loss": 0.0659, "step": 1516 }, { "epoch": 1.347246891651865, "grad_norm": 0.5452365800224888, "learning_rate": 8.388058327610528e-06, "loss": 0.0516, "step": 1517 }, { "epoch": 1.3481349911190053, "grad_norm": 0.5704325365049103, "learning_rate": 8.385207088324894e-06, "loss": 0.0484, "step": 1518 }, { "epoch": 1.3490230905861456, "grad_norm": 0.5016393880689081, "learning_rate": 8.382353815129136e-06, "loss": 0.0616, "step": 1519 }, { "epoch": 1.349911190053286, "grad_norm": 0.3981430116519234, "learning_rate": 8.37949850973757e-06, "loss": 0.0498, "step": 1520 }, { "epoch": 1.3507992895204262, "grad_norm": 0.47622194032636755, "learning_rate": 8.376641173865724e-06, "loss": 0.0476, "step": 1521 }, { "epoch": 1.3516873889875667, "grad_norm": 0.46749798952082183, "learning_rate": 8.373781809230355e-06, "loss": 0.0496, "step": 1522 }, { "epoch": 1.352575488454707, "grad_norm": 0.404985475436921, "learning_rate": 8.370920417549433e-06, "loss": 0.0441, "step": 1523 }, { "epoch": 1.3534635879218473, "grad_norm": 0.38619516691976297, "learning_rate": 8.36805700054215e-06, "loss": 0.0466, "step": 1524 }, { "epoch": 1.3543516873889876, "grad_norm": 0.3893869282188238, "learning_rate": 8.365191559928908e-06, "loss": 0.0474, "step": 1525 }, { "epoch": 1.355239786856128, "grad_norm": 0.4369571034155886, "learning_rate": 8.362324097431335e-06, "loss": 0.0503, "step": 1526 }, { "epoch": 1.3561278863232682, "grad_norm": 0.5264090102795924, "learning_rate": 8.359454614772264e-06, "loss": 0.0652, "step": 1527 }, { "epoch": 1.3570159857904085, "grad_norm": 0.35886809356369237, "learning_rate": 8.356583113675749e-06, "loss": 0.0473, "step": 1528 }, { "epoch": 1.3579040852575488, "grad_norm": 0.3970066798407407, "learning_rate": 8.353709595867052e-06, "loss": 0.0456, "step": 1529 }, { "epoch": 1.358792184724689, "grad_norm": 0.3384342848322592, "learning_rate": 8.350834063072651e-06, "loss": 0.0373, "step": 1530 }, { "epoch": 1.3596802841918294, "grad_norm": 0.47453111740475157, "learning_rate": 8.347956517020227e-06, "loss": 0.0549, "step": 1531 }, { "epoch": 1.3605683836589697, "grad_norm": 0.4829163729790234, "learning_rate": 8.34507695943868e-06, "loss": 0.0544, "step": 1532 }, { "epoch": 1.3614564831261102, "grad_norm": 0.47628665800148995, "learning_rate": 8.342195392058113e-06, "loss": 0.0533, "step": 1533 }, { "epoch": 1.3623445825932503, "grad_norm": 0.4453596523266883, "learning_rate": 8.339311816609838e-06, "loss": 0.0596, "step": 1534 }, { "epoch": 1.3632326820603908, "grad_norm": 0.4231359352448745, "learning_rate": 8.336426234826374e-06, "loss": 0.0542, "step": 1535 }, { "epoch": 1.3641207815275311, "grad_norm": 0.3782868139162873, "learning_rate": 8.333538648441444e-06, "loss": 0.0496, "step": 1536 }, { "epoch": 1.3650088809946714, "grad_norm": 0.5542789896302543, "learning_rate": 8.330649059189975e-06, "loss": 0.0535, "step": 1537 }, { "epoch": 1.3658969804618117, "grad_norm": 0.4814896663589956, "learning_rate": 8.3277574688081e-06, "loss": 0.0515, "step": 1538 }, { "epoch": 1.366785079928952, "grad_norm": 0.6046455283468285, "learning_rate": 8.324863879033153e-06, "loss": 0.0585, "step": 1539 }, { "epoch": 1.3676731793960923, "grad_norm": 0.4118867357435279, "learning_rate": 8.321968291603669e-06, "loss": 0.0537, "step": 1540 }, { "epoch": 1.3685612788632326, "grad_norm": 0.4478074627392541, "learning_rate": 8.319070708259383e-06, "loss": 0.055, "step": 1541 }, { "epoch": 1.369449378330373, "grad_norm": 0.6756323939696056, "learning_rate": 8.31617113074123e-06, "loss": 0.0564, "step": 1542 }, { "epoch": 1.3703374777975132, "grad_norm": 0.3979245077082011, "learning_rate": 8.313269560791343e-06, "loss": 0.0549, "step": 1543 }, { "epoch": 1.3712255772646538, "grad_norm": 0.45049907570969744, "learning_rate": 8.31036600015305e-06, "loss": 0.0556, "step": 1544 }, { "epoch": 1.3721136767317939, "grad_norm": 0.6898831561463261, "learning_rate": 8.307460450570879e-06, "loss": 0.0613, "step": 1545 }, { "epoch": 1.3730017761989344, "grad_norm": 0.38427443065782585, "learning_rate": 8.30455291379055e-06, "loss": 0.0492, "step": 1546 }, { "epoch": 1.3738898756660747, "grad_norm": 0.3904995137122518, "learning_rate": 8.301643391558977e-06, "loss": 0.051, "step": 1547 }, { "epoch": 1.374777975133215, "grad_norm": 0.4675032072811224, "learning_rate": 8.298731885624272e-06, "loss": 0.0541, "step": 1548 }, { "epoch": 1.3756660746003553, "grad_norm": 0.6363678435023608, "learning_rate": 8.295818397735727e-06, "loss": 0.0521, "step": 1549 }, { "epoch": 1.3765541740674956, "grad_norm": 0.46452327793605236, "learning_rate": 8.29290292964384e-06, "loss": 0.0497, "step": 1550 }, { "epoch": 1.3774422735346359, "grad_norm": 0.3869276938537254, "learning_rate": 8.289985483100284e-06, "loss": 0.0464, "step": 1551 }, { "epoch": 1.3783303730017762, "grad_norm": 0.5705722937069527, "learning_rate": 8.287066059857936e-06, "loss": 0.0609, "step": 1552 }, { "epoch": 1.3792184724689165, "grad_norm": 0.3749300790504764, "learning_rate": 8.284144661670845e-06, "loss": 0.0503, "step": 1553 }, { "epoch": 1.3801065719360568, "grad_norm": 0.5450754778814413, "learning_rate": 8.28122129029426e-06, "loss": 0.0824, "step": 1554 }, { "epoch": 1.380994671403197, "grad_norm": 0.49668812306186516, "learning_rate": 8.278295947484605e-06, "loss": 0.0627, "step": 1555 }, { "epoch": 1.3818827708703374, "grad_norm": 0.4744427878395548, "learning_rate": 8.275368634999495e-06, "loss": 0.0539, "step": 1556 }, { "epoch": 1.382770870337478, "grad_norm": 0.583350862165481, "learning_rate": 8.272439354597728e-06, "loss": 0.0561, "step": 1557 }, { "epoch": 1.383658969804618, "grad_norm": 0.36980203867676986, "learning_rate": 8.269508108039279e-06, "loss": 0.0521, "step": 1558 }, { "epoch": 1.3845470692717585, "grad_norm": 0.5120540940254161, "learning_rate": 8.266574897085312e-06, "loss": 0.0473, "step": 1559 }, { "epoch": 1.3854351687388988, "grad_norm": 0.4827772130451817, "learning_rate": 8.263639723498167e-06, "loss": 0.0737, "step": 1560 }, { "epoch": 1.3863232682060391, "grad_norm": 0.4535918145216589, "learning_rate": 8.260702589041361e-06, "loss": 0.0485, "step": 1561 }, { "epoch": 1.3872113676731794, "grad_norm": 0.5217199430258538, "learning_rate": 8.257763495479596e-06, "loss": 0.0545, "step": 1562 }, { "epoch": 1.3880994671403197, "grad_norm": 0.4027009124705943, "learning_rate": 8.254822444578742e-06, "loss": 0.0503, "step": 1563 }, { "epoch": 1.38898756660746, "grad_norm": 0.4852312091691364, "learning_rate": 8.251879438105854e-06, "loss": 0.0438, "step": 1564 }, { "epoch": 1.3898756660746003, "grad_norm": 0.4268369610359005, "learning_rate": 8.248934477829154e-06, "loss": 0.0495, "step": 1565 }, { "epoch": 1.3907637655417406, "grad_norm": 0.45782613166201763, "learning_rate": 8.245987565518047e-06, "loss": 0.048, "step": 1566 }, { "epoch": 1.391651865008881, "grad_norm": 0.4389272824898538, "learning_rate": 8.2430387029431e-06, "loss": 0.0513, "step": 1567 }, { "epoch": 1.3925399644760212, "grad_norm": 0.36951403873805333, "learning_rate": 8.240087891876061e-06, "loss": 0.0489, "step": 1568 }, { "epoch": 1.3934280639431615, "grad_norm": 0.45586866631218004, "learning_rate": 8.237135134089842e-06, "loss": 0.0476, "step": 1569 }, { "epoch": 1.394316163410302, "grad_norm": 0.4549045157497352, "learning_rate": 8.234180431358534e-06, "loss": 0.0474, "step": 1570 }, { "epoch": 1.3952042628774421, "grad_norm": 0.4565821267928718, "learning_rate": 8.23122378545738e-06, "loss": 0.0559, "step": 1571 }, { "epoch": 1.3960923623445827, "grad_norm": 0.4701002068827714, "learning_rate": 8.22826519816281e-06, "loss": 0.0527, "step": 1572 }, { "epoch": 1.396980461811723, "grad_norm": 0.6596291184863229, "learning_rate": 8.225304671252406e-06, "loss": 0.0548, "step": 1573 }, { "epoch": 1.3978685612788633, "grad_norm": 0.38186387958597295, "learning_rate": 8.222342206504922e-06, "loss": 0.0501, "step": 1574 }, { "epoch": 1.3987566607460036, "grad_norm": 0.4833881974668152, "learning_rate": 8.219377805700277e-06, "loss": 0.0528, "step": 1575 }, { "epoch": 1.3996447602131439, "grad_norm": 0.4523554837124434, "learning_rate": 8.216411470619547e-06, "loss": 0.069, "step": 1576 }, { "epoch": 1.4005328596802842, "grad_norm": 0.47654859451639997, "learning_rate": 8.21344320304498e-06, "loss": 0.0546, "step": 1577 }, { "epoch": 1.4014209591474245, "grad_norm": 0.4398119796047219, "learning_rate": 8.21047300475997e-06, "loss": 0.0565, "step": 1578 }, { "epoch": 1.4023090586145648, "grad_norm": 0.5207074172769443, "learning_rate": 8.207500877549089e-06, "loss": 0.0587, "step": 1579 }, { "epoch": 1.403197158081705, "grad_norm": 0.48650562407801845, "learning_rate": 8.204526823198055e-06, "loss": 0.0506, "step": 1580 }, { "epoch": 1.4040852575488454, "grad_norm": 0.48287027505973873, "learning_rate": 8.201550843493748e-06, "loss": 0.053, "step": 1581 }, { "epoch": 1.4049733570159857, "grad_norm": 0.38947995151850845, "learning_rate": 8.198572940224208e-06, "loss": 0.0461, "step": 1582 }, { "epoch": 1.4058614564831262, "grad_norm": 0.6148736212194573, "learning_rate": 8.195593115178626e-06, "loss": 0.0663, "step": 1583 }, { "epoch": 1.4067495559502663, "grad_norm": 0.4121009501774294, "learning_rate": 8.192611370147347e-06, "loss": 0.046, "step": 1584 }, { "epoch": 1.4076376554174068, "grad_norm": 0.5049591962876894, "learning_rate": 8.189627706921876e-06, "loss": 0.0534, "step": 1585 }, { "epoch": 1.4085257548845471, "grad_norm": 0.43800379560380437, "learning_rate": 8.186642127294863e-06, "loss": 0.0598, "step": 1586 }, { "epoch": 1.4094138543516874, "grad_norm": 0.5704135811442727, "learning_rate": 8.183654633060114e-06, "loss": 0.072, "step": 1587 }, { "epoch": 1.4103019538188277, "grad_norm": 0.3892089524421799, "learning_rate": 8.180665226012585e-06, "loss": 0.0484, "step": 1588 }, { "epoch": 1.411190053285968, "grad_norm": 0.461051514875073, "learning_rate": 8.177673907948378e-06, "loss": 0.066, "step": 1589 }, { "epoch": 1.4120781527531083, "grad_norm": 0.7247921988831233, "learning_rate": 8.174680680664748e-06, "loss": 0.0686, "step": 1590 }, { "epoch": 1.4129662522202486, "grad_norm": 0.39580557698655033, "learning_rate": 8.171685545960095e-06, "loss": 0.0443, "step": 1591 }, { "epoch": 1.413854351687389, "grad_norm": 0.45187941937848386, "learning_rate": 8.168688505633962e-06, "loss": 0.056, "step": 1592 }, { "epoch": 1.4147424511545292, "grad_norm": 0.48617085607247623, "learning_rate": 8.16568956148704e-06, "loss": 0.0675, "step": 1593 }, { "epoch": 1.4156305506216698, "grad_norm": 0.6178370439855861, "learning_rate": 8.162688715321165e-06, "loss": 0.0609, "step": 1594 }, { "epoch": 1.4165186500888098, "grad_norm": 0.36070783666679784, "learning_rate": 8.159685968939315e-06, "loss": 0.0494, "step": 1595 }, { "epoch": 1.4174067495559504, "grad_norm": 0.33555797226139794, "learning_rate": 8.156681324145605e-06, "loss": 0.0455, "step": 1596 }, { "epoch": 1.4182948490230907, "grad_norm": 0.44810990333980477, "learning_rate": 8.1536747827453e-06, "loss": 0.0498, "step": 1597 }, { "epoch": 1.419182948490231, "grad_norm": 0.467364659598731, "learning_rate": 8.150666346544794e-06, "loss": 0.0418, "step": 1598 }, { "epoch": 1.4200710479573713, "grad_norm": 0.5506948501799759, "learning_rate": 8.147656017351629e-06, "loss": 0.0511, "step": 1599 }, { "epoch": 1.4209591474245116, "grad_norm": 0.5381193231690307, "learning_rate": 8.144643796974477e-06, "loss": 0.0581, "step": 1600 }, { "epoch": 1.4218472468916519, "grad_norm": 0.37909069381598576, "learning_rate": 8.141629687223149e-06, "loss": 0.0459, "step": 1601 }, { "epoch": 1.4227353463587922, "grad_norm": 0.5968848612394052, "learning_rate": 8.138613689908592e-06, "loss": 0.0523, "step": 1602 }, { "epoch": 1.4236234458259325, "grad_norm": 0.44627052951661905, "learning_rate": 8.13559580684289e-06, "loss": 0.06, "step": 1603 }, { "epoch": 1.4245115452930728, "grad_norm": 0.5174906428310386, "learning_rate": 8.132576039839248e-06, "loss": 0.0535, "step": 1604 }, { "epoch": 1.425399644760213, "grad_norm": 0.4570062922280352, "learning_rate": 8.12955439071202e-06, "loss": 0.0447, "step": 1605 }, { "epoch": 1.4262877442273534, "grad_norm": 0.4904082398243398, "learning_rate": 8.126530861276677e-06, "loss": 0.0532, "step": 1606 }, { "epoch": 1.427175843694494, "grad_norm": 0.42793939717884144, "learning_rate": 8.123505453349828e-06, "loss": 0.051, "step": 1607 }, { "epoch": 1.428063943161634, "grad_norm": 0.3451762736415589, "learning_rate": 8.120478168749201e-06, "loss": 0.0483, "step": 1608 }, { "epoch": 1.4289520426287745, "grad_norm": 0.4279245108056465, "learning_rate": 8.117449009293668e-06, "loss": 0.0595, "step": 1609 }, { "epoch": 1.4298401420959148, "grad_norm": 0.37575357615088556, "learning_rate": 8.114417976803212e-06, "loss": 0.0541, "step": 1610 }, { "epoch": 1.4307282415630551, "grad_norm": 0.44406572585728893, "learning_rate": 8.111385073098946e-06, "loss": 0.0483, "step": 1611 }, { "epoch": 1.4316163410301954, "grad_norm": 0.3826166328358062, "learning_rate": 8.10835030000311e-06, "loss": 0.0538, "step": 1612 }, { "epoch": 1.4325044404973357, "grad_norm": 0.35617527077092515, "learning_rate": 8.105313659339065e-06, "loss": 0.0406, "step": 1613 }, { "epoch": 1.433392539964476, "grad_norm": 0.47394559066290226, "learning_rate": 8.102275152931294e-06, "loss": 0.0476, "step": 1614 }, { "epoch": 1.4342806394316163, "grad_norm": 0.42072502125896794, "learning_rate": 8.099234782605404e-06, "loss": 0.0462, "step": 1615 }, { "epoch": 1.4351687388987566, "grad_norm": 0.4124712231895515, "learning_rate": 8.096192550188113e-06, "loss": 0.0474, "step": 1616 }, { "epoch": 1.436056838365897, "grad_norm": 0.358379122077467, "learning_rate": 8.093148457507272e-06, "loss": 0.0393, "step": 1617 }, { "epoch": 1.4369449378330372, "grad_norm": 0.4731175736396712, "learning_rate": 8.090102506391835e-06, "loss": 0.0609, "step": 1618 }, { "epoch": 1.4378330373001775, "grad_norm": 0.4612618118512498, "learning_rate": 8.087054698671884e-06, "loss": 0.0687, "step": 1619 }, { "epoch": 1.438721136767318, "grad_norm": 0.38400775436146545, "learning_rate": 8.084005036178608e-06, "loss": 0.0587, "step": 1620 }, { "epoch": 1.4396092362344581, "grad_norm": 0.3864071686719111, "learning_rate": 8.080953520744319e-06, "loss": 0.0444, "step": 1621 }, { "epoch": 1.4404973357015987, "grad_norm": 0.5102453425146569, "learning_rate": 8.077900154202432e-06, "loss": 0.0618, "step": 1622 }, { "epoch": 1.441385435168739, "grad_norm": 0.37693596322117345, "learning_rate": 8.074844938387485e-06, "loss": 0.0445, "step": 1623 }, { "epoch": 1.4422735346358793, "grad_norm": 0.43727362483985777, "learning_rate": 8.071787875135116e-06, "loss": 0.0557, "step": 1624 }, { "epoch": 1.4431616341030196, "grad_norm": 0.4845514139285602, "learning_rate": 8.068728966282084e-06, "loss": 0.0642, "step": 1625 }, { "epoch": 1.4440497335701599, "grad_norm": 0.4033972637944131, "learning_rate": 8.065668213666252e-06, "loss": 0.055, "step": 1626 }, { "epoch": 1.4449378330373002, "grad_norm": 0.5384839931180158, "learning_rate": 8.062605619126585e-06, "loss": 0.0553, "step": 1627 }, { "epoch": 1.4458259325044405, "grad_norm": 0.5790748920097467, "learning_rate": 8.059541184503163e-06, "loss": 0.0561, "step": 1628 }, { "epoch": 1.4467140319715808, "grad_norm": 0.8618873360176641, "learning_rate": 8.05647491163717e-06, "loss": 0.0533, "step": 1629 }, { "epoch": 1.447602131438721, "grad_norm": 0.5534422282504271, "learning_rate": 8.053406802370892e-06, "loss": 0.0546, "step": 1630 }, { "epoch": 1.4484902309058614, "grad_norm": 0.5408854599299867, "learning_rate": 8.05033685854772e-06, "loss": 0.0536, "step": 1631 }, { "epoch": 1.4493783303730017, "grad_norm": 0.35642300584211933, "learning_rate": 8.047265082012145e-06, "loss": 0.0407, "step": 1632 }, { "epoch": 1.4502664298401422, "grad_norm": 0.38358783786879375, "learning_rate": 8.044191474609762e-06, "loss": 0.0549, "step": 1633 }, { "epoch": 1.4511545293072823, "grad_norm": 0.6098586672152925, "learning_rate": 8.041116038187266e-06, "loss": 0.0567, "step": 1634 }, { "epoch": 1.4520426287744228, "grad_norm": 0.7964422165476598, "learning_rate": 8.038038774592449e-06, "loss": 0.0577, "step": 1635 }, { "epoch": 1.452930728241563, "grad_norm": 0.39671778431004695, "learning_rate": 8.034959685674199e-06, "loss": 0.0479, "step": 1636 }, { "epoch": 1.4538188277087034, "grad_norm": 0.5714020613486274, "learning_rate": 8.031878773282507e-06, "loss": 0.0589, "step": 1637 }, { "epoch": 1.4547069271758437, "grad_norm": 0.5655019695958018, "learning_rate": 8.028796039268455e-06, "loss": 0.0724, "step": 1638 }, { "epoch": 1.455595026642984, "grad_norm": 0.45811594042660747, "learning_rate": 8.02571148548422e-06, "loss": 0.0566, "step": 1639 }, { "epoch": 1.4564831261101243, "grad_norm": 0.4297749875770322, "learning_rate": 8.022625113783071e-06, "loss": 0.0605, "step": 1640 }, { "epoch": 1.4573712255772646, "grad_norm": 0.34519408855532796, "learning_rate": 8.019536926019373e-06, "loss": 0.0452, "step": 1641 }, { "epoch": 1.458259325044405, "grad_norm": 0.3839507109056651, "learning_rate": 8.016446924048582e-06, "loss": 0.0455, "step": 1642 }, { "epoch": 1.4591474245115452, "grad_norm": 0.4504413415312184, "learning_rate": 8.013355109727237e-06, "loss": 0.0559, "step": 1643 }, { "epoch": 1.4600355239786857, "grad_norm": 0.41509316012858266, "learning_rate": 8.010261484912974e-06, "loss": 0.0528, "step": 1644 }, { "epoch": 1.4609236234458258, "grad_norm": 0.4419276919161984, "learning_rate": 8.007166051464515e-06, "loss": 0.0518, "step": 1645 }, { "epoch": 1.4618117229129663, "grad_norm": 0.47221828061702026, "learning_rate": 8.004068811241666e-06, "loss": 0.0528, "step": 1646 }, { "epoch": 1.4626998223801067, "grad_norm": 0.35945339568820633, "learning_rate": 8.000969766105317e-06, "loss": 0.0459, "step": 1647 }, { "epoch": 1.463587921847247, "grad_norm": 0.45825253444887293, "learning_rate": 7.997868917917453e-06, "loss": 0.061, "step": 1648 }, { "epoch": 1.4644760213143873, "grad_norm": 0.4298496576221392, "learning_rate": 7.994766268541127e-06, "loss": 0.0516, "step": 1649 }, { "epoch": 1.4653641207815276, "grad_norm": 0.5922664838557191, "learning_rate": 7.991661819840486e-06, "loss": 0.0618, "step": 1650 }, { "epoch": 1.4662522202486679, "grad_norm": 0.3709128715491371, "learning_rate": 7.988555573680753e-06, "loss": 0.0461, "step": 1651 }, { "epoch": 1.4671403197158082, "grad_norm": 0.4102307970961409, "learning_rate": 7.985447531928234e-06, "loss": 0.0451, "step": 1652 }, { "epoch": 1.4680284191829485, "grad_norm": 0.45617458628874286, "learning_rate": 7.982337696450309e-06, "loss": 0.0504, "step": 1653 }, { "epoch": 1.4689165186500888, "grad_norm": 0.4619976657842071, "learning_rate": 7.979226069115438e-06, "loss": 0.0522, "step": 1654 }, { "epoch": 1.469804618117229, "grad_norm": 0.3989223785652348, "learning_rate": 7.976112651793162e-06, "loss": 0.0473, "step": 1655 }, { "epoch": 1.4706927175843694, "grad_norm": 0.5098062947975681, "learning_rate": 7.972997446354091e-06, "loss": 0.05, "step": 1656 }, { "epoch": 1.47158081705151, "grad_norm": 0.3552857864695806, "learning_rate": 7.969880454669912e-06, "loss": 0.0437, "step": 1657 }, { "epoch": 1.47246891651865, "grad_norm": 0.4763143967476609, "learning_rate": 7.966761678613385e-06, "loss": 0.0415, "step": 1658 }, { "epoch": 1.4733570159857905, "grad_norm": 0.46591629758296743, "learning_rate": 7.963641120058341e-06, "loss": 0.0464, "step": 1659 }, { "epoch": 1.4742451154529308, "grad_norm": 0.5211145953421122, "learning_rate": 7.960518780879688e-06, "loss": 0.0539, "step": 1660 }, { "epoch": 1.475133214920071, "grad_norm": 0.5228037756990918, "learning_rate": 7.957394662953395e-06, "loss": 0.0641, "step": 1661 }, { "epoch": 1.4760213143872114, "grad_norm": 0.5671403584205867, "learning_rate": 7.954268768156504e-06, "loss": 0.0518, "step": 1662 }, { "epoch": 1.4769094138543517, "grad_norm": 0.4488867432581175, "learning_rate": 7.951141098367125e-06, "loss": 0.0505, "step": 1663 }, { "epoch": 1.477797513321492, "grad_norm": 0.5898047326800395, "learning_rate": 7.948011655464435e-06, "loss": 0.0569, "step": 1664 }, { "epoch": 1.4786856127886323, "grad_norm": 0.36299263667907605, "learning_rate": 7.944880441328673e-06, "loss": 0.051, "step": 1665 }, { "epoch": 1.4795737122557726, "grad_norm": 0.7758824677337602, "learning_rate": 7.941747457841145e-06, "loss": 0.0633, "step": 1666 }, { "epoch": 1.480461811722913, "grad_norm": 0.4267977686777056, "learning_rate": 7.938612706884221e-06, "loss": 0.0541, "step": 1667 }, { "epoch": 1.4813499111900532, "grad_norm": 0.3853287094386836, "learning_rate": 7.935476190341329e-06, "loss": 0.0373, "step": 1668 }, { "epoch": 1.4822380106571935, "grad_norm": 0.43538914862836475, "learning_rate": 7.93233791009696e-06, "loss": 0.0543, "step": 1669 }, { "epoch": 1.483126110124334, "grad_norm": 0.45485804483725983, "learning_rate": 7.92919786803667e-06, "loss": 0.0505, "step": 1670 }, { "epoch": 1.4840142095914741, "grad_norm": 0.5662436734745963, "learning_rate": 7.926056066047058e-06, "loss": 0.047, "step": 1671 }, { "epoch": 1.4849023090586146, "grad_norm": 0.48064756279116727, "learning_rate": 7.922912506015803e-06, "loss": 0.0471, "step": 1672 }, { "epoch": 1.485790408525755, "grad_norm": 0.5239235119594058, "learning_rate": 7.91976718983162e-06, "loss": 0.0507, "step": 1673 }, { "epoch": 1.4866785079928952, "grad_norm": 0.47340647494697957, "learning_rate": 7.916620119384289e-06, "loss": 0.0573, "step": 1674 }, { "epoch": 1.4875666074600356, "grad_norm": 0.4257589822557975, "learning_rate": 7.913471296564641e-06, "loss": 0.0455, "step": 1675 }, { "epoch": 1.4884547069271759, "grad_norm": 0.4007316318150693, "learning_rate": 7.910320723264563e-06, "loss": 0.0458, "step": 1676 }, { "epoch": 1.4893428063943162, "grad_norm": 0.4826000147485177, "learning_rate": 7.907168401376993e-06, "loss": 0.0529, "step": 1677 }, { "epoch": 1.4902309058614565, "grad_norm": 0.37370910532349544, "learning_rate": 7.904014332795915e-06, "loss": 0.043, "step": 1678 }, { "epoch": 1.4911190053285968, "grad_norm": 0.49968928171230986, "learning_rate": 7.900858519416368e-06, "loss": 0.0544, "step": 1679 }, { "epoch": 1.492007104795737, "grad_norm": 0.4528316894124392, "learning_rate": 7.897700963134436e-06, "loss": 0.0529, "step": 1680 }, { "epoch": 1.4928952042628774, "grad_norm": 0.4222293956218277, "learning_rate": 7.894541665847253e-06, "loss": 0.0441, "step": 1681 }, { "epoch": 1.4937833037300177, "grad_norm": 0.43844080754157805, "learning_rate": 7.891380629452994e-06, "loss": 0.05, "step": 1682 }, { "epoch": 1.4946714031971582, "grad_norm": 0.41426204357656377, "learning_rate": 7.888217855850885e-06, "loss": 0.0505, "step": 1683 }, { "epoch": 1.4955595026642983, "grad_norm": 0.3721657025423876, "learning_rate": 7.885053346941193e-06, "loss": 0.0457, "step": 1684 }, { "epoch": 1.4964476021314388, "grad_norm": 0.5019826083758884, "learning_rate": 7.881887104625226e-06, "loss": 0.0743, "step": 1685 }, { "epoch": 1.497335701598579, "grad_norm": 0.42722738957810236, "learning_rate": 7.878719130805336e-06, "loss": 0.0541, "step": 1686 }, { "epoch": 1.4982238010657194, "grad_norm": 0.4124265597348239, "learning_rate": 7.875549427384916e-06, "loss": 0.0424, "step": 1687 }, { "epoch": 1.4991119005328597, "grad_norm": 0.38316319603171595, "learning_rate": 7.872377996268394e-06, "loss": 0.0471, "step": 1688 }, { "epoch": 1.5, "grad_norm": 0.5459557077565556, "learning_rate": 7.869204839361238e-06, "loss": 0.0566, "step": 1689 }, { "epoch": 1.5008880994671403, "grad_norm": 0.41423317774594537, "learning_rate": 7.866029958569956e-06, "loss": 0.0524, "step": 1690 }, { "epoch": 1.5017761989342806, "grad_norm": 0.37634240216102716, "learning_rate": 7.862853355802089e-06, "loss": 0.0489, "step": 1691 }, { "epoch": 1.502664298401421, "grad_norm": 0.5726844461852207, "learning_rate": 7.859675032966213e-06, "loss": 0.0639, "step": 1692 }, { "epoch": 1.5035523978685612, "grad_norm": 0.4360976829462697, "learning_rate": 7.856494991971933e-06, "loss": 0.0444, "step": 1693 }, { "epoch": 1.5044404973357017, "grad_norm": 0.40187458371621043, "learning_rate": 7.853313234729896e-06, "loss": 0.0443, "step": 1694 }, { "epoch": 1.5053285968028418, "grad_norm": 0.5477166486760292, "learning_rate": 7.850129763151773e-06, "loss": 0.0552, "step": 1695 }, { "epoch": 1.5062166962699823, "grad_norm": 0.4998916623686671, "learning_rate": 7.846944579150266e-06, "loss": 0.0714, "step": 1696 }, { "epoch": 1.5071047957371224, "grad_norm": 0.5352597644238419, "learning_rate": 7.843757684639108e-06, "loss": 0.0601, "step": 1697 }, { "epoch": 1.507992895204263, "grad_norm": 0.45412501834567376, "learning_rate": 7.840569081533058e-06, "loss": 0.0574, "step": 1698 }, { "epoch": 1.5088809946714032, "grad_norm": 0.3785546516205003, "learning_rate": 7.8373787717479e-06, "loss": 0.0477, "step": 1699 }, { "epoch": 1.5097690941385435, "grad_norm": 0.47783613914291184, "learning_rate": 7.834186757200448e-06, "loss": 0.0614, "step": 1700 }, { "epoch": 1.5106571936056838, "grad_norm": 0.6507710007364977, "learning_rate": 7.830993039808537e-06, "loss": 0.0611, "step": 1701 }, { "epoch": 1.5115452930728241, "grad_norm": 0.5108180337745931, "learning_rate": 7.827797621491025e-06, "loss": 0.0499, "step": 1702 }, { "epoch": 1.5124333925399644, "grad_norm": 0.4578325365843359, "learning_rate": 7.824600504167793e-06, "loss": 0.0502, "step": 1703 }, { "epoch": 1.5133214920071048, "grad_norm": 0.36602192544459716, "learning_rate": 7.82140168975974e-06, "loss": 0.0522, "step": 1704 }, { "epoch": 1.5142095914742453, "grad_norm": 0.40292599196527157, "learning_rate": 7.818201180188794e-06, "loss": 0.0542, "step": 1705 }, { "epoch": 1.5150976909413854, "grad_norm": 0.541855291094372, "learning_rate": 7.814998977377886e-06, "loss": 0.0562, "step": 1706 }, { "epoch": 1.5159857904085259, "grad_norm": 0.42728006868013124, "learning_rate": 7.811795083250979e-06, "loss": 0.0524, "step": 1707 }, { "epoch": 1.516873889875666, "grad_norm": 0.5918224235333536, "learning_rate": 7.808589499733045e-06, "loss": 0.0515, "step": 1708 }, { "epoch": 1.5177619893428065, "grad_norm": 0.3453187120825003, "learning_rate": 7.80538222875007e-06, "loss": 0.0431, "step": 1709 }, { "epoch": 1.5186500888099466, "grad_norm": 0.434267114063888, "learning_rate": 7.802173272229058e-06, "loss": 0.055, "step": 1710 }, { "epoch": 1.519538188277087, "grad_norm": 0.5718702323522821, "learning_rate": 7.798962632098024e-06, "loss": 0.0525, "step": 1711 }, { "epoch": 1.5204262877442274, "grad_norm": 0.3870661493171084, "learning_rate": 7.795750310285993e-06, "loss": 0.055, "step": 1712 }, { "epoch": 1.5213143872113677, "grad_norm": 0.5491931559022314, "learning_rate": 7.792536308723001e-06, "loss": 0.0509, "step": 1713 }, { "epoch": 1.522202486678508, "grad_norm": 0.34261387324707615, "learning_rate": 7.789320629340096e-06, "loss": 0.0374, "step": 1714 }, { "epoch": 1.5230905861456483, "grad_norm": 0.5522900715341141, "learning_rate": 7.786103274069331e-06, "loss": 0.0437, "step": 1715 }, { "epoch": 1.5239786856127886, "grad_norm": 0.5101094231820494, "learning_rate": 7.782884244843766e-06, "loss": 0.0695, "step": 1716 }, { "epoch": 1.524866785079929, "grad_norm": 0.4303875587604101, "learning_rate": 7.779663543597471e-06, "loss": 0.0519, "step": 1717 }, { "epoch": 1.5257548845470694, "grad_norm": 0.38600267270958155, "learning_rate": 7.776441172265511e-06, "loss": 0.0459, "step": 1718 }, { "epoch": 1.5266429840142095, "grad_norm": 0.4958890806451229, "learning_rate": 7.773217132783968e-06, "loss": 0.0647, "step": 1719 }, { "epoch": 1.52753108348135, "grad_norm": 0.4257022840174458, "learning_rate": 7.769991427089915e-06, "loss": 0.0558, "step": 1720 }, { "epoch": 1.52841918294849, "grad_norm": 0.549842543803982, "learning_rate": 7.76676405712143e-06, "loss": 0.0646, "step": 1721 }, { "epoch": 1.5293072824156306, "grad_norm": 0.5460234756719656, "learning_rate": 7.76353502481759e-06, "loss": 0.051, "step": 1722 }, { "epoch": 1.5301953818827707, "grad_norm": 0.4107897311322575, "learning_rate": 7.760304332118475e-06, "loss": 0.0525, "step": 1723 }, { "epoch": 1.5310834813499112, "grad_norm": 0.354086275064959, "learning_rate": 7.757071980965156e-06, "loss": 0.0441, "step": 1724 }, { "epoch": 1.5319715808170515, "grad_norm": 0.5648261454732384, "learning_rate": 7.753837973299706e-06, "loss": 0.0623, "step": 1725 }, { "epoch": 1.5328596802841918, "grad_norm": 0.3421281078802634, "learning_rate": 7.75060231106519e-06, "loss": 0.0332, "step": 1726 }, { "epoch": 1.5337477797513321, "grad_norm": 0.4674384959414192, "learning_rate": 7.747364996205669e-06, "loss": 0.0542, "step": 1727 }, { "epoch": 1.5346358792184724, "grad_norm": 0.3837912909909046, "learning_rate": 7.744126030666193e-06, "loss": 0.0473, "step": 1728 }, { "epoch": 1.5355239786856127, "grad_norm": 0.45716040673835834, "learning_rate": 7.740885416392811e-06, "loss": 0.0448, "step": 1729 }, { "epoch": 1.536412078152753, "grad_norm": 0.8260383553950559, "learning_rate": 7.737643155332559e-06, "loss": 0.0676, "step": 1730 }, { "epoch": 1.5373001776198936, "grad_norm": 0.5703793064204835, "learning_rate": 7.734399249433458e-06, "loss": 0.0583, "step": 1731 }, { "epoch": 1.5381882770870337, "grad_norm": 0.3251350830631577, "learning_rate": 7.73115370064452e-06, "loss": 0.0423, "step": 1732 }, { "epoch": 1.5390763765541742, "grad_norm": 0.3359864790300054, "learning_rate": 7.727906510915751e-06, "loss": 0.0416, "step": 1733 }, { "epoch": 1.5399644760213143, "grad_norm": 0.46875800830883074, "learning_rate": 7.724657682198133e-06, "loss": 0.0572, "step": 1734 }, { "epoch": 1.5408525754884548, "grad_norm": 0.46807351907670214, "learning_rate": 7.72140721644364e-06, "loss": 0.0489, "step": 1735 }, { "epoch": 1.541740674955595, "grad_norm": 0.5636691387582213, "learning_rate": 7.718155115605222e-06, "loss": 0.0682, "step": 1736 }, { "epoch": 1.5426287744227354, "grad_norm": 0.40135850551698504, "learning_rate": 7.714901381636819e-06, "loss": 0.0481, "step": 1737 }, { "epoch": 1.5435168738898757, "grad_norm": 0.48739636744157866, "learning_rate": 7.711646016493348e-06, "loss": 0.0482, "step": 1738 }, { "epoch": 1.544404973357016, "grad_norm": 0.43889147398324563, "learning_rate": 7.708389022130706e-06, "loss": 0.0492, "step": 1739 }, { "epoch": 1.5452930728241563, "grad_norm": 0.49191135418751325, "learning_rate": 7.705130400505774e-06, "loss": 0.0557, "step": 1740 }, { "epoch": 1.5461811722912966, "grad_norm": 0.34228005697150915, "learning_rate": 7.701870153576402e-06, "loss": 0.0453, "step": 1741 }, { "epoch": 1.547069271758437, "grad_norm": 0.4140692891317913, "learning_rate": 7.698608283301426e-06, "loss": 0.0505, "step": 1742 }, { "epoch": 1.5479573712255772, "grad_norm": 0.46045025491100045, "learning_rate": 7.695344791640648e-06, "loss": 0.0522, "step": 1743 }, { "epoch": 1.5488454706927177, "grad_norm": 0.4438077452461078, "learning_rate": 7.692079680554849e-06, "loss": 0.0438, "step": 1744 }, { "epoch": 1.5497335701598578, "grad_norm": 0.3985589283930428, "learning_rate": 7.688812952005786e-06, "loss": 0.046, "step": 1745 }, { "epoch": 1.5506216696269983, "grad_norm": 0.34141045183330265, "learning_rate": 7.685544607956182e-06, "loss": 0.0443, "step": 1746 }, { "epoch": 1.5515097690941384, "grad_norm": 0.35622382164888206, "learning_rate": 7.682274650369734e-06, "loss": 0.0431, "step": 1747 }, { "epoch": 1.552397868561279, "grad_norm": 0.5399825580211707, "learning_rate": 7.679003081211107e-06, "loss": 0.0616, "step": 1748 }, { "epoch": 1.5532859680284192, "grad_norm": 0.45712644197578145, "learning_rate": 7.675729902445937e-06, "loss": 0.0702, "step": 1749 }, { "epoch": 1.5541740674955595, "grad_norm": 0.674681003807884, "learning_rate": 7.67245511604082e-06, "loss": 0.0599, "step": 1750 }, { "epoch": 1.5550621669626998, "grad_norm": 0.34498352977043173, "learning_rate": 7.669178723963327e-06, "loss": 0.0404, "step": 1751 }, { "epoch": 1.5559502664298401, "grad_norm": 0.4222733940661997, "learning_rate": 7.665900728181989e-06, "loss": 0.046, "step": 1752 }, { "epoch": 1.5568383658969804, "grad_norm": 0.475618676968953, "learning_rate": 7.6626211306663e-06, "loss": 0.0604, "step": 1753 }, { "epoch": 1.5577264653641207, "grad_norm": 0.3770933077341249, "learning_rate": 7.659339933386718e-06, "loss": 0.0457, "step": 1754 }, { "epoch": 1.5586145648312613, "grad_norm": 0.4178934100132517, "learning_rate": 7.656057138314663e-06, "loss": 0.0476, "step": 1755 }, { "epoch": 1.5595026642984013, "grad_norm": 0.36640173060430026, "learning_rate": 7.652772747422512e-06, "loss": 0.0643, "step": 1756 }, { "epoch": 1.5603907637655419, "grad_norm": 0.44741988081457607, "learning_rate": 7.649486762683602e-06, "loss": 0.0562, "step": 1757 }, { "epoch": 1.561278863232682, "grad_norm": 0.576605656132711, "learning_rate": 7.646199186072229e-06, "loss": 0.0817, "step": 1758 }, { "epoch": 1.5621669626998225, "grad_norm": 0.36683552194563107, "learning_rate": 7.642910019563646e-06, "loss": 0.0478, "step": 1759 }, { "epoch": 1.5630550621669625, "grad_norm": 0.37934832255837464, "learning_rate": 7.639619265134056e-06, "loss": 0.0466, "step": 1760 }, { "epoch": 1.563943161634103, "grad_norm": 0.399945495147686, "learning_rate": 7.636326924760622e-06, "loss": 0.0505, "step": 1761 }, { "epoch": 1.5648312611012434, "grad_norm": 0.37470621259966547, "learning_rate": 7.63303300042146e-06, "loss": 0.0482, "step": 1762 }, { "epoch": 1.5657193605683837, "grad_norm": 0.46143179083831476, "learning_rate": 7.62973749409563e-06, "loss": 0.0514, "step": 1763 }, { "epoch": 1.566607460035524, "grad_norm": 0.3549802079949632, "learning_rate": 7.626440407763151e-06, "loss": 0.0391, "step": 1764 }, { "epoch": 1.5674955595026643, "grad_norm": 0.3526063297024799, "learning_rate": 7.6231417434049896e-06, "loss": 0.0463, "step": 1765 }, { "epoch": 1.5683836589698046, "grad_norm": 0.3625783448175399, "learning_rate": 7.619841503003056e-06, "loss": 0.0435, "step": 1766 }, { "epoch": 1.5692717584369449, "grad_norm": 0.3535200750509916, "learning_rate": 7.616539688540212e-06, "loss": 0.0467, "step": 1767 }, { "epoch": 1.5701598579040854, "grad_norm": 0.6134778577785767, "learning_rate": 7.613236302000265e-06, "loss": 0.0587, "step": 1768 }, { "epoch": 1.5710479573712255, "grad_norm": 0.47615307783222455, "learning_rate": 7.609931345367962e-06, "loss": 0.0467, "step": 1769 }, { "epoch": 1.571936056838366, "grad_norm": 0.3624745755144589, "learning_rate": 7.606624820628998e-06, "loss": 0.0491, "step": 1770 }, { "epoch": 1.572824156305506, "grad_norm": 0.5421904381130762, "learning_rate": 7.603316729770012e-06, "loss": 0.054, "step": 1771 }, { "epoch": 1.5737122557726466, "grad_norm": 0.45513120933546486, "learning_rate": 7.600007074778577e-06, "loss": 0.0734, "step": 1772 }, { "epoch": 1.5746003552397867, "grad_norm": 0.6563110917147711, "learning_rate": 7.596695857643211e-06, "loss": 0.0628, "step": 1773 }, { "epoch": 1.5754884547069272, "grad_norm": 0.43269624599939793, "learning_rate": 7.593383080353369e-06, "loss": 0.0534, "step": 1774 }, { "epoch": 1.5763765541740675, "grad_norm": 0.42259017056440773, "learning_rate": 7.590068744899444e-06, "loss": 0.058, "step": 1775 }, { "epoch": 1.5772646536412078, "grad_norm": 0.40956595502519033, "learning_rate": 7.586752853272765e-06, "loss": 0.0427, "step": 1776 }, { "epoch": 1.5781527531083481, "grad_norm": 0.3946458429155628, "learning_rate": 7.583435407465596e-06, "loss": 0.0407, "step": 1777 }, { "epoch": 1.5790408525754884, "grad_norm": 0.42541576285989563, "learning_rate": 7.580116409471134e-06, "loss": 0.0457, "step": 1778 }, { "epoch": 1.5799289520426287, "grad_norm": 0.4464409900609385, "learning_rate": 7.576795861283508e-06, "loss": 0.0626, "step": 1779 }, { "epoch": 1.580817051509769, "grad_norm": 0.41728552046426953, "learning_rate": 7.573473764897783e-06, "loss": 0.0476, "step": 1780 }, { "epoch": 1.5817051509769096, "grad_norm": 0.51250804492596, "learning_rate": 7.570150122309947e-06, "loss": 0.0576, "step": 1781 }, { "epoch": 1.5825932504440496, "grad_norm": 0.47986207085025157, "learning_rate": 7.566824935516924e-06, "loss": 0.0487, "step": 1782 }, { "epoch": 1.5834813499111902, "grad_norm": 0.5060248238325927, "learning_rate": 7.563498206516561e-06, "loss": 0.0598, "step": 1783 }, { "epoch": 1.5843694493783302, "grad_norm": 0.42043139495950965, "learning_rate": 7.560169937307633e-06, "loss": 0.0509, "step": 1784 }, { "epoch": 1.5852575488454708, "grad_norm": 0.5192341057560327, "learning_rate": 7.55684012988984e-06, "loss": 0.0579, "step": 1785 }, { "epoch": 1.586145648312611, "grad_norm": 0.3872785990169641, "learning_rate": 7.553508786263808e-06, "loss": 0.0573, "step": 1786 }, { "epoch": 1.5870337477797514, "grad_norm": 0.428936043137043, "learning_rate": 7.5501759084310834e-06, "loss": 0.0587, "step": 1787 }, { "epoch": 1.5879218472468917, "grad_norm": 0.40158672150687236, "learning_rate": 7.546841498394137e-06, "loss": 0.049, "step": 1788 }, { "epoch": 1.588809946714032, "grad_norm": 0.2965004935478777, "learning_rate": 7.543505558156359e-06, "loss": 0.0397, "step": 1789 }, { "epoch": 1.5896980461811723, "grad_norm": 0.3348089962247074, "learning_rate": 7.5401680897220575e-06, "loss": 0.0414, "step": 1790 }, { "epoch": 1.5905861456483126, "grad_norm": 0.3579386134392553, "learning_rate": 7.5368290950964616e-06, "loss": 0.046, "step": 1791 }, { "epoch": 1.5914742451154529, "grad_norm": 0.6344177863935818, "learning_rate": 7.533488576285714e-06, "loss": 0.0514, "step": 1792 }, { "epoch": 1.5923623445825932, "grad_norm": 0.4172607942552748, "learning_rate": 7.5301465352968775e-06, "loss": 0.0541, "step": 1793 }, { "epoch": 1.5932504440497337, "grad_norm": 0.4296116049176287, "learning_rate": 7.526802974137925e-06, "loss": 0.0469, "step": 1794 }, { "epoch": 1.5941385435168738, "grad_norm": 0.3704480570317141, "learning_rate": 7.523457894817745e-06, "loss": 0.0515, "step": 1795 }, { "epoch": 1.5950266429840143, "grad_norm": 0.4521460965601276, "learning_rate": 7.52011129934614e-06, "loss": 0.0443, "step": 1796 }, { "epoch": 1.5959147424511544, "grad_norm": 0.4394300498345061, "learning_rate": 7.51676318973382e-06, "loss": 0.0523, "step": 1797 }, { "epoch": 1.596802841918295, "grad_norm": 0.4760170297828387, "learning_rate": 7.513413567992405e-06, "loss": 0.049, "step": 1798 }, { "epoch": 1.5976909413854352, "grad_norm": 0.43812502523322966, "learning_rate": 7.510062436134426e-06, "loss": 0.0451, "step": 1799 }, { "epoch": 1.5985790408525755, "grad_norm": 0.3881160105648138, "learning_rate": 7.506709796173319e-06, "loss": 0.0426, "step": 1800 }, { "epoch": 1.5994671403197158, "grad_norm": 0.3930153590052019, "learning_rate": 7.503355650123427e-06, "loss": 0.05, "step": 1801 }, { "epoch": 1.6003552397868561, "grad_norm": 0.4945648869122955, "learning_rate": 7.500000000000001e-06, "loss": 0.0471, "step": 1802 }, { "epoch": 1.6012433392539964, "grad_norm": 0.44340722104644353, "learning_rate": 7.496642847819189e-06, "loss": 0.048, "step": 1803 }, { "epoch": 1.6021314387211367, "grad_norm": 0.4575391302567239, "learning_rate": 7.493284195598046e-06, "loss": 0.0565, "step": 1804 }, { "epoch": 1.6030195381882772, "grad_norm": 0.48191658932881076, "learning_rate": 7.489924045354527e-06, "loss": 0.0609, "step": 1805 }, { "epoch": 1.6039076376554173, "grad_norm": 0.5562535999796999, "learning_rate": 7.4865623991074894e-06, "loss": 0.0573, "step": 1806 }, { "epoch": 1.6047957371225579, "grad_norm": 0.3548939944794682, "learning_rate": 7.483199258876685e-06, "loss": 0.0395, "step": 1807 }, { "epoch": 1.605683836589698, "grad_norm": 0.37932803121585146, "learning_rate": 7.479834626682768e-06, "loss": 0.04, "step": 1808 }, { "epoch": 1.6065719360568385, "grad_norm": 0.38464769545242555, "learning_rate": 7.476468504547284e-06, "loss": 0.0468, "step": 1809 }, { "epoch": 1.6074600355239785, "grad_norm": 0.5426310282126411, "learning_rate": 7.473100894492679e-06, "loss": 0.0569, "step": 1810 }, { "epoch": 1.608348134991119, "grad_norm": 0.33572094350124215, "learning_rate": 7.469731798542288e-06, "loss": 0.0439, "step": 1811 }, { "epoch": 1.6092362344582594, "grad_norm": 0.4676095836985924, "learning_rate": 7.466361218720344e-06, "loss": 0.0541, "step": 1812 }, { "epoch": 1.6101243339253997, "grad_norm": 0.47157874619201895, "learning_rate": 7.462989157051965e-06, "loss": 0.0559, "step": 1813 }, { "epoch": 1.61101243339254, "grad_norm": 0.4272596303563746, "learning_rate": 7.459615615563166e-06, "loss": 0.051, "step": 1814 }, { "epoch": 1.6119005328596803, "grad_norm": 0.4824238196116946, "learning_rate": 7.456240596280848e-06, "loss": 0.0575, "step": 1815 }, { "epoch": 1.6127886323268206, "grad_norm": 0.9305803356665563, "learning_rate": 7.452864101232798e-06, "loss": 0.0529, "step": 1816 }, { "epoch": 1.6136767317939609, "grad_norm": 0.4136268375325825, "learning_rate": 7.449486132447694e-06, "loss": 0.056, "step": 1817 }, { "epoch": 1.6145648312611014, "grad_norm": 0.4565426594135729, "learning_rate": 7.446106691955097e-06, "loss": 0.0474, "step": 1818 }, { "epoch": 1.6154529307282415, "grad_norm": 0.5214072414287675, "learning_rate": 7.442725781785451e-06, "loss": 0.0674, "step": 1819 }, { "epoch": 1.616341030195382, "grad_norm": 0.3898726364029089, "learning_rate": 7.439343403970085e-06, "loss": 0.0414, "step": 1820 }, { "epoch": 1.617229129662522, "grad_norm": 0.39409589873524503, "learning_rate": 7.43595956054121e-06, "loss": 0.042, "step": 1821 }, { "epoch": 1.6181172291296626, "grad_norm": 0.5133191051412453, "learning_rate": 7.432574253531917e-06, "loss": 0.0491, "step": 1822 }, { "epoch": 1.6190053285968027, "grad_norm": 0.4201841096539901, "learning_rate": 7.429187484976172e-06, "loss": 0.0495, "step": 1823 }, { "epoch": 1.6198934280639432, "grad_norm": 0.4031322720823592, "learning_rate": 7.42579925690883e-06, "loss": 0.0407, "step": 1824 }, { "epoch": 1.6207815275310835, "grad_norm": 0.3666136430256817, "learning_rate": 7.422409571365612e-06, "loss": 0.0476, "step": 1825 }, { "epoch": 1.6216696269982238, "grad_norm": 0.6472614138905696, "learning_rate": 7.419018430383118e-06, "loss": 0.0469, "step": 1826 }, { "epoch": 1.6225577264653641, "grad_norm": 0.7981018598945092, "learning_rate": 7.415625835998828e-06, "loss": 0.0623, "step": 1827 }, { "epoch": 1.6234458259325044, "grad_norm": 0.3974535875556267, "learning_rate": 7.412231790251085e-06, "loss": 0.0436, "step": 1828 }, { "epoch": 1.6243339253996447, "grad_norm": 0.3923193715579734, "learning_rate": 7.408836295179114e-06, "loss": 0.053, "step": 1829 }, { "epoch": 1.625222024866785, "grad_norm": 0.5310667299265248, "learning_rate": 7.4054393528230025e-06, "loss": 0.0612, "step": 1830 }, { "epoch": 1.6261101243339255, "grad_norm": 0.5529502932529136, "learning_rate": 7.4020409652237165e-06, "loss": 0.0591, "step": 1831 }, { "epoch": 1.6269982238010656, "grad_norm": 0.4441596346548294, "learning_rate": 7.398641134423081e-06, "loss": 0.0422, "step": 1832 }, { "epoch": 1.6278863232682061, "grad_norm": 0.48736618114748104, "learning_rate": 7.395239862463794e-06, "loss": 0.0486, "step": 1833 }, { "epoch": 1.6287744227353462, "grad_norm": 0.5389730081774947, "learning_rate": 7.391837151389416e-06, "loss": 0.0555, "step": 1834 }, { "epoch": 1.6296625222024868, "grad_norm": 0.5114644149008559, "learning_rate": 7.388433003244377e-06, "loss": 0.047, "step": 1835 }, { "epoch": 1.6305506216696268, "grad_norm": 0.4288743852943751, "learning_rate": 7.385027420073965e-06, "loss": 0.0451, "step": 1836 }, { "epoch": 1.6314387211367674, "grad_norm": 0.5328497940204218, "learning_rate": 7.381620403924333e-06, "loss": 0.0556, "step": 1837 }, { "epoch": 1.6323268206039077, "grad_norm": 0.4812793644698344, "learning_rate": 7.378211956842496e-06, "loss": 0.0518, "step": 1838 }, { "epoch": 1.633214920071048, "grad_norm": 0.3728723626240057, "learning_rate": 7.374802080876324e-06, "loss": 0.0428, "step": 1839 }, { "epoch": 1.6341030195381883, "grad_norm": 0.47107517819965083, "learning_rate": 7.371390778074552e-06, "loss": 0.0492, "step": 1840 }, { "epoch": 1.6349911190053286, "grad_norm": 0.5033676728454763, "learning_rate": 7.367978050486768e-06, "loss": 0.0434, "step": 1841 }, { "epoch": 1.6358792184724689, "grad_norm": 0.36156424991827696, "learning_rate": 7.364563900163416e-06, "loss": 0.0417, "step": 1842 }, { "epoch": 1.6367673179396092, "grad_norm": 0.472226259653632, "learning_rate": 7.361148329155798e-06, "loss": 0.0496, "step": 1843 }, { "epoch": 1.6376554174067497, "grad_norm": 0.4612065513815727, "learning_rate": 7.357731339516067e-06, "loss": 0.0453, "step": 1844 }, { "epoch": 1.6385435168738898, "grad_norm": 0.37788789100539816, "learning_rate": 7.354312933297225e-06, "loss": 0.0437, "step": 1845 }, { "epoch": 1.6394316163410303, "grad_norm": 0.47442424835246877, "learning_rate": 7.350893112553134e-06, "loss": 0.0434, "step": 1846 }, { "epoch": 1.6403197158081704, "grad_norm": 0.5064539251676949, "learning_rate": 7.3474718793384995e-06, "loss": 0.0518, "step": 1847 }, { "epoch": 1.641207815275311, "grad_norm": 0.35894236613374486, "learning_rate": 7.3440492357088746e-06, "loss": 0.0401, "step": 1848 }, { "epoch": 1.6420959147424512, "grad_norm": 0.45759082265965767, "learning_rate": 7.340625183720664e-06, "loss": 0.0519, "step": 1849 }, { "epoch": 1.6429840142095915, "grad_norm": 0.39750803011604363, "learning_rate": 7.3371997254311165e-06, "loss": 0.0449, "step": 1850 }, { "epoch": 1.6438721136767318, "grad_norm": 0.3392691207859647, "learning_rate": 7.333772862898328e-06, "loss": 0.0403, "step": 1851 }, { "epoch": 1.644760213143872, "grad_norm": 0.4128996936237093, "learning_rate": 7.330344598181231e-06, "loss": 0.05, "step": 1852 }, { "epoch": 1.6456483126110124, "grad_norm": 0.5078054091289931, "learning_rate": 7.326914933339612e-06, "loss": 0.0518, "step": 1853 }, { "epoch": 1.6465364120781527, "grad_norm": 0.42840596236871603, "learning_rate": 7.323483870434087e-06, "loss": 0.0518, "step": 1854 }, { "epoch": 1.6474245115452932, "grad_norm": 0.5155458763080173, "learning_rate": 7.32005141152612e-06, "loss": 0.0517, "step": 1855 }, { "epoch": 1.6483126110124333, "grad_norm": 0.47345843257098125, "learning_rate": 7.316617558678011e-06, "loss": 0.0606, "step": 1856 }, { "epoch": 1.6492007104795738, "grad_norm": 0.3448701612007873, "learning_rate": 7.313182313952897e-06, "loss": 0.0471, "step": 1857 }, { "epoch": 1.650088809946714, "grad_norm": 0.6912772116733018, "learning_rate": 7.309745679414751e-06, "loss": 0.0598, "step": 1858 }, { "epoch": 1.6509769094138544, "grad_norm": 0.4514055160584369, "learning_rate": 7.3063076571283864e-06, "loss": 0.0511, "step": 1859 }, { "epoch": 1.6518650088809945, "grad_norm": 0.3931950881071841, "learning_rate": 7.302868249159441e-06, "loss": 0.0372, "step": 1860 }, { "epoch": 1.652753108348135, "grad_norm": 0.43739040998352097, "learning_rate": 7.29942745757439e-06, "loss": 0.0475, "step": 1861 }, { "epoch": 1.6536412078152753, "grad_norm": 0.4663761588583565, "learning_rate": 7.295985284440543e-06, "loss": 0.0521, "step": 1862 }, { "epoch": 1.6545293072824157, "grad_norm": 0.4626929162880625, "learning_rate": 7.292541731826035e-06, "loss": 0.0533, "step": 1863 }, { "epoch": 1.655417406749556, "grad_norm": 0.4813664691991621, "learning_rate": 7.289096801799831e-06, "loss": 0.0576, "step": 1864 }, { "epoch": 1.6563055062166963, "grad_norm": 0.3835863890156243, "learning_rate": 7.285650496431726e-06, "loss": 0.0452, "step": 1865 }, { "epoch": 1.6571936056838366, "grad_norm": 0.3982495988142279, "learning_rate": 7.282202817792337e-06, "loss": 0.0439, "step": 1866 }, { "epoch": 1.6580817051509769, "grad_norm": 0.584980684226663, "learning_rate": 7.2787537679531105e-06, "loss": 0.0456, "step": 1867 }, { "epoch": 1.6589698046181174, "grad_norm": 0.5324889054540743, "learning_rate": 7.2753033489863134e-06, "loss": 0.0487, "step": 1868 }, { "epoch": 1.6598579040852575, "grad_norm": 0.42218813341259764, "learning_rate": 7.271851562965037e-06, "loss": 0.0556, "step": 1869 }, { "epoch": 1.660746003552398, "grad_norm": 0.355553460224428, "learning_rate": 7.268398411963193e-06, "loss": 0.0439, "step": 1870 }, { "epoch": 1.661634103019538, "grad_norm": 0.3771293907326771, "learning_rate": 7.2649438980555165e-06, "loss": 0.0387, "step": 1871 }, { "epoch": 1.6625222024866786, "grad_norm": 0.448586691919345, "learning_rate": 7.261488023317555e-06, "loss": 0.0486, "step": 1872 }, { "epoch": 1.6634103019538187, "grad_norm": 0.4634017416752472, "learning_rate": 7.2580307898256805e-06, "loss": 0.047, "step": 1873 }, { "epoch": 1.6642984014209592, "grad_norm": 0.43490416915633406, "learning_rate": 7.254572199657075e-06, "loss": 0.0451, "step": 1874 }, { "epoch": 1.6651865008880995, "grad_norm": 0.4248897081866145, "learning_rate": 7.2511122548897426e-06, "loss": 0.0535, "step": 1875 }, { "epoch": 1.6660746003552398, "grad_norm": 0.34537012276279605, "learning_rate": 7.2476509576024945e-06, "loss": 0.052, "step": 1876 }, { "epoch": 1.66696269982238, "grad_norm": 0.42620719932561724, "learning_rate": 7.244188309874959e-06, "loss": 0.053, "step": 1877 }, { "epoch": 1.6678507992895204, "grad_norm": 0.3788018868784469, "learning_rate": 7.2407243137875725e-06, "loss": 0.0528, "step": 1878 }, { "epoch": 1.6687388987566607, "grad_norm": 0.4205270076037411, "learning_rate": 7.237258971421587e-06, "loss": 0.0442, "step": 1879 }, { "epoch": 1.669626998223801, "grad_norm": 0.3725818744304128, "learning_rate": 7.233792284859056e-06, "loss": 0.0477, "step": 1880 }, { "epoch": 1.6705150976909415, "grad_norm": 0.44835567030216367, "learning_rate": 7.230324256182848e-06, "loss": 0.0579, "step": 1881 }, { "epoch": 1.6714031971580816, "grad_norm": 0.4402031076371474, "learning_rate": 7.226854887476629e-06, "loss": 0.049, "step": 1882 }, { "epoch": 1.6722912966252221, "grad_norm": 0.46233583500915776, "learning_rate": 7.223384180824881e-06, "loss": 0.0594, "step": 1883 }, { "epoch": 1.6731793960923622, "grad_norm": 0.3974887552165766, "learning_rate": 7.219912138312881e-06, "loss": 0.0421, "step": 1884 }, { "epoch": 1.6740674955595027, "grad_norm": 0.36712835030146823, "learning_rate": 7.216438762026714e-06, "loss": 0.0452, "step": 1885 }, { "epoch": 1.6749555950266428, "grad_norm": 0.3917454645087735, "learning_rate": 7.212964054053259e-06, "loss": 0.0463, "step": 1886 }, { "epoch": 1.6758436944937833, "grad_norm": 0.32488984173426766, "learning_rate": 7.2094880164802074e-06, "loss": 0.0474, "step": 1887 }, { "epoch": 1.6767317939609236, "grad_norm": 0.4632793391754857, "learning_rate": 7.2060106513960395e-06, "loss": 0.0523, "step": 1888 }, { "epoch": 1.677619893428064, "grad_norm": 0.5034328004717088, "learning_rate": 7.202531960890033e-06, "loss": 0.0447, "step": 1889 }, { "epoch": 1.6785079928952042, "grad_norm": 0.42110845571449806, "learning_rate": 7.19905194705227e-06, "loss": 0.0406, "step": 1890 }, { "epoch": 1.6793960923623446, "grad_norm": 0.4171462159066183, "learning_rate": 7.195570611973619e-06, "loss": 0.0446, "step": 1891 }, { "epoch": 1.6802841918294849, "grad_norm": 0.4111064851477334, "learning_rate": 7.192087957745748e-06, "loss": 0.0527, "step": 1892 }, { "epoch": 1.6811722912966252, "grad_norm": 0.36207107670250294, "learning_rate": 7.1886039864611145e-06, "loss": 0.0359, "step": 1893 }, { "epoch": 1.6820603907637657, "grad_norm": 0.5772595367079205, "learning_rate": 7.18511870021297e-06, "loss": 0.0499, "step": 1894 }, { "epoch": 1.6829484902309058, "grad_norm": 0.4735776014638674, "learning_rate": 7.1816321010953525e-06, "loss": 0.0501, "step": 1895 }, { "epoch": 1.6838365896980463, "grad_norm": 0.547770694294487, "learning_rate": 7.178144191203091e-06, "loss": 0.0473, "step": 1896 }, { "epoch": 1.6847246891651864, "grad_norm": 0.46026886801118827, "learning_rate": 7.1746549726318025e-06, "loss": 0.058, "step": 1897 }, { "epoch": 1.6856127886323269, "grad_norm": 0.4277151288632358, "learning_rate": 7.17116444747789e-06, "loss": 0.0402, "step": 1898 }, { "epoch": 1.6865008880994672, "grad_norm": 0.3679088895246344, "learning_rate": 7.1676726178385415e-06, "loss": 0.0437, "step": 1899 }, { "epoch": 1.6873889875666075, "grad_norm": 0.5011965656365193, "learning_rate": 7.164179485811728e-06, "loss": 0.0457, "step": 1900 }, { "epoch": 1.6882770870337478, "grad_norm": 0.42516927463117626, "learning_rate": 7.160685053496201e-06, "loss": 0.045, "step": 1901 }, { "epoch": 1.689165186500888, "grad_norm": 0.5231918792756166, "learning_rate": 7.157189322991499e-06, "loss": 0.053, "step": 1902 }, { "epoch": 1.6900532859680284, "grad_norm": 0.485304561864495, "learning_rate": 7.153692296397936e-06, "loss": 0.061, "step": 1903 }, { "epoch": 1.6909413854351687, "grad_norm": 0.45690268485356994, "learning_rate": 7.150193975816606e-06, "loss": 0.0476, "step": 1904 }, { "epoch": 1.6918294849023092, "grad_norm": 0.40781644672009093, "learning_rate": 7.146694363349378e-06, "loss": 0.0573, "step": 1905 }, { "epoch": 1.6927175843694493, "grad_norm": 0.8996341745938609, "learning_rate": 7.1431934610989025e-06, "loss": 0.0757, "step": 1906 }, { "epoch": 1.6936056838365898, "grad_norm": 0.33465511049130725, "learning_rate": 7.139691271168601e-06, "loss": 0.0412, "step": 1907 }, { "epoch": 1.69449378330373, "grad_norm": 0.4006689443792775, "learning_rate": 7.136187795662669e-06, "loss": 0.0554, "step": 1908 }, { "epoch": 1.6953818827708704, "grad_norm": 0.7051429296225967, "learning_rate": 7.132683036686076e-06, "loss": 0.0507, "step": 1909 }, { "epoch": 1.6962699822380105, "grad_norm": 0.4855643688496919, "learning_rate": 7.129176996344561e-06, "loss": 0.0574, "step": 1910 }, { "epoch": 1.697158081705151, "grad_norm": 0.39594997967435885, "learning_rate": 7.125669676744633e-06, "loss": 0.0492, "step": 1911 }, { "epoch": 1.6980461811722913, "grad_norm": 0.4612011985894259, "learning_rate": 7.122161079993572e-06, "loss": 0.0581, "step": 1912 }, { "epoch": 1.6989342806394316, "grad_norm": 0.4641660554767729, "learning_rate": 7.1186512081994205e-06, "loss": 0.052, "step": 1913 }, { "epoch": 1.699822380106572, "grad_norm": 0.41522639372046943, "learning_rate": 7.1151400634709935e-06, "loss": 0.0536, "step": 1914 }, { "epoch": 1.7007104795737122, "grad_norm": 0.4658818662135742, "learning_rate": 7.111627647917865e-06, "loss": 0.0511, "step": 1915 }, { "epoch": 1.7015985790408525, "grad_norm": 0.39128896391754786, "learning_rate": 7.108113963650376e-06, "loss": 0.048, "step": 1916 }, { "epoch": 1.7024866785079928, "grad_norm": 0.4359605153766341, "learning_rate": 7.104599012779627e-06, "loss": 0.0559, "step": 1917 }, { "epoch": 1.7033747779751334, "grad_norm": 0.493215315257364, "learning_rate": 7.101082797417482e-06, "loss": 0.0576, "step": 1918 }, { "epoch": 1.7042628774422734, "grad_norm": 0.4323465077548411, "learning_rate": 7.0975653196765656e-06, "loss": 0.0524, "step": 1919 }, { "epoch": 1.705150976909414, "grad_norm": 0.5104717566429638, "learning_rate": 7.094046581670258e-06, "loss": 0.0583, "step": 1920 }, { "epoch": 1.706039076376554, "grad_norm": 0.4148091979839278, "learning_rate": 7.090526585512696e-06, "loss": 0.0395, "step": 1921 }, { "epoch": 1.7069271758436946, "grad_norm": 0.40257516478991223, "learning_rate": 7.087005333318777e-06, "loss": 0.0425, "step": 1922 }, { "epoch": 1.7078152753108347, "grad_norm": 0.4167751143461822, "learning_rate": 7.083482827204148e-06, "loss": 0.0517, "step": 1923 }, { "epoch": 1.7087033747779752, "grad_norm": 0.45737278550364424, "learning_rate": 7.0799590692852126e-06, "loss": 0.0479, "step": 1924 }, { "epoch": 1.7095914742451155, "grad_norm": 0.717527412678889, "learning_rate": 7.076434061679125e-06, "loss": 0.0587, "step": 1925 }, { "epoch": 1.7104795737122558, "grad_norm": 0.40188478381671133, "learning_rate": 7.072907806503792e-06, "loss": 0.0464, "step": 1926 }, { "epoch": 1.711367673179396, "grad_norm": 0.353119606059219, "learning_rate": 7.0693803058778646e-06, "loss": 0.0429, "step": 1927 }, { "epoch": 1.7122557726465364, "grad_norm": 0.461018127057418, "learning_rate": 7.065851561920751e-06, "loss": 0.0444, "step": 1928 }, { "epoch": 1.7131438721136767, "grad_norm": 0.4038407291080364, "learning_rate": 7.0623215767526e-06, "loss": 0.0433, "step": 1929 }, { "epoch": 1.714031971580817, "grad_norm": 0.4481265980646673, "learning_rate": 7.058790352494307e-06, "loss": 0.0525, "step": 1930 }, { "epoch": 1.7149200710479575, "grad_norm": 0.4927074025855776, "learning_rate": 7.0552578912675115e-06, "loss": 0.0645, "step": 1931 }, { "epoch": 1.7158081705150976, "grad_norm": 0.3051738955531093, "learning_rate": 7.0517241951946e-06, "loss": 0.0425, "step": 1932 }, { "epoch": 1.7166962699822381, "grad_norm": 0.4909899725260639, "learning_rate": 7.048189266398696e-06, "loss": 0.0543, "step": 1933 }, { "epoch": 1.7175843694493782, "grad_norm": 0.4121445195542596, "learning_rate": 7.044653107003668e-06, "loss": 0.054, "step": 1934 }, { "epoch": 1.7184724689165187, "grad_norm": 0.7287238660340769, "learning_rate": 7.04111571913412e-06, "loss": 0.0518, "step": 1935 }, { "epoch": 1.7193605683836588, "grad_norm": 0.37118408041840495, "learning_rate": 7.037577104915396e-06, "loss": 0.0498, "step": 1936 }, { "epoch": 1.7202486678507993, "grad_norm": 0.3625408648841396, "learning_rate": 7.034037266473578e-06, "loss": 0.0441, "step": 1937 }, { "epoch": 1.7211367673179396, "grad_norm": 0.430633704859871, "learning_rate": 7.0304962059354805e-06, "loss": 0.0469, "step": 1938 }, { "epoch": 1.72202486678508, "grad_norm": 0.3885989041574132, "learning_rate": 7.026953925428655e-06, "loss": 0.0524, "step": 1939 }, { "epoch": 1.7229129662522202, "grad_norm": 0.5726371868643075, "learning_rate": 7.023410427081384e-06, "loss": 0.0509, "step": 1940 }, { "epoch": 1.7238010657193605, "grad_norm": 0.45585758969960544, "learning_rate": 7.0198657130226835e-06, "loss": 0.0584, "step": 1941 }, { "epoch": 1.7246891651865008, "grad_norm": 0.4167656428047505, "learning_rate": 7.0163197853822975e-06, "loss": 0.0628, "step": 1942 }, { "epoch": 1.7255772646536411, "grad_norm": 0.4024300064639876, "learning_rate": 7.0127726462907035e-06, "loss": 0.0479, "step": 1943 }, { "epoch": 1.7264653641207817, "grad_norm": 0.44392765991101496, "learning_rate": 7.0092242978791026e-06, "loss": 0.0487, "step": 1944 }, { "epoch": 1.7273534635879217, "grad_norm": 0.4668187295820353, "learning_rate": 7.005674742279423e-06, "loss": 0.0463, "step": 1945 }, { "epoch": 1.7282415630550623, "grad_norm": 0.7327420324761663, "learning_rate": 7.00212398162432e-06, "loss": 0.0518, "step": 1946 }, { "epoch": 1.7291296625222023, "grad_norm": 0.443942234501367, "learning_rate": 6.998572018047174e-06, "loss": 0.044, "step": 1947 }, { "epoch": 1.7300177619893429, "grad_norm": 0.4038559181234273, "learning_rate": 6.995018853682083e-06, "loss": 0.0439, "step": 1948 }, { "epoch": 1.7309058614564832, "grad_norm": 0.6384867272348951, "learning_rate": 6.991464490663871e-06, "loss": 0.0636, "step": 1949 }, { "epoch": 1.7317939609236235, "grad_norm": 0.4065243718832035, "learning_rate": 6.9879089311280815e-06, "loss": 0.0527, "step": 1950 }, { "epoch": 1.7326820603907638, "grad_norm": 0.45005679129742787, "learning_rate": 6.9843521772109765e-06, "loss": 0.0516, "step": 1951 }, { "epoch": 1.733570159857904, "grad_norm": 0.4468306498195739, "learning_rate": 6.980794231049534e-06, "loss": 0.0591, "step": 1952 }, { "epoch": 1.7344582593250444, "grad_norm": 0.3841813900716927, "learning_rate": 6.977235094781452e-06, "loss": 0.0385, "step": 1953 }, { "epoch": 1.7353463587921847, "grad_norm": 0.5905425816395963, "learning_rate": 6.973674770545138e-06, "loss": 0.0571, "step": 1954 }, { "epoch": 1.7362344582593252, "grad_norm": 0.41403831422730447, "learning_rate": 6.970113260479719e-06, "loss": 0.0619, "step": 1955 }, { "epoch": 1.7371225577264653, "grad_norm": 0.6090544014088589, "learning_rate": 6.96655056672503e-06, "loss": 0.048, "step": 1956 }, { "epoch": 1.7380106571936058, "grad_norm": 0.5193654110281936, "learning_rate": 6.962986691421623e-06, "loss": 0.0549, "step": 1957 }, { "epoch": 1.738898756660746, "grad_norm": 0.44306594429871254, "learning_rate": 6.959421636710751e-06, "loss": 0.0449, "step": 1958 }, { "epoch": 1.7397868561278864, "grad_norm": 0.3545538930856097, "learning_rate": 6.955855404734384e-06, "loss": 0.0411, "step": 1959 }, { "epoch": 1.7406749555950265, "grad_norm": 0.41983040586963427, "learning_rate": 6.952287997635195e-06, "loss": 0.0437, "step": 1960 }, { "epoch": 1.741563055062167, "grad_norm": 0.43214311364759633, "learning_rate": 6.9487194175565655e-06, "loss": 0.0405, "step": 1961 }, { "epoch": 1.7424511545293073, "grad_norm": 0.4423198943447773, "learning_rate": 6.945149666642579e-06, "loss": 0.0564, "step": 1962 }, { "epoch": 1.7433392539964476, "grad_norm": 0.5594861180326316, "learning_rate": 6.941578747038024e-06, "loss": 0.0661, "step": 1963 }, { "epoch": 1.744227353463588, "grad_norm": 0.6452081661120449, "learning_rate": 6.93800666088839e-06, "loss": 0.0732, "step": 1964 }, { "epoch": 1.7451154529307282, "grad_norm": 0.3920723014733711, "learning_rate": 6.934433410339869e-06, "loss": 0.0445, "step": 1965 }, { "epoch": 1.7460035523978685, "grad_norm": 0.40025988359791975, "learning_rate": 6.9308589975393535e-06, "loss": 0.0482, "step": 1966 }, { "epoch": 1.7468916518650088, "grad_norm": 0.3779755023052511, "learning_rate": 6.9272834246344325e-06, "loss": 0.0521, "step": 1967 }, { "epoch": 1.7477797513321494, "grad_norm": 0.4762875832461135, "learning_rate": 6.923706693773388e-06, "loss": 0.0579, "step": 1968 }, { "epoch": 1.7486678507992894, "grad_norm": 0.41862535081175484, "learning_rate": 6.9201288071052074e-06, "loss": 0.0538, "step": 1969 }, { "epoch": 1.74955595026643, "grad_norm": 0.4359239583319757, "learning_rate": 6.916549766779564e-06, "loss": 0.048, "step": 1970 }, { "epoch": 1.75044404973357, "grad_norm": 0.47230583650019586, "learning_rate": 6.912969574946829e-06, "loss": 0.0457, "step": 1971 }, { "epoch": 1.7513321492007106, "grad_norm": 0.5066145503687762, "learning_rate": 6.909388233758063e-06, "loss": 0.0569, "step": 1972 }, { "epoch": 1.7522202486678506, "grad_norm": 0.5125342759701539, "learning_rate": 6.9058057453650195e-06, "loss": 0.0518, "step": 1973 }, { "epoch": 1.7531083481349912, "grad_norm": 0.34865490084429174, "learning_rate": 6.902222111920136e-06, "loss": 0.0385, "step": 1974 }, { "epoch": 1.7539964476021315, "grad_norm": 0.37974407165365953, "learning_rate": 6.8986373355765464e-06, "loss": 0.0613, "step": 1975 }, { "epoch": 1.7548845470692718, "grad_norm": 0.4271229747862285, "learning_rate": 6.895051418488064e-06, "loss": 0.058, "step": 1976 }, { "epoch": 1.755772646536412, "grad_norm": 0.8497766043435947, "learning_rate": 6.89146436280919e-06, "loss": 0.0662, "step": 1977 }, { "epoch": 1.7566607460035524, "grad_norm": 0.3620990172114698, "learning_rate": 6.88787617069511e-06, "loss": 0.0422, "step": 1978 }, { "epoch": 1.7575488454706927, "grad_norm": 0.39493366973231053, "learning_rate": 6.884286844301693e-06, "loss": 0.0424, "step": 1979 }, { "epoch": 1.758436944937833, "grad_norm": 0.4773129835930439, "learning_rate": 6.880696385785488e-06, "loss": 0.0665, "step": 1980 }, { "epoch": 1.7593250444049735, "grad_norm": 0.5921073490200456, "learning_rate": 6.877104797303725e-06, "loss": 0.0469, "step": 1981 }, { "epoch": 1.7602131438721136, "grad_norm": 0.5994443271449283, "learning_rate": 6.873512081014313e-06, "loss": 0.0561, "step": 1982 }, { "epoch": 1.761101243339254, "grad_norm": 0.3903562920098094, "learning_rate": 6.869918239075838e-06, "loss": 0.046, "step": 1983 }, { "epoch": 1.7619893428063942, "grad_norm": 0.4998592198797589, "learning_rate": 6.866323273647564e-06, "loss": 0.0463, "step": 1984 }, { "epoch": 1.7628774422735347, "grad_norm": 0.5220993731117153, "learning_rate": 6.8627271868894275e-06, "loss": 0.0499, "step": 1985 }, { "epoch": 1.7637655417406748, "grad_norm": 0.3641182003004395, "learning_rate": 6.85912998096204e-06, "loss": 0.0472, "step": 1986 }, { "epoch": 1.7646536412078153, "grad_norm": 0.40271526098467064, "learning_rate": 6.855531658026684e-06, "loss": 0.0461, "step": 1987 }, { "epoch": 1.7655417406749556, "grad_norm": 0.4682792796060245, "learning_rate": 6.851932220245318e-06, "loss": 0.0503, "step": 1988 }, { "epoch": 1.766429840142096, "grad_norm": 0.5012466280614905, "learning_rate": 6.848331669780564e-06, "loss": 0.0527, "step": 1989 }, { "epoch": 1.7673179396092362, "grad_norm": 0.3614697970514256, "learning_rate": 6.844730008795716e-06, "loss": 0.0438, "step": 1990 }, { "epoch": 1.7682060390763765, "grad_norm": 0.3710877973383125, "learning_rate": 6.841127239454737e-06, "loss": 0.0499, "step": 1991 }, { "epoch": 1.7690941385435168, "grad_norm": 0.3799847731257121, "learning_rate": 6.837523363922249e-06, "loss": 0.0411, "step": 1992 }, { "epoch": 1.7699822380106571, "grad_norm": 0.4581509832635446, "learning_rate": 6.833918384363547e-06, "loss": 0.0485, "step": 1993 }, { "epoch": 1.7708703374777977, "grad_norm": 0.3645272434565814, "learning_rate": 6.830312302944584e-06, "loss": 0.0416, "step": 1994 }, { "epoch": 1.7717584369449377, "grad_norm": 0.4095467817591859, "learning_rate": 6.8267051218319766e-06, "loss": 0.0484, "step": 1995 }, { "epoch": 1.7726465364120783, "grad_norm": 0.4758110810972315, "learning_rate": 6.823096843193e-06, "loss": 0.0494, "step": 1996 }, { "epoch": 1.7735346358792183, "grad_norm": 0.44174361579169513, "learning_rate": 6.819487469195596e-06, "loss": 0.0528, "step": 1997 }, { "epoch": 1.7744227353463589, "grad_norm": 0.3729174949891858, "learning_rate": 6.815877002008354e-06, "loss": 0.0471, "step": 1998 }, { "epoch": 1.7753108348134992, "grad_norm": 0.33549672535848557, "learning_rate": 6.8122654438005275e-06, "loss": 0.0453, "step": 1999 }, { "epoch": 1.7761989342806395, "grad_norm": 0.46990029943424805, "learning_rate": 6.808652796742025e-06, "loss": 0.0597, "step": 2000 }, { "epoch": 1.7770870337477798, "grad_norm": 0.35477533288410923, "learning_rate": 6.805039063003407e-06, "loss": 0.0435, "step": 2001 }, { "epoch": 1.77797513321492, "grad_norm": 0.38951162771881537, "learning_rate": 6.8014242447558855e-06, "loss": 0.0489, "step": 2002 }, { "epoch": 1.7788632326820604, "grad_norm": 0.3582600205481822, "learning_rate": 6.797808344171329e-06, "loss": 0.0509, "step": 2003 }, { "epoch": 1.7797513321492007, "grad_norm": 0.5422647873950133, "learning_rate": 6.794191363422251e-06, "loss": 0.0505, "step": 2004 }, { "epoch": 1.7806394316163412, "grad_norm": 0.4918493546856528, "learning_rate": 6.79057330468182e-06, "loss": 0.058, "step": 2005 }, { "epoch": 1.7815275310834813, "grad_norm": 0.40404782312769294, "learning_rate": 6.786954170123846e-06, "loss": 0.0503, "step": 2006 }, { "epoch": 1.7824156305506218, "grad_norm": 0.38173411664949003, "learning_rate": 6.78333396192279e-06, "loss": 0.0455, "step": 2007 }, { "epoch": 1.7833037300177619, "grad_norm": 0.42275908947777346, "learning_rate": 6.779712682253754e-06, "loss": 0.0559, "step": 2008 }, { "epoch": 1.7841918294849024, "grad_norm": 0.5017305533981338, "learning_rate": 6.776090333292488e-06, "loss": 0.055, "step": 2009 }, { "epoch": 1.7850799289520425, "grad_norm": 0.4385039129993016, "learning_rate": 6.77246691721538e-06, "loss": 0.0474, "step": 2010 }, { "epoch": 1.785968028419183, "grad_norm": 0.39754305034029586, "learning_rate": 6.768842436199464e-06, "loss": 0.0445, "step": 2011 }, { "epoch": 1.7868561278863233, "grad_norm": 0.5363354231611518, "learning_rate": 6.76521689242241e-06, "loss": 0.0507, "step": 2012 }, { "epoch": 1.7877442273534636, "grad_norm": 0.4426707834739782, "learning_rate": 6.76159028806253e-06, "loss": 0.0372, "step": 2013 }, { "epoch": 1.788632326820604, "grad_norm": 0.46176746277760417, "learning_rate": 6.757962625298769e-06, "loss": 0.0493, "step": 2014 }, { "epoch": 1.7895204262877442, "grad_norm": 0.38219017080049245, "learning_rate": 6.7543339063107084e-06, "loss": 0.0443, "step": 2015 }, { "epoch": 1.7904085257548845, "grad_norm": 0.39577290688408745, "learning_rate": 6.750704133278571e-06, "loss": 0.0413, "step": 2016 }, { "epoch": 1.7912966252220248, "grad_norm": 0.4745739444420137, "learning_rate": 6.747073308383203e-06, "loss": 0.0511, "step": 2017 }, { "epoch": 1.7921847246891653, "grad_norm": 0.374979497473119, "learning_rate": 6.7434414338060894e-06, "loss": 0.0425, "step": 2018 }, { "epoch": 1.7930728241563054, "grad_norm": 0.549508948054823, "learning_rate": 6.739808511729343e-06, "loss": 0.056, "step": 2019 }, { "epoch": 1.793960923623446, "grad_norm": 0.4856860394946529, "learning_rate": 6.73617454433571e-06, "loss": 0.0502, "step": 2020 }, { "epoch": 1.794849023090586, "grad_norm": 0.4700483210304012, "learning_rate": 6.732539533808556e-06, "loss": 0.0551, "step": 2021 }, { "epoch": 1.7957371225577266, "grad_norm": 0.45411948001388186, "learning_rate": 6.7289034823318825e-06, "loss": 0.0509, "step": 2022 }, { "epoch": 1.7966252220248666, "grad_norm": 0.4323490848451854, "learning_rate": 6.7252663920903105e-06, "loss": 0.0545, "step": 2023 }, { "epoch": 1.7975133214920072, "grad_norm": 0.6001627010374895, "learning_rate": 6.72162826526909e-06, "loss": 0.0507, "step": 2024 }, { "epoch": 1.7984014209591475, "grad_norm": 0.4422196101485241, "learning_rate": 6.717989104054089e-06, "loss": 0.0465, "step": 2025 }, { "epoch": 1.7992895204262878, "grad_norm": 0.39566100815958044, "learning_rate": 6.7143489106318e-06, "loss": 0.0439, "step": 2026 }, { "epoch": 1.800177619893428, "grad_norm": 0.39363120698285337, "learning_rate": 6.710707687189335e-06, "loss": 0.0402, "step": 2027 }, { "epoch": 1.8010657193605684, "grad_norm": 0.5793439239998182, "learning_rate": 6.707065435914423e-06, "loss": 0.0725, "step": 2028 }, { "epoch": 1.8019538188277087, "grad_norm": 0.4782131052607772, "learning_rate": 6.703422158995413e-06, "loss": 0.0584, "step": 2029 }, { "epoch": 1.802841918294849, "grad_norm": 0.5628758094136977, "learning_rate": 6.699777858621271e-06, "loss": 0.0567, "step": 2030 }, { "epoch": 1.8037300177619895, "grad_norm": 0.36581802683215386, "learning_rate": 6.6961325369815736e-06, "loss": 0.0436, "step": 2031 }, { "epoch": 1.8046181172291296, "grad_norm": 0.45615475074720174, "learning_rate": 6.6924861962665166e-06, "loss": 0.043, "step": 2032 }, { "epoch": 1.80550621669627, "grad_norm": 0.49028151822505167, "learning_rate": 6.688838838666902e-06, "loss": 0.0422, "step": 2033 }, { "epoch": 1.8063943161634102, "grad_norm": 0.41650683109921516, "learning_rate": 6.685190466374149e-06, "loss": 0.042, "step": 2034 }, { "epoch": 1.8072824156305507, "grad_norm": 0.4214028572855726, "learning_rate": 6.681541081580281e-06, "loss": 0.0429, "step": 2035 }, { "epoch": 1.8081705150976908, "grad_norm": 0.30651081747774706, "learning_rate": 6.6778906864779345e-06, "loss": 0.037, "step": 2036 }, { "epoch": 1.8090586145648313, "grad_norm": 0.6446685900959485, "learning_rate": 6.674239283260347e-06, "loss": 0.0553, "step": 2037 }, { "epoch": 1.8099467140319716, "grad_norm": 0.3543614113946352, "learning_rate": 6.670586874121369e-06, "loss": 0.0394, "step": 2038 }, { "epoch": 1.810834813499112, "grad_norm": 0.3618617037372615, "learning_rate": 6.666933461255451e-06, "loss": 0.0444, "step": 2039 }, { "epoch": 1.8117229129662522, "grad_norm": 0.4245914746715078, "learning_rate": 6.663279046857647e-06, "loss": 0.0481, "step": 2040 }, { "epoch": 1.8126110124333925, "grad_norm": 0.5403697850119982, "learning_rate": 6.659623633123613e-06, "loss": 0.0483, "step": 2041 }, { "epoch": 1.8134991119005328, "grad_norm": 0.5928311112067389, "learning_rate": 6.655967222249606e-06, "loss": 0.0547, "step": 2042 }, { "epoch": 1.8143872113676731, "grad_norm": 0.36020724361911344, "learning_rate": 6.65230981643248e-06, "loss": 0.043, "step": 2043 }, { "epoch": 1.8152753108348136, "grad_norm": 0.4390963443853013, "learning_rate": 6.648651417869688e-06, "loss": 0.0478, "step": 2044 }, { "epoch": 1.8161634103019537, "grad_norm": 0.4233236483400129, "learning_rate": 6.644992028759283e-06, "loss": 0.0523, "step": 2045 }, { "epoch": 1.8170515097690942, "grad_norm": 0.4450804724945297, "learning_rate": 6.641331651299906e-06, "loss": 0.053, "step": 2046 }, { "epoch": 1.8179396092362343, "grad_norm": 0.36181807383807113, "learning_rate": 6.6376702876908e-06, "loss": 0.0464, "step": 2047 }, { "epoch": 1.8188277087033748, "grad_norm": 0.33676684376312366, "learning_rate": 6.634007940131794e-06, "loss": 0.0458, "step": 2048 }, { "epoch": 1.8197158081705151, "grad_norm": 0.3737934284503229, "learning_rate": 6.63034461082331e-06, "loss": 0.0375, "step": 2049 }, { "epoch": 1.8206039076376554, "grad_norm": 0.3832541594396424, "learning_rate": 6.6266803019663604e-06, "loss": 0.0459, "step": 2050 }, { "epoch": 1.8214920071047958, "grad_norm": 0.5256262091397433, "learning_rate": 6.623015015762547e-06, "loss": 0.0504, "step": 2051 }, { "epoch": 1.822380106571936, "grad_norm": 0.33840402093820304, "learning_rate": 6.619348754414059e-06, "loss": 0.0371, "step": 2052 }, { "epoch": 1.8232682060390764, "grad_norm": 0.39633904955040034, "learning_rate": 6.615681520123667e-06, "loss": 0.0472, "step": 2053 }, { "epoch": 1.8241563055062167, "grad_norm": 0.4642800671979631, "learning_rate": 6.612013315094733e-06, "loss": 0.0492, "step": 2054 }, { "epoch": 1.8250444049733572, "grad_norm": 0.4105829155304549, "learning_rate": 6.608344141531196e-06, "loss": 0.041, "step": 2055 }, { "epoch": 1.8259325044404973, "grad_norm": 0.37517066733437515, "learning_rate": 6.604674001637582e-06, "loss": 0.0466, "step": 2056 }, { "epoch": 1.8268206039076378, "grad_norm": 0.3971195649011647, "learning_rate": 6.601002897618993e-06, "loss": 0.0402, "step": 2057 }, { "epoch": 1.8277087033747779, "grad_norm": 0.4211811563422759, "learning_rate": 6.597330831681115e-06, "loss": 0.0527, "step": 2058 }, { "epoch": 1.8285968028419184, "grad_norm": 0.4177644253074288, "learning_rate": 6.593657806030208e-06, "loss": 0.0524, "step": 2059 }, { "epoch": 1.8294849023090585, "grad_norm": 0.5205344156293021, "learning_rate": 6.589983822873112e-06, "loss": 0.0421, "step": 2060 }, { "epoch": 1.830373001776199, "grad_norm": 0.3836569190869212, "learning_rate": 6.586308884417236e-06, "loss": 0.0439, "step": 2061 }, { "epoch": 1.8312611012433393, "grad_norm": 0.4430535990838388, "learning_rate": 6.582632992870571e-06, "loss": 0.0481, "step": 2062 }, { "epoch": 1.8321492007104796, "grad_norm": 0.39208662605247335, "learning_rate": 6.5789561504416764e-06, "loss": 0.0463, "step": 2063 }, { "epoch": 1.83303730017762, "grad_norm": 0.3879886155905086, "learning_rate": 6.575278359339685e-06, "loss": 0.0416, "step": 2064 }, { "epoch": 1.8339253996447602, "grad_norm": 0.46572152087202695, "learning_rate": 6.571599621774294e-06, "loss": 0.0522, "step": 2065 }, { "epoch": 1.8348134991119005, "grad_norm": 0.45785826870440977, "learning_rate": 6.567919939955778e-06, "loss": 0.0525, "step": 2066 }, { "epoch": 1.8357015985790408, "grad_norm": 0.37121700029217813, "learning_rate": 6.5642393160949715e-06, "loss": 0.046, "step": 2067 }, { "epoch": 1.8365896980461813, "grad_norm": 0.527652281455598, "learning_rate": 6.560557752403277e-06, "loss": 0.0389, "step": 2068 }, { "epoch": 1.8374777975133214, "grad_norm": 0.5310334008842542, "learning_rate": 6.5568752510926645e-06, "loss": 0.0489, "step": 2069 }, { "epoch": 1.838365896980462, "grad_norm": 0.4960897779274944, "learning_rate": 6.553191814375667e-06, "loss": 0.0501, "step": 2070 }, { "epoch": 1.839253996447602, "grad_norm": 0.3867255698905068, "learning_rate": 6.5495074444653735e-06, "loss": 0.0436, "step": 2071 }, { "epoch": 1.8401420959147425, "grad_norm": 0.6419876218650102, "learning_rate": 6.545822143575441e-06, "loss": 0.0548, "step": 2072 }, { "epoch": 1.8410301953818826, "grad_norm": 0.501660018948477, "learning_rate": 6.54213591392008e-06, "loss": 0.0537, "step": 2073 }, { "epoch": 1.8419182948490231, "grad_norm": 0.514176055189614, "learning_rate": 6.538448757714066e-06, "loss": 0.0496, "step": 2074 }, { "epoch": 1.8428063943161634, "grad_norm": 0.40759921456099524, "learning_rate": 6.5347606771727245e-06, "loss": 0.0485, "step": 2075 }, { "epoch": 1.8436944937833037, "grad_norm": 0.48667752765541283, "learning_rate": 6.531071674511939e-06, "loss": 0.0529, "step": 2076 }, { "epoch": 1.844582593250444, "grad_norm": 0.6444952542424409, "learning_rate": 6.5273817519481474e-06, "loss": 0.0404, "step": 2077 }, { "epoch": 1.8454706927175843, "grad_norm": 0.36145831986300553, "learning_rate": 6.523690911698339e-06, "loss": 0.0455, "step": 2078 }, { "epoch": 1.8463587921847247, "grad_norm": 0.4402720883989673, "learning_rate": 6.519999155980058e-06, "loss": 0.0456, "step": 2079 }, { "epoch": 1.847246891651865, "grad_norm": 0.47197416916409135, "learning_rate": 6.516306487011393e-06, "loss": 0.0504, "step": 2080 }, { "epoch": 1.8481349911190055, "grad_norm": 0.43927904744559854, "learning_rate": 6.512612907010986e-06, "loss": 0.0438, "step": 2081 }, { "epoch": 1.8490230905861456, "grad_norm": 0.6033732389148113, "learning_rate": 6.508918418198023e-06, "loss": 0.0551, "step": 2082 }, { "epoch": 1.849911190053286, "grad_norm": 0.4731524764323916, "learning_rate": 6.50522302279224e-06, "loss": 0.0395, "step": 2083 }, { "epoch": 1.8507992895204262, "grad_norm": 0.3886459176710567, "learning_rate": 6.501526723013913e-06, "loss": 0.052, "step": 2084 }, { "epoch": 1.8516873889875667, "grad_norm": 0.42995434792805065, "learning_rate": 6.497829521083866e-06, "loss": 0.0534, "step": 2085 }, { "epoch": 1.8525754884547068, "grad_norm": 0.3770685946384062, "learning_rate": 6.494131419223461e-06, "loss": 0.037, "step": 2086 }, { "epoch": 1.8534635879218473, "grad_norm": 0.43816930145050054, "learning_rate": 6.490432419654602e-06, "loss": 0.0554, "step": 2087 }, { "epoch": 1.8543516873889876, "grad_norm": 0.37123944678402515, "learning_rate": 6.486732524599736e-06, "loss": 0.0415, "step": 2088 }, { "epoch": 1.855239786856128, "grad_norm": 0.37410095953505745, "learning_rate": 6.483031736281843e-06, "loss": 0.0476, "step": 2089 }, { "epoch": 1.8561278863232682, "grad_norm": 0.36333640435943326, "learning_rate": 6.47933005692444e-06, "loss": 0.049, "step": 2090 }, { "epoch": 1.8570159857904085, "grad_norm": 0.42352096688025875, "learning_rate": 6.475627488751583e-06, "loss": 0.0415, "step": 2091 }, { "epoch": 1.8579040852575488, "grad_norm": 0.34066396339303506, "learning_rate": 6.471924033987859e-06, "loss": 0.0441, "step": 2092 }, { "epoch": 1.858792184724689, "grad_norm": 0.361185901194101, "learning_rate": 6.46821969485839e-06, "loss": 0.0493, "step": 2093 }, { "epoch": 1.8596802841918296, "grad_norm": 0.5659850613591552, "learning_rate": 6.464514473588825e-06, "loss": 0.0547, "step": 2094 }, { "epoch": 1.8605683836589697, "grad_norm": 0.3700538488433121, "learning_rate": 6.46080837240535e-06, "loss": 0.0544, "step": 2095 }, { "epoch": 1.8614564831261102, "grad_norm": 0.37391369608280134, "learning_rate": 6.4571013935346724e-06, "loss": 0.0552, "step": 2096 }, { "epoch": 1.8623445825932503, "grad_norm": 0.4506848177748878, "learning_rate": 6.453393539204031e-06, "loss": 0.0467, "step": 2097 }, { "epoch": 1.8632326820603908, "grad_norm": 0.4760160455310682, "learning_rate": 6.449684811641191e-06, "loss": 0.0501, "step": 2098 }, { "epoch": 1.8641207815275311, "grad_norm": 0.630914626311758, "learning_rate": 6.44597521307444e-06, "loss": 0.0537, "step": 2099 }, { "epoch": 1.8650088809946714, "grad_norm": 0.4886407075177583, "learning_rate": 6.442264745732589e-06, "loss": 0.0467, "step": 2100 }, { "epoch": 1.8658969804618117, "grad_norm": 0.4667539894576772, "learning_rate": 6.438553411844976e-06, "loss": 0.0471, "step": 2101 }, { "epoch": 1.866785079928952, "grad_norm": 0.5111039869480317, "learning_rate": 6.4348412136414505e-06, "loss": 0.062, "step": 2102 }, { "epoch": 1.8676731793960923, "grad_norm": 0.331091498971716, "learning_rate": 6.431128153352389e-06, "loss": 0.0426, "step": 2103 }, { "epoch": 1.8685612788632326, "grad_norm": 0.5559235201845065, "learning_rate": 6.427414233208682e-06, "loss": 0.0527, "step": 2104 }, { "epoch": 1.8694493783303732, "grad_norm": 0.4727738717236463, "learning_rate": 6.423699455441741e-06, "loss": 0.0525, "step": 2105 }, { "epoch": 1.8703374777975132, "grad_norm": 0.5007264649411204, "learning_rate": 6.419983822283486e-06, "loss": 0.0494, "step": 2106 }, { "epoch": 1.8712255772646538, "grad_norm": 0.43477312242283966, "learning_rate": 6.4162673359663575e-06, "loss": 0.0459, "step": 2107 }, { "epoch": 1.8721136767317939, "grad_norm": 0.337940601269951, "learning_rate": 6.412549998723304e-06, "loss": 0.0468, "step": 2108 }, { "epoch": 1.8730017761989344, "grad_norm": 0.36885606967588846, "learning_rate": 6.408831812787788e-06, "loss": 0.0457, "step": 2109 }, { "epoch": 1.8738898756660745, "grad_norm": 0.42546400924864336, "learning_rate": 6.405112780393781e-06, "loss": 0.0445, "step": 2110 }, { "epoch": 1.874777975133215, "grad_norm": 0.4619266591193366, "learning_rate": 6.401392903775762e-06, "loss": 0.0459, "step": 2111 }, { "epoch": 1.8756660746003553, "grad_norm": 0.3894712290108777, "learning_rate": 6.39767218516872e-06, "loss": 0.0439, "step": 2112 }, { "epoch": 1.8765541740674956, "grad_norm": 0.485064306096109, "learning_rate": 6.393950626808145e-06, "loss": 0.0455, "step": 2113 }, { "epoch": 1.8774422735346359, "grad_norm": 0.3403998697098418, "learning_rate": 6.3902282309300375e-06, "loss": 0.0421, "step": 2114 }, { "epoch": 1.8783303730017762, "grad_norm": 0.3640300029190646, "learning_rate": 6.3865049997709e-06, "loss": 0.0428, "step": 2115 }, { "epoch": 1.8792184724689165, "grad_norm": 0.4462567654607275, "learning_rate": 6.38278093556773e-06, "loss": 0.0569, "step": 2116 }, { "epoch": 1.8801065719360568, "grad_norm": 0.5788823336040582, "learning_rate": 6.379056040558036e-06, "loss": 0.0481, "step": 2117 }, { "epoch": 1.8809946714031973, "grad_norm": 0.339464299182698, "learning_rate": 6.375330316979816e-06, "loss": 0.0365, "step": 2118 }, { "epoch": 1.8818827708703374, "grad_norm": 0.4054052815074723, "learning_rate": 6.3716037670715725e-06, "loss": 0.0423, "step": 2119 }, { "epoch": 1.882770870337478, "grad_norm": 0.4202419168351307, "learning_rate": 6.3678763930723034e-06, "loss": 0.0419, "step": 2120 }, { "epoch": 1.883658969804618, "grad_norm": 0.36138740873727293, "learning_rate": 6.364148197221499e-06, "loss": 0.0466, "step": 2121 }, { "epoch": 1.8845470692717585, "grad_norm": 0.39202449801589895, "learning_rate": 6.360419181759143e-06, "loss": 0.0448, "step": 2122 }, { "epoch": 1.8854351687388986, "grad_norm": 0.5250988754303926, "learning_rate": 6.3566893489257185e-06, "loss": 0.0499, "step": 2123 }, { "epoch": 1.8863232682060391, "grad_norm": 0.6469008346649571, "learning_rate": 6.352958700962191e-06, "loss": 0.061, "step": 2124 }, { "epoch": 1.8872113676731794, "grad_norm": 0.4581135569733313, "learning_rate": 6.349227240110019e-06, "loss": 0.0461, "step": 2125 }, { "epoch": 1.8880994671403197, "grad_norm": 0.8025399454101909, "learning_rate": 6.345494968611152e-06, "loss": 0.0512, "step": 2126 }, { "epoch": 1.88898756660746, "grad_norm": 0.44397943494301245, "learning_rate": 6.3417618887080245e-06, "loss": 0.0494, "step": 2127 }, { "epoch": 1.8898756660746003, "grad_norm": 0.6018102541173735, "learning_rate": 6.3380280026435535e-06, "loss": 0.0452, "step": 2128 }, { "epoch": 1.8907637655417406, "grad_norm": 0.4030835604751413, "learning_rate": 6.334293312661147e-06, "loss": 0.0353, "step": 2129 }, { "epoch": 1.891651865008881, "grad_norm": 0.39190175083569617, "learning_rate": 6.33055782100469e-06, "loss": 0.0473, "step": 2130 }, { "epoch": 1.8925399644760215, "grad_norm": 0.46273519259461704, "learning_rate": 6.3268215299185545e-06, "loss": 0.0404, "step": 2131 }, { "epoch": 1.8934280639431615, "grad_norm": 0.3962674432212313, "learning_rate": 6.323084441647586e-06, "loss": 0.0442, "step": 2132 }, { "epoch": 1.894316163410302, "grad_norm": 0.7545203804080743, "learning_rate": 6.319346558437116e-06, "loss": 0.055, "step": 2133 }, { "epoch": 1.8952042628774421, "grad_norm": 0.5390726237497478, "learning_rate": 6.3156078825329495e-06, "loss": 0.0473, "step": 2134 }, { "epoch": 1.8960923623445827, "grad_norm": 0.45025381579412616, "learning_rate": 6.311868416181367e-06, "loss": 0.0443, "step": 2135 }, { "epoch": 1.8969804618117228, "grad_norm": 0.4467246091737874, "learning_rate": 6.30812816162913e-06, "loss": 0.0386, "step": 2136 }, { "epoch": 1.8978685612788633, "grad_norm": 0.3908841207514594, "learning_rate": 6.3043871211234656e-06, "loss": 0.0384, "step": 2137 }, { "epoch": 1.8987566607460036, "grad_norm": 0.3987975718435107, "learning_rate": 6.300645296912078e-06, "loss": 0.0414, "step": 2138 }, { "epoch": 1.8996447602131439, "grad_norm": 0.5002406973767204, "learning_rate": 6.296902691243141e-06, "loss": 0.0588, "step": 2139 }, { "epoch": 1.9005328596802842, "grad_norm": 0.48860923921704486, "learning_rate": 6.2931593063653e-06, "loss": 0.0612, "step": 2140 }, { "epoch": 1.9014209591474245, "grad_norm": 0.3947407088728515, "learning_rate": 6.289415144527662e-06, "loss": 0.0454, "step": 2141 }, { "epoch": 1.9023090586145648, "grad_norm": 0.41230602007856854, "learning_rate": 6.2856702079798094e-06, "loss": 0.0401, "step": 2142 }, { "epoch": 1.903197158081705, "grad_norm": 0.33116293695159454, "learning_rate": 6.2819244989717845e-06, "loss": 0.0359, "step": 2143 }, { "epoch": 1.9040852575488456, "grad_norm": 0.4000104975546426, "learning_rate": 6.278178019754095e-06, "loss": 0.0486, "step": 2144 }, { "epoch": 1.9049733570159857, "grad_norm": 0.3731738103643593, "learning_rate": 6.274430772577712e-06, "loss": 0.049, "step": 2145 }, { "epoch": 1.9058614564831262, "grad_norm": 0.3572735108537658, "learning_rate": 6.270682759694069e-06, "loss": 0.0424, "step": 2146 }, { "epoch": 1.9067495559502663, "grad_norm": 0.42934308169002866, "learning_rate": 6.266933983355056e-06, "loss": 0.0436, "step": 2147 }, { "epoch": 1.9076376554174068, "grad_norm": 0.3632847206577638, "learning_rate": 6.263184445813024e-06, "loss": 0.0486, "step": 2148 }, { "epoch": 1.9085257548845471, "grad_norm": 0.3413940971429979, "learning_rate": 6.259434149320782e-06, "loss": 0.0385, "step": 2149 }, { "epoch": 1.9094138543516874, "grad_norm": 0.48469701880453253, "learning_rate": 6.255683096131595e-06, "loss": 0.0426, "step": 2150 }, { "epoch": 1.9103019538188277, "grad_norm": 0.3662388409090857, "learning_rate": 6.2519312884991805e-06, "loss": 0.0461, "step": 2151 }, { "epoch": 1.911190053285968, "grad_norm": 0.4539062927110461, "learning_rate": 6.2481787286777116e-06, "loss": 0.051, "step": 2152 }, { "epoch": 1.9120781527531083, "grad_norm": 0.35755853642933133, "learning_rate": 6.244425418921811e-06, "loss": 0.0435, "step": 2153 }, { "epoch": 1.9129662522202486, "grad_norm": 0.42878467561117417, "learning_rate": 6.240671361486555e-06, "loss": 0.0417, "step": 2154 }, { "epoch": 1.913854351687389, "grad_norm": 0.49409777074507605, "learning_rate": 6.2369165586274664e-06, "loss": 0.0487, "step": 2155 }, { "epoch": 1.9147424511545292, "grad_norm": 0.41199253087363025, "learning_rate": 6.233161012600518e-06, "loss": 0.0415, "step": 2156 }, { "epoch": 1.9156305506216698, "grad_norm": 0.405928965572566, "learning_rate": 6.229404725662126e-06, "loss": 0.046, "step": 2157 }, { "epoch": 1.9165186500888098, "grad_norm": 0.39740350515985995, "learning_rate": 6.225647700069158e-06, "loss": 0.0443, "step": 2158 }, { "epoch": 1.9174067495559504, "grad_norm": 0.4834426843479704, "learning_rate": 6.221889938078916e-06, "loss": 0.0529, "step": 2159 }, { "epoch": 1.9182948490230904, "grad_norm": 0.49548825776613825, "learning_rate": 6.218131441949151e-06, "loss": 0.0617, "step": 2160 }, { "epoch": 1.919182948490231, "grad_norm": 0.3498393151031275, "learning_rate": 6.214372213938056e-06, "loss": 0.0344, "step": 2161 }, { "epoch": 1.9200710479573713, "grad_norm": 0.33989027423396234, "learning_rate": 6.210612256304262e-06, "loss": 0.0378, "step": 2162 }, { "epoch": 1.9209591474245116, "grad_norm": 0.36163423760475416, "learning_rate": 6.206851571306832e-06, "loss": 0.0463, "step": 2163 }, { "epoch": 1.9218472468916519, "grad_norm": 0.656676665341038, "learning_rate": 6.2030901612052794e-06, "loss": 0.0504, "step": 2164 }, { "epoch": 1.9227353463587922, "grad_norm": 0.36680522804055904, "learning_rate": 6.199328028259542e-06, "loss": 0.0423, "step": 2165 }, { "epoch": 1.9236234458259325, "grad_norm": 0.38262720764696834, "learning_rate": 6.195565174729995e-06, "loss": 0.0399, "step": 2166 }, { "epoch": 1.9245115452930728, "grad_norm": 0.37453073228605865, "learning_rate": 6.191801602877448e-06, "loss": 0.0385, "step": 2167 }, { "epoch": 1.9253996447602133, "grad_norm": 0.4683233973955515, "learning_rate": 6.188037314963142e-06, "loss": 0.0475, "step": 2168 }, { "epoch": 1.9262877442273534, "grad_norm": 0.4439790066925214, "learning_rate": 6.184272313248747e-06, "loss": 0.0458, "step": 2169 }, { "epoch": 1.927175843694494, "grad_norm": 0.37081278036155074, "learning_rate": 6.1805065999963634e-06, "loss": 0.0386, "step": 2170 }, { "epoch": 1.928063943161634, "grad_norm": 0.47575224522252724, "learning_rate": 6.176740177468515e-06, "loss": 0.0443, "step": 2171 }, { "epoch": 1.9289520426287745, "grad_norm": 0.40954337004055674, "learning_rate": 6.172973047928159e-06, "loss": 0.0436, "step": 2172 }, { "epoch": 1.9298401420959146, "grad_norm": 0.3795071993422605, "learning_rate": 6.169205213638671e-06, "loss": 0.0392, "step": 2173 }, { "epoch": 1.9307282415630551, "grad_norm": 0.5057545520526439, "learning_rate": 6.165436676863853e-06, "loss": 0.0528, "step": 2174 }, { "epoch": 1.9316163410301954, "grad_norm": 0.3815045784778289, "learning_rate": 6.1616674398679275e-06, "loss": 0.0418, "step": 2175 }, { "epoch": 1.9325044404973357, "grad_norm": 0.4536496129499768, "learning_rate": 6.1578975049155395e-06, "loss": 0.0566, "step": 2176 }, { "epoch": 1.933392539964476, "grad_norm": 0.43901000822965575, "learning_rate": 6.154126874271753e-06, "loss": 0.0591, "step": 2177 }, { "epoch": 1.9342806394316163, "grad_norm": 0.35441921879257254, "learning_rate": 6.150355550202048e-06, "loss": 0.039, "step": 2178 }, { "epoch": 1.9351687388987566, "grad_norm": 0.34742959296044007, "learning_rate": 6.146583534972322e-06, "loss": 0.0336, "step": 2179 }, { "epoch": 1.936056838365897, "grad_norm": 0.3082263743464388, "learning_rate": 6.1428108308488934e-06, "loss": 0.0407, "step": 2180 }, { "epoch": 1.9369449378330375, "grad_norm": 0.39803484202659967, "learning_rate": 6.139037440098482e-06, "loss": 0.0518, "step": 2181 }, { "epoch": 1.9378330373001775, "grad_norm": 0.45351142859117755, "learning_rate": 6.135263364988233e-06, "loss": 0.0432, "step": 2182 }, { "epoch": 1.938721136767318, "grad_norm": 0.46724683801450034, "learning_rate": 6.1314886077856946e-06, "loss": 0.0605, "step": 2183 }, { "epoch": 1.9396092362344581, "grad_norm": 0.38730485735302783, "learning_rate": 6.12771317075883e-06, "loss": 0.0416, "step": 2184 }, { "epoch": 1.9404973357015987, "grad_norm": 0.42992171692362197, "learning_rate": 6.123937056176005e-06, "loss": 0.0389, "step": 2185 }, { "epoch": 1.9413854351687387, "grad_norm": 0.46672428369803093, "learning_rate": 6.120160266305999e-06, "loss": 0.0512, "step": 2186 }, { "epoch": 1.9422735346358793, "grad_norm": 0.4706018893641143, "learning_rate": 6.116382803417993e-06, "loss": 0.0444, "step": 2187 }, { "epoch": 1.9431616341030196, "grad_norm": 0.31767870316165925, "learning_rate": 6.112604669781572e-06, "loss": 0.0371, "step": 2188 }, { "epoch": 1.9440497335701599, "grad_norm": 0.3714468432315274, "learning_rate": 6.108825867666729e-06, "loss": 0.0491, "step": 2189 }, { "epoch": 1.9449378330373002, "grad_norm": 0.39023993760606873, "learning_rate": 6.1050463993438525e-06, "loss": 0.0566, "step": 2190 }, { "epoch": 1.9458259325044405, "grad_norm": 0.3731005928135524, "learning_rate": 6.101266267083732e-06, "loss": 0.0494, "step": 2191 }, { "epoch": 1.9467140319715808, "grad_norm": 0.3415655238659524, "learning_rate": 6.0974854731575615e-06, "loss": 0.037, "step": 2192 }, { "epoch": 1.947602131438721, "grad_norm": 0.334588217445017, "learning_rate": 6.093704019836927e-06, "loss": 0.0444, "step": 2193 }, { "epoch": 1.9484902309058616, "grad_norm": 0.42727978732015626, "learning_rate": 6.089921909393812e-06, "loss": 0.0448, "step": 2194 }, { "epoch": 1.9493783303730017, "grad_norm": 0.405971400722109, "learning_rate": 6.086139144100596e-06, "loss": 0.0445, "step": 2195 }, { "epoch": 1.9502664298401422, "grad_norm": 0.46345149540794633, "learning_rate": 6.082355726230052e-06, "loss": 0.0471, "step": 2196 }, { "epoch": 1.9511545293072823, "grad_norm": 0.3588792619562609, "learning_rate": 6.078571658055343e-06, "loss": 0.0473, "step": 2197 }, { "epoch": 1.9520426287744228, "grad_norm": 0.3619948040344296, "learning_rate": 6.074786941850022e-06, "loss": 0.0443, "step": 2198 }, { "epoch": 1.952930728241563, "grad_norm": 0.46003516648541753, "learning_rate": 6.0710015798880385e-06, "loss": 0.0505, "step": 2199 }, { "epoch": 1.9538188277087034, "grad_norm": 0.4616610153615785, "learning_rate": 6.067215574443721e-06, "loss": 0.042, "step": 2200 }, { "epoch": 1.9547069271758437, "grad_norm": 0.3263572711987352, "learning_rate": 6.06342892779179e-06, "loss": 0.0425, "step": 2201 }, { "epoch": 1.955595026642984, "grad_norm": 0.48978190553900625, "learning_rate": 6.059641642207349e-06, "loss": 0.0469, "step": 2202 }, { "epoch": 1.9564831261101243, "grad_norm": 0.36095080394233947, "learning_rate": 6.055853719965888e-06, "loss": 0.0468, "step": 2203 }, { "epoch": 1.9573712255772646, "grad_norm": 0.4383836524400736, "learning_rate": 6.052065163343274e-06, "loss": 0.0471, "step": 2204 }, { "epoch": 1.958259325044405, "grad_norm": 0.3432346045341955, "learning_rate": 6.048275974615763e-06, "loss": 0.0445, "step": 2205 }, { "epoch": 1.9591474245115452, "grad_norm": 0.4474572470530067, "learning_rate": 6.044486156059982e-06, "loss": 0.0483, "step": 2206 }, { "epoch": 1.9600355239786857, "grad_norm": 0.445184440401943, "learning_rate": 6.040695709952944e-06, "loss": 0.0427, "step": 2207 }, { "epoch": 1.9609236234458258, "grad_norm": 0.3769395422201313, "learning_rate": 6.036904638572035e-06, "loss": 0.0512, "step": 2208 }, { "epoch": 1.9618117229129663, "grad_norm": 0.42956204509434803, "learning_rate": 6.033112944195021e-06, "loss": 0.0422, "step": 2209 }, { "epoch": 1.9626998223801064, "grad_norm": 0.37218715861148377, "learning_rate": 6.029320629100034e-06, "loss": 0.0467, "step": 2210 }, { "epoch": 1.963587921847247, "grad_norm": 0.34080116710205716, "learning_rate": 6.0255276955655854e-06, "loss": 0.0376, "step": 2211 }, { "epoch": 1.9644760213143873, "grad_norm": 0.3214402294267105, "learning_rate": 6.021734145870558e-06, "loss": 0.0365, "step": 2212 }, { "epoch": 1.9653641207815276, "grad_norm": 0.3525755964550656, "learning_rate": 6.017939982294203e-06, "loss": 0.0423, "step": 2213 }, { "epoch": 1.9662522202486679, "grad_norm": 0.3905564210946439, "learning_rate": 6.014145207116141e-06, "loss": 0.0453, "step": 2214 }, { "epoch": 1.9671403197158082, "grad_norm": 0.4156069438990325, "learning_rate": 6.01034982261636e-06, "loss": 0.0492, "step": 2215 }, { "epoch": 1.9680284191829485, "grad_norm": 0.4916076670695481, "learning_rate": 6.006553831075215e-06, "loss": 0.0531, "step": 2216 }, { "epoch": 1.9689165186500888, "grad_norm": 0.4292406230475399, "learning_rate": 6.0027572347734216e-06, "loss": 0.0449, "step": 2217 }, { "epoch": 1.9698046181172293, "grad_norm": 0.4839747395829105, "learning_rate": 5.998960035992067e-06, "loss": 0.037, "step": 2218 }, { "epoch": 1.9706927175843694, "grad_norm": 0.39556897595819734, "learning_rate": 5.995162237012592e-06, "loss": 0.04, "step": 2219 }, { "epoch": 1.97158081705151, "grad_norm": 0.36822460637932836, "learning_rate": 5.991363840116799e-06, "loss": 0.0469, "step": 2220 }, { "epoch": 1.97246891651865, "grad_norm": 0.3956747417750124, "learning_rate": 5.98756484758686e-06, "loss": 0.0386, "step": 2221 }, { "epoch": 1.9733570159857905, "grad_norm": 0.3532515499098325, "learning_rate": 5.983765261705289e-06, "loss": 0.0414, "step": 2222 }, { "epoch": 1.9742451154529306, "grad_norm": 0.5035466196336941, "learning_rate": 5.979965084754968e-06, "loss": 0.0419, "step": 2223 }, { "epoch": 1.975133214920071, "grad_norm": 0.40702905054749317, "learning_rate": 5.9761643190191306e-06, "loss": 0.0466, "step": 2224 }, { "epoch": 1.9760213143872114, "grad_norm": 0.44802745275927625, "learning_rate": 5.9723629667813645e-06, "loss": 0.0393, "step": 2225 }, { "epoch": 1.9769094138543517, "grad_norm": 0.35762549891057027, "learning_rate": 5.968561030325606e-06, "loss": 0.0487, "step": 2226 }, { "epoch": 1.977797513321492, "grad_norm": 0.35013910891561983, "learning_rate": 5.964758511936149e-06, "loss": 0.0435, "step": 2227 }, { "epoch": 1.9786856127886323, "grad_norm": 0.34922191729065305, "learning_rate": 5.960955413897635e-06, "loss": 0.0466, "step": 2228 }, { "epoch": 1.9795737122557726, "grad_norm": 0.467416272653791, "learning_rate": 5.957151738495048e-06, "loss": 0.0485, "step": 2229 }, { "epoch": 1.980461811722913, "grad_norm": 0.5291197544552343, "learning_rate": 5.9533474880137276e-06, "loss": 0.0564, "step": 2230 }, { "epoch": 1.9813499111900534, "grad_norm": 0.43472102473020946, "learning_rate": 5.949542664739353e-06, "loss": 0.0516, "step": 2231 }, { "epoch": 1.9822380106571935, "grad_norm": 0.3577863252018631, "learning_rate": 5.94573727095795e-06, "loss": 0.0542, "step": 2232 }, { "epoch": 1.983126110124334, "grad_norm": 0.35555273165445755, "learning_rate": 5.941931308955885e-06, "loss": 0.0467, "step": 2233 }, { "epoch": 1.9840142095914741, "grad_norm": 0.4280711372451535, "learning_rate": 5.93812478101987e-06, "loss": 0.0505, "step": 2234 }, { "epoch": 1.9849023090586146, "grad_norm": 0.39287928024451396, "learning_rate": 5.9343176894369515e-06, "loss": 0.0423, "step": 2235 }, { "epoch": 1.9857904085257547, "grad_norm": 0.4262044489424622, "learning_rate": 5.93051003649452e-06, "loss": 0.0446, "step": 2236 }, { "epoch": 1.9866785079928952, "grad_norm": 0.42635191575274267, "learning_rate": 5.9267018244803e-06, "loss": 0.0448, "step": 2237 }, { "epoch": 1.9875666074600356, "grad_norm": 0.39101569632425154, "learning_rate": 5.9228930556823515e-06, "loss": 0.0449, "step": 2238 }, { "epoch": 1.9884547069271759, "grad_norm": 0.4917474281311356, "learning_rate": 5.9190837323890715e-06, "loss": 0.0505, "step": 2239 }, { "epoch": 1.9893428063943162, "grad_norm": 0.3954654184821005, "learning_rate": 5.915273856889189e-06, "loss": 0.0493, "step": 2240 }, { "epoch": 1.9902309058614565, "grad_norm": 0.3390129303412325, "learning_rate": 5.911463431471764e-06, "loss": 0.0404, "step": 2241 }, { "epoch": 1.9911190053285968, "grad_norm": 0.32144578820830605, "learning_rate": 5.907652458426187e-06, "loss": 0.0426, "step": 2242 }, { "epoch": 1.992007104795737, "grad_norm": 0.43165150207886394, "learning_rate": 5.903840940042179e-06, "loss": 0.0359, "step": 2243 }, { "epoch": 1.9928952042628776, "grad_norm": 0.4641684247626532, "learning_rate": 5.9000288786097885e-06, "loss": 0.0528, "step": 2244 }, { "epoch": 1.9937833037300177, "grad_norm": 0.41190342167954275, "learning_rate": 5.896216276419386e-06, "loss": 0.0486, "step": 2245 }, { "epoch": 1.9946714031971582, "grad_norm": 0.33937679320158426, "learning_rate": 5.8924031357616755e-06, "loss": 0.0381, "step": 2246 }, { "epoch": 1.9955595026642983, "grad_norm": 0.3670267910671915, "learning_rate": 5.888589458927677e-06, "loss": 0.0422, "step": 2247 }, { "epoch": 1.9964476021314388, "grad_norm": 0.3431648871652662, "learning_rate": 5.884775248208733e-06, "loss": 0.0378, "step": 2248 }, { "epoch": 1.997335701598579, "grad_norm": 0.6145972650758212, "learning_rate": 5.880960505896513e-06, "loss": 0.0664, "step": 2249 }, { "epoch": 1.9982238010657194, "grad_norm": 0.3733321388555608, "learning_rate": 5.8771452342829975e-06, "loss": 0.0439, "step": 2250 }, { "epoch": 1.9991119005328597, "grad_norm": 0.4197717737729959, "learning_rate": 5.873329435660492e-06, "loss": 0.0422, "step": 2251 }, { "epoch": 2.0, "grad_norm": 0.5081836328682018, "learning_rate": 5.8695131123216155e-06, "loss": 0.0504, "step": 2252 }, { "epoch": 2.0, "eval_loss": 0.04935871437191963, "eval_runtime": 78.0736, "eval_samples_per_second": 194.252, "eval_steps_per_second": 3.036, "step": 2252 }, { "epoch": 2.0008880994671405, "grad_norm": 0.7952094344327605, "learning_rate": 5.865696266559303e-06, "loss": 0.0427, "step": 2253 }, { "epoch": 2.0017761989342806, "grad_norm": 0.4823228734323676, "learning_rate": 5.861878900666801e-06, "loss": 0.0488, "step": 2254 }, { "epoch": 2.002664298401421, "grad_norm": 0.3501005673920316, "learning_rate": 5.858061016937673e-06, "loss": 0.048, "step": 2255 }, { "epoch": 2.003552397868561, "grad_norm": 0.4824957705288789, "learning_rate": 5.85424261766579e-06, "loss": 0.0533, "step": 2256 }, { "epoch": 2.0044404973357017, "grad_norm": 0.3562463152160258, "learning_rate": 5.850423705145334e-06, "loss": 0.0365, "step": 2257 }, { "epoch": 2.005328596802842, "grad_norm": 0.4361043063883756, "learning_rate": 5.846604281670795e-06, "loss": 0.0397, "step": 2258 }, { "epoch": 2.0062166962699823, "grad_norm": 0.4735692130528875, "learning_rate": 5.8427843495369715e-06, "loss": 0.0603, "step": 2259 }, { "epoch": 2.0071047957371224, "grad_norm": 0.3910962549697052, "learning_rate": 5.838963911038965e-06, "loss": 0.0343, "step": 2260 }, { "epoch": 2.007992895204263, "grad_norm": 0.4929446695376269, "learning_rate": 5.835142968472181e-06, "loss": 0.0426, "step": 2261 }, { "epoch": 2.008880994671403, "grad_norm": 0.40181116823087554, "learning_rate": 5.831321524132336e-06, "loss": 0.0388, "step": 2262 }, { "epoch": 2.0097690941385435, "grad_norm": 0.44171569498479984, "learning_rate": 5.827499580315435e-06, "loss": 0.0418, "step": 2263 }, { "epoch": 2.0106571936056836, "grad_norm": 0.5181303751598587, "learning_rate": 5.8236771393177915e-06, "loss": 0.0546, "step": 2264 }, { "epoch": 2.011545293072824, "grad_norm": 0.3997725294058655, "learning_rate": 5.819854203436017e-06, "loss": 0.0466, "step": 2265 }, { "epoch": 2.0124333925399647, "grad_norm": 0.321014524920309, "learning_rate": 5.8160307749670185e-06, "loss": 0.0322, "step": 2266 }, { "epoch": 2.0133214920071048, "grad_norm": 0.3503755061990151, "learning_rate": 5.812206856207998e-06, "loss": 0.0345, "step": 2267 }, { "epoch": 2.0142095914742453, "grad_norm": 0.3934928239419917, "learning_rate": 5.808382449456455e-06, "loss": 0.0369, "step": 2268 }, { "epoch": 2.0150976909413854, "grad_norm": 0.3357771609218698, "learning_rate": 5.804557557010182e-06, "loss": 0.0335, "step": 2269 }, { "epoch": 2.015985790408526, "grad_norm": 0.5573701881162235, "learning_rate": 5.80073218116726e-06, "loss": 0.0422, "step": 2270 }, { "epoch": 2.016873889875666, "grad_norm": 0.39087568754014085, "learning_rate": 5.796906324226064e-06, "loss": 0.0376, "step": 2271 }, { "epoch": 2.0177619893428065, "grad_norm": 0.4522705113909331, "learning_rate": 5.793079988485257e-06, "loss": 0.0407, "step": 2272 }, { "epoch": 2.0186500888099466, "grad_norm": 0.5408591361233624, "learning_rate": 5.789253176243787e-06, "loss": 0.0346, "step": 2273 }, { "epoch": 2.019538188277087, "grad_norm": 0.43540521811593075, "learning_rate": 5.785425889800894e-06, "loss": 0.036, "step": 2274 }, { "epoch": 2.020426287744227, "grad_norm": 0.34588690097994523, "learning_rate": 5.781598131456097e-06, "loss": 0.0347, "step": 2275 }, { "epoch": 2.0213143872113677, "grad_norm": 0.39823378357341, "learning_rate": 5.777769903509201e-06, "loss": 0.0397, "step": 2276 }, { "epoch": 2.022202486678508, "grad_norm": 0.3526815627280943, "learning_rate": 5.773941208260295e-06, "loss": 0.0382, "step": 2277 }, { "epoch": 2.0230905861456483, "grad_norm": 0.34085681644037663, "learning_rate": 5.770112048009747e-06, "loss": 0.0355, "step": 2278 }, { "epoch": 2.023978685612789, "grad_norm": 0.5025947220291987, "learning_rate": 5.7662824250582024e-06, "loss": 0.0437, "step": 2279 }, { "epoch": 2.024866785079929, "grad_norm": 0.3393835832024569, "learning_rate": 5.762452341706588e-06, "loss": 0.0369, "step": 2280 }, { "epoch": 2.0257548845470694, "grad_norm": 0.3667445086990257, "learning_rate": 5.758621800256104e-06, "loss": 0.0326, "step": 2281 }, { "epoch": 2.0266429840142095, "grad_norm": 0.3703001088427121, "learning_rate": 5.7547908030082325e-06, "loss": 0.0393, "step": 2282 }, { "epoch": 2.02753108348135, "grad_norm": 0.3698404547191662, "learning_rate": 5.750959352264717e-06, "loss": 0.0348, "step": 2283 }, { "epoch": 2.02841918294849, "grad_norm": 0.43462692559074184, "learning_rate": 5.747127450327587e-06, "loss": 0.0401, "step": 2284 }, { "epoch": 2.0293072824156306, "grad_norm": 0.6715523556953165, "learning_rate": 5.743295099499135e-06, "loss": 0.0529, "step": 2285 }, { "epoch": 2.0301953818827707, "grad_norm": 0.43120066175573857, "learning_rate": 5.739462302081927e-06, "loss": 0.0457, "step": 2286 }, { "epoch": 2.0310834813499112, "grad_norm": 0.35974781406853623, "learning_rate": 5.735629060378794e-06, "loss": 0.0374, "step": 2287 }, { "epoch": 2.0319715808170513, "grad_norm": 0.41685416813432485, "learning_rate": 5.731795376692836e-06, "loss": 0.0416, "step": 2288 }, { "epoch": 2.032859680284192, "grad_norm": 0.467495635455071, "learning_rate": 5.7279612533274155e-06, "loss": 0.0431, "step": 2289 }, { "epoch": 2.0337477797513324, "grad_norm": 0.3463393769904308, "learning_rate": 5.724126692586167e-06, "loss": 0.0347, "step": 2290 }, { "epoch": 2.0346358792184724, "grad_norm": 0.3883881517285845, "learning_rate": 5.720291696772979e-06, "loss": 0.0465, "step": 2291 }, { "epoch": 2.035523978685613, "grad_norm": 0.3759559738117336, "learning_rate": 5.716456268192007e-06, "loss": 0.0314, "step": 2292 }, { "epoch": 2.036412078152753, "grad_norm": 0.389950661760715, "learning_rate": 5.712620409147663e-06, "loss": 0.0393, "step": 2293 }, { "epoch": 2.0373001776198936, "grad_norm": 0.3887518217522792, "learning_rate": 5.70878412194462e-06, "loss": 0.0357, "step": 2294 }, { "epoch": 2.0381882770870337, "grad_norm": 0.45164570387220965, "learning_rate": 5.704947408887807e-06, "loss": 0.0375, "step": 2295 }, { "epoch": 2.039076376554174, "grad_norm": 0.39126726554737956, "learning_rate": 5.70111027228241e-06, "loss": 0.043, "step": 2296 }, { "epoch": 2.0399644760213143, "grad_norm": 0.44466939780687276, "learning_rate": 5.697272714433867e-06, "loss": 0.0378, "step": 2297 }, { "epoch": 2.040852575488455, "grad_norm": 0.36494223514375723, "learning_rate": 5.693434737647872e-06, "loss": 0.0399, "step": 2298 }, { "epoch": 2.041740674955595, "grad_norm": 0.3791244667049865, "learning_rate": 5.68959634423037e-06, "loss": 0.0385, "step": 2299 }, { "epoch": 2.0426287744227354, "grad_norm": 0.3680804033230417, "learning_rate": 5.685757536487557e-06, "loss": 0.0333, "step": 2300 }, { "epoch": 2.0435168738898755, "grad_norm": 0.40232797194085973, "learning_rate": 5.681918316725874e-06, "loss": 0.0391, "step": 2301 }, { "epoch": 2.044404973357016, "grad_norm": 0.5488807941421024, "learning_rate": 5.678078687252014e-06, "loss": 0.041, "step": 2302 }, { "epoch": 2.0452930728241565, "grad_norm": 0.3533105248855852, "learning_rate": 5.674238650372913e-06, "loss": 0.0462, "step": 2303 }, { "epoch": 2.0461811722912966, "grad_norm": 0.36624694854114026, "learning_rate": 5.670398208395755e-06, "loss": 0.0327, "step": 2304 }, { "epoch": 2.047069271758437, "grad_norm": 0.44984451832680755, "learning_rate": 5.6665573636279645e-06, "loss": 0.0481, "step": 2305 }, { "epoch": 2.047957371225577, "grad_norm": 0.4424528698739734, "learning_rate": 5.662716118377212e-06, "loss": 0.0313, "step": 2306 }, { "epoch": 2.0488454706927177, "grad_norm": 0.6895484026383346, "learning_rate": 5.6588744749514015e-06, "loss": 0.0469, "step": 2307 }, { "epoch": 2.049733570159858, "grad_norm": 0.4513378893263943, "learning_rate": 5.6550324356586826e-06, "loss": 0.0394, "step": 2308 }, { "epoch": 2.0506216696269983, "grad_norm": 0.464556274747316, "learning_rate": 5.651190002807442e-06, "loss": 0.0499, "step": 2309 }, { "epoch": 2.0515097690941384, "grad_norm": 0.3924616519590994, "learning_rate": 5.647347178706301e-06, "loss": 0.0455, "step": 2310 }, { "epoch": 2.052397868561279, "grad_norm": 0.43472551535337545, "learning_rate": 5.643503965664113e-06, "loss": 0.0435, "step": 2311 }, { "epoch": 2.053285968028419, "grad_norm": 0.33532355528532265, "learning_rate": 5.639660365989974e-06, "loss": 0.0398, "step": 2312 }, { "epoch": 2.0541740674955595, "grad_norm": 0.4111267915605023, "learning_rate": 5.635816381993204e-06, "loss": 0.0504, "step": 2313 }, { "epoch": 2.0550621669626996, "grad_norm": 0.30689373893709365, "learning_rate": 5.631972015983357e-06, "loss": 0.0404, "step": 2314 }, { "epoch": 2.05595026642984, "grad_norm": 0.39067768284945187, "learning_rate": 5.628127270270216e-06, "loss": 0.0362, "step": 2315 }, { "epoch": 2.0568383658969807, "grad_norm": 0.42380897626283953, "learning_rate": 5.624282147163794e-06, "loss": 0.0516, "step": 2316 }, { "epoch": 2.0577264653641207, "grad_norm": 0.3401662750534891, "learning_rate": 5.620436648974327e-06, "loss": 0.0452, "step": 2317 }, { "epoch": 2.0586145648312613, "grad_norm": 0.35534646363805705, "learning_rate": 5.616590778012281e-06, "loss": 0.0397, "step": 2318 }, { "epoch": 2.0595026642984013, "grad_norm": 0.4362074819535085, "learning_rate": 5.612744536588342e-06, "loss": 0.0458, "step": 2319 }, { "epoch": 2.060390763765542, "grad_norm": 0.4322794319184328, "learning_rate": 5.60889792701342e-06, "loss": 0.0409, "step": 2320 }, { "epoch": 2.061278863232682, "grad_norm": 0.3883459633322041, "learning_rate": 5.605050951598648e-06, "loss": 0.0463, "step": 2321 }, { "epoch": 2.0621669626998225, "grad_norm": 0.40505188925287783, "learning_rate": 5.601203612655375e-06, "loss": 0.0471, "step": 2322 }, { "epoch": 2.0630550621669625, "grad_norm": 0.36108421805009805, "learning_rate": 5.597355912495172e-06, "loss": 0.0364, "step": 2323 }, { "epoch": 2.063943161634103, "grad_norm": 0.4234595419373533, "learning_rate": 5.593507853429824e-06, "loss": 0.0335, "step": 2324 }, { "epoch": 2.064831261101243, "grad_norm": 0.4297831836771365, "learning_rate": 5.589659437771336e-06, "loss": 0.0373, "step": 2325 }, { "epoch": 2.0657193605683837, "grad_norm": 0.48330484467582835, "learning_rate": 5.5858106678319225e-06, "loss": 0.0342, "step": 2326 }, { "epoch": 2.0666074600355238, "grad_norm": 0.3886759862387088, "learning_rate": 5.581961545924013e-06, "loss": 0.0363, "step": 2327 }, { "epoch": 2.0674955595026643, "grad_norm": 0.41718976870321706, "learning_rate": 5.578112074360247e-06, "loss": 0.0358, "step": 2328 }, { "epoch": 2.068383658969805, "grad_norm": 0.39392661421508224, "learning_rate": 5.574262255453479e-06, "loss": 0.0426, "step": 2329 }, { "epoch": 2.069271758436945, "grad_norm": 0.42149495836553186, "learning_rate": 5.570412091516764e-06, "loss": 0.0395, "step": 2330 }, { "epoch": 2.0701598579040854, "grad_norm": 0.4237706837990429, "learning_rate": 5.566561584863374e-06, "loss": 0.047, "step": 2331 }, { "epoch": 2.0710479573712255, "grad_norm": 0.4034668610493672, "learning_rate": 5.5627107378067764e-06, "loss": 0.041, "step": 2332 }, { "epoch": 2.071936056838366, "grad_norm": 0.36742348344596093, "learning_rate": 5.558859552660653e-06, "loss": 0.0316, "step": 2333 }, { "epoch": 2.072824156305506, "grad_norm": 0.3587954633601638, "learning_rate": 5.5550080317388814e-06, "loss": 0.0371, "step": 2334 }, { "epoch": 2.0737122557726466, "grad_norm": 0.40199268106153924, "learning_rate": 5.5511561773555455e-06, "loss": 0.0487, "step": 2335 }, { "epoch": 2.0746003552397867, "grad_norm": 0.407905082384333, "learning_rate": 5.547303991824926e-06, "loss": 0.0448, "step": 2336 }, { "epoch": 2.075488454706927, "grad_norm": 0.5170026832974067, "learning_rate": 5.5434514774615055e-06, "loss": 0.0467, "step": 2337 }, { "epoch": 2.0763765541740673, "grad_norm": 0.4119277516723528, "learning_rate": 5.539598636579963e-06, "loss": 0.0448, "step": 2338 }, { "epoch": 2.077264653641208, "grad_norm": 0.3260123660764787, "learning_rate": 5.535745471495174e-06, "loss": 0.0368, "step": 2339 }, { "epoch": 2.0781527531083483, "grad_norm": 0.3414650001047392, "learning_rate": 5.531891984522209e-06, "loss": 0.0368, "step": 2340 }, { "epoch": 2.0790408525754884, "grad_norm": 0.4056721485870448, "learning_rate": 5.52803817797633e-06, "loss": 0.0438, "step": 2341 }, { "epoch": 2.079928952042629, "grad_norm": 0.3678665436664055, "learning_rate": 5.524184054172993e-06, "loss": 0.0531, "step": 2342 }, { "epoch": 2.080817051509769, "grad_norm": 0.3163723687826652, "learning_rate": 5.520329615427844e-06, "loss": 0.0341, "step": 2343 }, { "epoch": 2.0817051509769096, "grad_norm": 0.4452843633721888, "learning_rate": 5.516474864056719e-06, "loss": 0.0508, "step": 2344 }, { "epoch": 2.0825932504440496, "grad_norm": 0.5166340185471159, "learning_rate": 5.5126198023756405e-06, "loss": 0.0414, "step": 2345 }, { "epoch": 2.08348134991119, "grad_norm": 0.3184361504906298, "learning_rate": 5.5087644327008175e-06, "loss": 0.0282, "step": 2346 }, { "epoch": 2.0843694493783302, "grad_norm": 0.31126797004876566, "learning_rate": 5.504908757348646e-06, "loss": 0.0336, "step": 2347 }, { "epoch": 2.0852575488454708, "grad_norm": 0.4467615849229334, "learning_rate": 5.501052778635703e-06, "loss": 0.0433, "step": 2348 }, { "epoch": 2.086145648312611, "grad_norm": 0.48231834106863347, "learning_rate": 5.4971964988787495e-06, "loss": 0.0505, "step": 2349 }, { "epoch": 2.0870337477797514, "grad_norm": 0.46344451177017526, "learning_rate": 5.493339920394725e-06, "loss": 0.0416, "step": 2350 }, { "epoch": 2.0879218472468914, "grad_norm": 0.4546314540078828, "learning_rate": 5.4894830455007565e-06, "loss": 0.0434, "step": 2351 }, { "epoch": 2.088809946714032, "grad_norm": 0.4698531978955114, "learning_rate": 5.4856258765141345e-06, "loss": 0.0499, "step": 2352 }, { "epoch": 2.0896980461811725, "grad_norm": 0.3739618895122828, "learning_rate": 5.481768415752342e-06, "loss": 0.0458, "step": 2353 }, { "epoch": 2.0905861456483126, "grad_norm": 0.47055253717642037, "learning_rate": 5.477910665533025e-06, "loss": 0.0366, "step": 2354 }, { "epoch": 2.091474245115453, "grad_norm": 0.514735803166204, "learning_rate": 5.4740526281740106e-06, "loss": 0.0397, "step": 2355 }, { "epoch": 2.092362344582593, "grad_norm": 0.7233544526900478, "learning_rate": 5.470194305993296e-06, "loss": 0.0525, "step": 2356 }, { "epoch": 2.0932504440497337, "grad_norm": 0.3705429016840043, "learning_rate": 5.4663357013090504e-06, "loss": 0.0396, "step": 2357 }, { "epoch": 2.094138543516874, "grad_norm": 0.3236082840599957, "learning_rate": 5.462476816439609e-06, "loss": 0.0354, "step": 2358 }, { "epoch": 2.0950266429840143, "grad_norm": 0.45769917506165464, "learning_rate": 5.458617653703483e-06, "loss": 0.0421, "step": 2359 }, { "epoch": 2.0959147424511544, "grad_norm": 0.4548383020284334, "learning_rate": 5.454758215419343e-06, "loss": 0.0504, "step": 2360 }, { "epoch": 2.096802841918295, "grad_norm": 0.3942923384474443, "learning_rate": 5.450898503906027e-06, "loss": 0.0391, "step": 2361 }, { "epoch": 2.097690941385435, "grad_norm": 0.40259282724092404, "learning_rate": 5.447038521482542e-06, "loss": 0.0394, "step": 2362 }, { "epoch": 2.0985790408525755, "grad_norm": 0.48116526775975643, "learning_rate": 5.443178270468052e-06, "loss": 0.0412, "step": 2363 }, { "epoch": 2.0994671403197156, "grad_norm": 0.3684661252311372, "learning_rate": 5.439317753181883e-06, "loss": 0.0387, "step": 2364 }, { "epoch": 2.100355239786856, "grad_norm": 0.3798127256992596, "learning_rate": 5.435456971943525e-06, "loss": 0.0421, "step": 2365 }, { "epoch": 2.1012433392539966, "grad_norm": 0.6152548648899345, "learning_rate": 5.431595929072622e-06, "loss": 0.0538, "step": 2366 }, { "epoch": 2.1021314387211367, "grad_norm": 0.4109862071182783, "learning_rate": 5.427734626888979e-06, "loss": 0.0368, "step": 2367 }, { "epoch": 2.1030195381882772, "grad_norm": 0.362726515994217, "learning_rate": 5.423873067712552e-06, "loss": 0.0367, "step": 2368 }, { "epoch": 2.1039076376554173, "grad_norm": 0.5872429390180516, "learning_rate": 5.420011253863458e-06, "loss": 0.0453, "step": 2369 }, { "epoch": 2.104795737122558, "grad_norm": 0.3504058387005002, "learning_rate": 5.416149187661961e-06, "loss": 0.0324, "step": 2370 }, { "epoch": 2.105683836589698, "grad_norm": 0.4239830737147979, "learning_rate": 5.41228687142848e-06, "loss": 0.0396, "step": 2371 }, { "epoch": 2.1065719360568385, "grad_norm": 0.4668127971727929, "learning_rate": 5.408424307483583e-06, "loss": 0.0427, "step": 2372 }, { "epoch": 2.1074600355239785, "grad_norm": 0.40697925373166755, "learning_rate": 5.404561498147989e-06, "loss": 0.035, "step": 2373 }, { "epoch": 2.108348134991119, "grad_norm": 0.4978113689735989, "learning_rate": 5.400698445742557e-06, "loss": 0.0492, "step": 2374 }, { "epoch": 2.109236234458259, "grad_norm": 0.323997207827145, "learning_rate": 5.396835152588304e-06, "loss": 0.0366, "step": 2375 }, { "epoch": 2.1101243339253997, "grad_norm": 0.47966324809157906, "learning_rate": 5.392971621006382e-06, "loss": 0.0416, "step": 2376 }, { "epoch": 2.11101243339254, "grad_norm": 0.3550930916403458, "learning_rate": 5.389107853318088e-06, "loss": 0.0376, "step": 2377 }, { "epoch": 2.1119005328596803, "grad_norm": 0.37221503133895334, "learning_rate": 5.385243851844866e-06, "loss": 0.0365, "step": 2378 }, { "epoch": 2.112788632326821, "grad_norm": 0.33210107248881343, "learning_rate": 5.381379618908296e-06, "loss": 0.0403, "step": 2379 }, { "epoch": 2.113676731793961, "grad_norm": 0.3391506479993458, "learning_rate": 5.377515156830094e-06, "loss": 0.0297, "step": 2380 }, { "epoch": 2.1145648312611014, "grad_norm": 0.4238328579655558, "learning_rate": 5.373650467932122e-06, "loss": 0.0367, "step": 2381 }, { "epoch": 2.1154529307282415, "grad_norm": 0.41731344402610704, "learning_rate": 5.369785554536372e-06, "loss": 0.0379, "step": 2382 }, { "epoch": 2.116341030195382, "grad_norm": 0.49826717221477446, "learning_rate": 5.365920418964973e-06, "loss": 0.0406, "step": 2383 }, { "epoch": 2.117229129662522, "grad_norm": 0.46991340393479397, "learning_rate": 5.362055063540186e-06, "loss": 0.0405, "step": 2384 }, { "epoch": 2.1181172291296626, "grad_norm": 0.3551564687160109, "learning_rate": 5.3581894905844066e-06, "loss": 0.0373, "step": 2385 }, { "epoch": 2.1190053285968027, "grad_norm": 0.38328144224858046, "learning_rate": 5.35432370242016e-06, "loss": 0.0398, "step": 2386 }, { "epoch": 2.119893428063943, "grad_norm": 0.3514847195641634, "learning_rate": 5.350457701370099e-06, "loss": 0.0389, "step": 2387 }, { "epoch": 2.1207815275310833, "grad_norm": 0.36488905651696707, "learning_rate": 5.346591489757008e-06, "loss": 0.0399, "step": 2388 }, { "epoch": 2.121669626998224, "grad_norm": 0.3875304275323844, "learning_rate": 5.3427250699037925e-06, "loss": 0.042, "step": 2389 }, { "epoch": 2.122557726465364, "grad_norm": 0.45757383320267986, "learning_rate": 5.338858444133489e-06, "loss": 0.0558, "step": 2390 }, { "epoch": 2.1234458259325044, "grad_norm": 0.3533317774363015, "learning_rate": 5.334991614769254e-06, "loss": 0.0421, "step": 2391 }, { "epoch": 2.124333925399645, "grad_norm": 0.4164393587446632, "learning_rate": 5.331124584134368e-06, "loss": 0.0445, "step": 2392 }, { "epoch": 2.125222024866785, "grad_norm": 0.44989228202393267, "learning_rate": 5.327257354552231e-06, "loss": 0.042, "step": 2393 }, { "epoch": 2.1261101243339255, "grad_norm": 0.3137401366512473, "learning_rate": 5.323389928346364e-06, "loss": 0.0304, "step": 2394 }, { "epoch": 2.1269982238010656, "grad_norm": 0.38091384172821674, "learning_rate": 5.319522307840404e-06, "loss": 0.0338, "step": 2395 }, { "epoch": 2.127886323268206, "grad_norm": 0.5376459504056712, "learning_rate": 5.315654495358108e-06, "loss": 0.045, "step": 2396 }, { "epoch": 2.1287744227353462, "grad_norm": 0.36917158695970886, "learning_rate": 5.3117864932233445e-06, "loss": 0.0355, "step": 2397 }, { "epoch": 2.1296625222024868, "grad_norm": 0.3223307555476067, "learning_rate": 5.3079183037601016e-06, "loss": 0.0392, "step": 2398 }, { "epoch": 2.130550621669627, "grad_norm": 0.38856369673299446, "learning_rate": 5.304049929292472e-06, "loss": 0.0401, "step": 2399 }, { "epoch": 2.1314387211367674, "grad_norm": 0.4974670677937293, "learning_rate": 5.300181372144665e-06, "loss": 0.0452, "step": 2400 }, { "epoch": 2.1323268206039074, "grad_norm": 0.4576734124078994, "learning_rate": 5.296312634641e-06, "loss": 0.046, "step": 2401 }, { "epoch": 2.133214920071048, "grad_norm": 0.36265008372983243, "learning_rate": 5.292443719105903e-06, "loss": 0.0363, "step": 2402 }, { "epoch": 2.1341030195381885, "grad_norm": 0.402050964517044, "learning_rate": 5.288574627863906e-06, "loss": 0.0401, "step": 2403 }, { "epoch": 2.1349911190053286, "grad_norm": 0.4020128427856151, "learning_rate": 5.284705363239651e-06, "loss": 0.0417, "step": 2404 }, { "epoch": 2.135879218472469, "grad_norm": 0.39612827186709665, "learning_rate": 5.280835927557876e-06, "loss": 0.04, "step": 2405 }, { "epoch": 2.136767317939609, "grad_norm": 0.44749739918923315, "learning_rate": 5.27696632314343e-06, "loss": 0.0421, "step": 2406 }, { "epoch": 2.1376554174067497, "grad_norm": 0.4011125419385018, "learning_rate": 5.27309655232126e-06, "loss": 0.0329, "step": 2407 }, { "epoch": 2.1385435168738898, "grad_norm": 0.39472381503967285, "learning_rate": 5.269226617416414e-06, "loss": 0.0392, "step": 2408 }, { "epoch": 2.1394316163410303, "grad_norm": 0.3719757758537787, "learning_rate": 5.265356520754033e-06, "loss": 0.0331, "step": 2409 }, { "epoch": 2.1403197158081704, "grad_norm": 0.450768938323179, "learning_rate": 5.2614862646593665e-06, "loss": 0.0411, "step": 2410 }, { "epoch": 2.141207815275311, "grad_norm": 0.3439741035367346, "learning_rate": 5.257615851457749e-06, "loss": 0.041, "step": 2411 }, { "epoch": 2.142095914742451, "grad_norm": 0.3722561847725203, "learning_rate": 5.253745283474615e-06, "loss": 0.0374, "step": 2412 }, { "epoch": 2.1429840142095915, "grad_norm": 0.34655027755283196, "learning_rate": 5.2498745630354895e-06, "loss": 0.0337, "step": 2413 }, { "epoch": 2.143872113676732, "grad_norm": 0.37294456519210245, "learning_rate": 5.246003692465993e-06, "loss": 0.0412, "step": 2414 }, { "epoch": 2.144760213143872, "grad_norm": 0.3286926436051891, "learning_rate": 5.242132674091828e-06, "loss": 0.0386, "step": 2415 }, { "epoch": 2.1456483126110126, "grad_norm": 0.3958233459670188, "learning_rate": 5.238261510238797e-06, "loss": 0.0463, "step": 2416 }, { "epoch": 2.1465364120781527, "grad_norm": 0.50878401519936, "learning_rate": 5.234390203232781e-06, "loss": 0.0508, "step": 2417 }, { "epoch": 2.1474245115452932, "grad_norm": 0.3528926521205191, "learning_rate": 5.230518755399749e-06, "loss": 0.04, "step": 2418 }, { "epoch": 2.1483126110124333, "grad_norm": 0.3876254327536883, "learning_rate": 5.22664716906576e-06, "loss": 0.0334, "step": 2419 }, { "epoch": 2.149200710479574, "grad_norm": 0.4164873782339424, "learning_rate": 5.222775446556949e-06, "loss": 0.0478, "step": 2420 }, { "epoch": 2.150088809946714, "grad_norm": 0.48165592626531356, "learning_rate": 5.2189035901995345e-06, "loss": 0.0464, "step": 2421 }, { "epoch": 2.1509769094138544, "grad_norm": 0.2926323123110968, "learning_rate": 5.21503160231982e-06, "loss": 0.0286, "step": 2422 }, { "epoch": 2.1518650088809945, "grad_norm": 0.3862857577544898, "learning_rate": 5.2111594852441815e-06, "loss": 0.0401, "step": 2423 }, { "epoch": 2.152753108348135, "grad_norm": 0.45724493121170556, "learning_rate": 5.207287241299078e-06, "loss": 0.0438, "step": 2424 }, { "epoch": 2.153641207815275, "grad_norm": 0.46963850875746427, "learning_rate": 5.203414872811042e-06, "loss": 0.0367, "step": 2425 }, { "epoch": 2.1545293072824157, "grad_norm": 0.3874755323565219, "learning_rate": 5.199542382106683e-06, "loss": 0.0377, "step": 2426 }, { "epoch": 2.1554174067495557, "grad_norm": 0.42098283646840423, "learning_rate": 5.195669771512678e-06, "loss": 0.04, "step": 2427 }, { "epoch": 2.1563055062166963, "grad_norm": 0.4038852165924546, "learning_rate": 5.191797043355784e-06, "loss": 0.0344, "step": 2428 }, { "epoch": 2.157193605683837, "grad_norm": 0.46781236227378614, "learning_rate": 5.1879241999628225e-06, "loss": 0.0509, "step": 2429 }, { "epoch": 2.158081705150977, "grad_norm": 0.475627220185539, "learning_rate": 5.18405124366069e-06, "loss": 0.0553, "step": 2430 }, { "epoch": 2.1589698046181174, "grad_norm": 0.4750349019531961, "learning_rate": 5.180178176776343e-06, "loss": 0.0397, "step": 2431 }, { "epoch": 2.1598579040852575, "grad_norm": 0.41955932914949795, "learning_rate": 5.176305001636815e-06, "loss": 0.0407, "step": 2432 }, { "epoch": 2.160746003552398, "grad_norm": 0.5094174279352747, "learning_rate": 5.172431720569193e-06, "loss": 0.051, "step": 2433 }, { "epoch": 2.161634103019538, "grad_norm": 0.6499009036247565, "learning_rate": 5.168558335900637e-06, "loss": 0.0589, "step": 2434 }, { "epoch": 2.1625222024866786, "grad_norm": 0.928135622496828, "learning_rate": 5.164684849958361e-06, "loss": 0.0471, "step": 2435 }, { "epoch": 2.1634103019538187, "grad_norm": 0.4229532799999624, "learning_rate": 5.160811265069649e-06, "loss": 0.0493, "step": 2436 }, { "epoch": 2.164298401420959, "grad_norm": 0.6163607840284632, "learning_rate": 5.156937583561836e-06, "loss": 0.0524, "step": 2437 }, { "epoch": 2.1651865008880993, "grad_norm": 0.49230292894102246, "learning_rate": 5.153063807762323e-06, "loss": 0.0365, "step": 2438 }, { "epoch": 2.16607460035524, "grad_norm": 0.4943115148059102, "learning_rate": 5.149189939998559e-06, "loss": 0.0452, "step": 2439 }, { "epoch": 2.1669626998223803, "grad_norm": 0.467686666274734, "learning_rate": 5.145315982598055e-06, "loss": 0.0464, "step": 2440 }, { "epoch": 2.1678507992895204, "grad_norm": 0.47532599194431346, "learning_rate": 5.141441937888373e-06, "loss": 0.0468, "step": 2441 }, { "epoch": 2.168738898756661, "grad_norm": 0.3892145559248581, "learning_rate": 5.137567808197129e-06, "loss": 0.0411, "step": 2442 }, { "epoch": 2.169626998223801, "grad_norm": 0.36256033283499056, "learning_rate": 5.133693595851987e-06, "loss": 0.0339, "step": 2443 }, { "epoch": 2.1705150976909415, "grad_norm": 0.49318274840470505, "learning_rate": 5.129819303180664e-06, "loss": 0.0318, "step": 2444 }, { "epoch": 2.1714031971580816, "grad_norm": 0.4068342729967645, "learning_rate": 5.125944932510924e-06, "loss": 0.045, "step": 2445 }, { "epoch": 2.172291296625222, "grad_norm": 0.40835516360053953, "learning_rate": 5.1220704861705775e-06, "loss": 0.0308, "step": 2446 }, { "epoch": 2.173179396092362, "grad_norm": 0.4293983663737367, "learning_rate": 5.1181959664874805e-06, "loss": 0.0396, "step": 2447 }, { "epoch": 2.1740674955595027, "grad_norm": 0.3551141328057159, "learning_rate": 5.114321375789533e-06, "loss": 0.0315, "step": 2448 }, { "epoch": 2.174955595026643, "grad_norm": 0.39060087437266106, "learning_rate": 5.1104467164046826e-06, "loss": 0.0389, "step": 2449 }, { "epoch": 2.1758436944937833, "grad_norm": 0.4575507982502618, "learning_rate": 5.106571990660906e-06, "loss": 0.0437, "step": 2450 }, { "epoch": 2.1767317939609234, "grad_norm": 0.6223067818026029, "learning_rate": 5.102697200886233e-06, "loss": 0.0445, "step": 2451 }, { "epoch": 2.177619893428064, "grad_norm": 0.4578559816803396, "learning_rate": 5.098822349408723e-06, "loss": 0.0408, "step": 2452 }, { "epoch": 2.1785079928952045, "grad_norm": 0.4007856222752702, "learning_rate": 5.094947438556478e-06, "loss": 0.0426, "step": 2453 }, { "epoch": 2.1793960923623446, "grad_norm": 0.4543726905784699, "learning_rate": 5.091072470657632e-06, "loss": 0.0477, "step": 2454 }, { "epoch": 2.180284191829485, "grad_norm": 0.4401945881834879, "learning_rate": 5.087197448040356e-06, "loss": 0.0495, "step": 2455 }, { "epoch": 2.181172291296625, "grad_norm": 0.37683087318402914, "learning_rate": 5.08332237303285e-06, "loss": 0.0407, "step": 2456 }, { "epoch": 2.1820603907637657, "grad_norm": 0.4454427822161007, "learning_rate": 5.079447247963353e-06, "loss": 0.0409, "step": 2457 }, { "epoch": 2.1829484902309058, "grad_norm": 0.3944633204708997, "learning_rate": 5.075572075160124e-06, "loss": 0.0457, "step": 2458 }, { "epoch": 2.1838365896980463, "grad_norm": 0.457263937853391, "learning_rate": 5.071696856951456e-06, "loss": 0.0407, "step": 2459 }, { "epoch": 2.1847246891651864, "grad_norm": 0.46945903448285464, "learning_rate": 5.067821595665672e-06, "loss": 0.0409, "step": 2460 }, { "epoch": 2.185612788632327, "grad_norm": 0.4454713636140774, "learning_rate": 5.063946293631117e-06, "loss": 0.0399, "step": 2461 }, { "epoch": 2.186500888099467, "grad_norm": 0.3639475258217265, "learning_rate": 5.060070953176161e-06, "loss": 0.0346, "step": 2462 }, { "epoch": 2.1873889875666075, "grad_norm": 0.3692077335640843, "learning_rate": 5.0561955766291956e-06, "loss": 0.0357, "step": 2463 }, { "epoch": 2.1882770870337476, "grad_norm": 0.33478174781272485, "learning_rate": 5.05232016631864e-06, "loss": 0.0444, "step": 2464 }, { "epoch": 2.189165186500888, "grad_norm": 0.659648827807359, "learning_rate": 5.048444724572926e-06, "loss": 0.0403, "step": 2465 }, { "epoch": 2.1900532859680286, "grad_norm": 0.5913352691181757, "learning_rate": 5.04456925372051e-06, "loss": 0.0448, "step": 2466 }, { "epoch": 2.1909413854351687, "grad_norm": 0.3353058086495773, "learning_rate": 5.040693756089865e-06, "loss": 0.0359, "step": 2467 }, { "epoch": 2.191829484902309, "grad_norm": 0.7156363476313247, "learning_rate": 5.036818234009475e-06, "loss": 0.0479, "step": 2468 }, { "epoch": 2.1927175843694493, "grad_norm": 0.3959098125472889, "learning_rate": 5.032942689807846e-06, "loss": 0.038, "step": 2469 }, { "epoch": 2.19360568383659, "grad_norm": 0.3429627851552565, "learning_rate": 5.029067125813491e-06, "loss": 0.0331, "step": 2470 }, { "epoch": 2.19449378330373, "grad_norm": 0.3924177849475489, "learning_rate": 5.025191544354943e-06, "loss": 0.035, "step": 2471 }, { "epoch": 2.1953818827708704, "grad_norm": 0.4869526520943761, "learning_rate": 5.021315947760733e-06, "loss": 0.049, "step": 2472 }, { "epoch": 2.1962699822380105, "grad_norm": 0.3662791431581093, "learning_rate": 5.0174403383594164e-06, "loss": 0.0411, "step": 2473 }, { "epoch": 2.197158081705151, "grad_norm": 0.42488731888157805, "learning_rate": 5.013564718479541e-06, "loss": 0.0404, "step": 2474 }, { "epoch": 2.198046181172291, "grad_norm": 0.43766610179518745, "learning_rate": 5.009689090449672e-06, "loss": 0.0427, "step": 2475 }, { "epoch": 2.1989342806394316, "grad_norm": 0.559354053145988, "learning_rate": 5.0058134565983755e-06, "loss": 0.0497, "step": 2476 }, { "epoch": 2.199822380106572, "grad_norm": 0.43829322947353366, "learning_rate": 5.001937819254222e-06, "loss": 0.0453, "step": 2477 }, { "epoch": 2.2007104795737122, "grad_norm": 0.33712304531982884, "learning_rate": 4.99806218074578e-06, "loss": 0.0365, "step": 2478 }, { "epoch": 2.2015985790408528, "grad_norm": 0.3471908460157216, "learning_rate": 4.994186543401625e-06, "loss": 0.0406, "step": 2479 }, { "epoch": 2.202486678507993, "grad_norm": 0.3395579542540958, "learning_rate": 4.990310909550329e-06, "loss": 0.0384, "step": 2480 }, { "epoch": 2.2033747779751334, "grad_norm": 0.4184935348825603, "learning_rate": 4.98643528152046e-06, "loss": 0.045, "step": 2481 }, { "epoch": 2.2042628774422734, "grad_norm": 0.45204372784847996, "learning_rate": 4.982559661640587e-06, "loss": 0.0406, "step": 2482 }, { "epoch": 2.205150976909414, "grad_norm": 0.3896104210146149, "learning_rate": 4.978684052239268e-06, "loss": 0.0402, "step": 2483 }, { "epoch": 2.206039076376554, "grad_norm": 0.35630909986035014, "learning_rate": 4.974808455645059e-06, "loss": 0.0336, "step": 2484 }, { "epoch": 2.2069271758436946, "grad_norm": 0.34952584839653356, "learning_rate": 4.970932874186509e-06, "loss": 0.0316, "step": 2485 }, { "epoch": 2.2078152753108347, "grad_norm": 0.4138651550318472, "learning_rate": 4.967057310192157e-06, "loss": 0.0442, "step": 2486 }, { "epoch": 2.208703374777975, "grad_norm": 0.3472660999027293, "learning_rate": 4.963181765990526e-06, "loss": 0.0344, "step": 2487 }, { "epoch": 2.2095914742451153, "grad_norm": 0.4229061559000631, "learning_rate": 4.959306243910137e-06, "loss": 0.0454, "step": 2488 }, { "epoch": 2.210479573712256, "grad_norm": 0.46172989835250633, "learning_rate": 4.955430746279491e-06, "loss": 0.0472, "step": 2489 }, { "epoch": 2.211367673179396, "grad_norm": 0.4036318673922448, "learning_rate": 4.9515552754270755e-06, "loss": 0.0427, "step": 2490 }, { "epoch": 2.2122557726465364, "grad_norm": 0.541114161640207, "learning_rate": 4.947679833681362e-06, "loss": 0.0447, "step": 2491 }, { "epoch": 2.213143872113677, "grad_norm": 0.6612485939486665, "learning_rate": 4.943804423370805e-06, "loss": 0.0375, "step": 2492 }, { "epoch": 2.214031971580817, "grad_norm": 0.38519461605628913, "learning_rate": 4.939929046823841e-06, "loss": 0.0373, "step": 2493 }, { "epoch": 2.2149200710479575, "grad_norm": 0.43596239859118924, "learning_rate": 4.936053706368885e-06, "loss": 0.0369, "step": 2494 }, { "epoch": 2.2158081705150976, "grad_norm": 0.5214142272669903, "learning_rate": 4.93217840433433e-06, "loss": 0.0487, "step": 2495 }, { "epoch": 2.216696269982238, "grad_norm": 0.44909038495860126, "learning_rate": 4.928303143048546e-06, "loss": 0.0463, "step": 2496 }, { "epoch": 2.217584369449378, "grad_norm": 0.32007862995712666, "learning_rate": 4.924427924839877e-06, "loss": 0.0272, "step": 2497 }, { "epoch": 2.2184724689165187, "grad_norm": 0.39234455389527295, "learning_rate": 4.92055275203665e-06, "loss": 0.0449, "step": 2498 }, { "epoch": 2.219360568383659, "grad_norm": 0.3507787456257942, "learning_rate": 4.916677626967151e-06, "loss": 0.0407, "step": 2499 }, { "epoch": 2.2202486678507993, "grad_norm": 0.5166786079477914, "learning_rate": 4.912802551959645e-06, "loss": 0.048, "step": 2500 }, { "epoch": 2.2211367673179394, "grad_norm": 0.36191579299062243, "learning_rate": 4.908927529342368e-06, "loss": 0.0426, "step": 2501 }, { "epoch": 2.22202486678508, "grad_norm": 0.41294413343315517, "learning_rate": 4.905052561443524e-06, "loss": 0.0483, "step": 2502 }, { "epoch": 2.2229129662522205, "grad_norm": 0.44858654491957084, "learning_rate": 4.901177650591279e-06, "loss": 0.0423, "step": 2503 }, { "epoch": 2.2238010657193605, "grad_norm": 0.6103950236671091, "learning_rate": 4.897302799113769e-06, "loss": 0.044, "step": 2504 }, { "epoch": 2.224689165186501, "grad_norm": 0.5347523795636051, "learning_rate": 4.893428009339095e-06, "loss": 0.0452, "step": 2505 }, { "epoch": 2.225577264653641, "grad_norm": 0.41891851271819086, "learning_rate": 4.889553283595321e-06, "loss": 0.0368, "step": 2506 }, { "epoch": 2.2264653641207817, "grad_norm": 0.6514687549744711, "learning_rate": 4.885678624210467e-06, "loss": 0.044, "step": 2507 }, { "epoch": 2.2273534635879217, "grad_norm": 0.4499692226864143, "learning_rate": 4.881804033512521e-06, "loss": 0.0426, "step": 2508 }, { "epoch": 2.2282415630550623, "grad_norm": 0.32550795932680926, "learning_rate": 4.877929513829424e-06, "loss": 0.0353, "step": 2509 }, { "epoch": 2.2291296625222023, "grad_norm": 0.3533849860086317, "learning_rate": 4.874055067489076e-06, "loss": 0.0403, "step": 2510 }, { "epoch": 2.230017761989343, "grad_norm": 0.5868618759230922, "learning_rate": 4.870180696819338e-06, "loss": 0.0508, "step": 2511 }, { "epoch": 2.230905861456483, "grad_norm": 0.4739367414083903, "learning_rate": 4.866306404148015e-06, "loss": 0.0465, "step": 2512 }, { "epoch": 2.2317939609236235, "grad_norm": 0.5686742423392736, "learning_rate": 4.862432191802872e-06, "loss": 0.0488, "step": 2513 }, { "epoch": 2.232682060390764, "grad_norm": 0.6190975724239128, "learning_rate": 4.858558062111627e-06, "loss": 0.0544, "step": 2514 }, { "epoch": 2.233570159857904, "grad_norm": 0.35318577146410396, "learning_rate": 4.854684017401946e-06, "loss": 0.0313, "step": 2515 }, { "epoch": 2.2344582593250446, "grad_norm": 0.4053766693688942, "learning_rate": 4.850810060001442e-06, "loss": 0.0405, "step": 2516 }, { "epoch": 2.2353463587921847, "grad_norm": 0.43843182979747325, "learning_rate": 4.846936192237678e-06, "loss": 0.0394, "step": 2517 }, { "epoch": 2.236234458259325, "grad_norm": 0.36834885738507367, "learning_rate": 4.843062416438164e-06, "loss": 0.0321, "step": 2518 }, { "epoch": 2.2371225577264653, "grad_norm": 0.44165322336636986, "learning_rate": 4.839188734930353e-06, "loss": 0.0426, "step": 2519 }, { "epoch": 2.238010657193606, "grad_norm": 0.48788470844480375, "learning_rate": 4.83531515004164e-06, "loss": 0.0445, "step": 2520 }, { "epoch": 2.238898756660746, "grad_norm": 0.4193260877184814, "learning_rate": 4.831441664099366e-06, "loss": 0.0455, "step": 2521 }, { "epoch": 2.2397868561278864, "grad_norm": 0.43055837743335906, "learning_rate": 4.8275682794308086e-06, "loss": 0.0459, "step": 2522 }, { "epoch": 2.2406749555950265, "grad_norm": 0.4831699768851515, "learning_rate": 4.823694998363187e-06, "loss": 0.0554, "step": 2523 }, { "epoch": 2.241563055062167, "grad_norm": 0.3282028064215389, "learning_rate": 4.8198218232236574e-06, "loss": 0.0353, "step": 2524 }, { "epoch": 2.242451154529307, "grad_norm": 0.319811130083667, "learning_rate": 4.8159487563393106e-06, "loss": 0.043, "step": 2525 }, { "epoch": 2.2433392539964476, "grad_norm": 0.36193086810401764, "learning_rate": 4.8120758000371775e-06, "loss": 0.0442, "step": 2526 }, { "epoch": 2.2442273534635877, "grad_norm": 0.33164621134940664, "learning_rate": 4.808202956644219e-06, "loss": 0.0339, "step": 2527 }, { "epoch": 2.2451154529307282, "grad_norm": 1.0731647467839553, "learning_rate": 4.804330228487323e-06, "loss": 0.0459, "step": 2528 }, { "epoch": 2.2460035523978688, "grad_norm": 0.41240977356633574, "learning_rate": 4.800457617893319e-06, "loss": 0.037, "step": 2529 }, { "epoch": 2.246891651865009, "grad_norm": 0.42211434934630127, "learning_rate": 4.796585127188958e-06, "loss": 0.0402, "step": 2530 }, { "epoch": 2.2477797513321494, "grad_norm": 0.721098937515704, "learning_rate": 4.792712758700923e-06, "loss": 0.0417, "step": 2531 }, { "epoch": 2.2486678507992894, "grad_norm": 0.42751787066261426, "learning_rate": 4.788840514755819e-06, "loss": 0.0371, "step": 2532 }, { "epoch": 2.24955595026643, "grad_norm": 0.5856360344875098, "learning_rate": 4.784968397680181e-06, "loss": 0.0354, "step": 2533 }, { "epoch": 2.25044404973357, "grad_norm": 0.4131978300410885, "learning_rate": 4.781096409800466e-06, "loss": 0.0441, "step": 2534 }, { "epoch": 2.2513321492007106, "grad_norm": 0.4156576374906733, "learning_rate": 4.7772245534430535e-06, "loss": 0.0417, "step": 2535 }, { "epoch": 2.2522202486678506, "grad_norm": 0.4015180530083811, "learning_rate": 4.773352830934242e-06, "loss": 0.0387, "step": 2536 }, { "epoch": 2.253108348134991, "grad_norm": 0.3873023967813091, "learning_rate": 4.7694812446002516e-06, "loss": 0.0405, "step": 2537 }, { "epoch": 2.2539964476021312, "grad_norm": 0.37819588112493085, "learning_rate": 4.76560979676722e-06, "loss": 0.0375, "step": 2538 }, { "epoch": 2.2548845470692718, "grad_norm": 0.3405180786556365, "learning_rate": 4.7617384897612055e-06, "loss": 0.0426, "step": 2539 }, { "epoch": 2.2557726465364123, "grad_norm": 0.4616456053902133, "learning_rate": 4.757867325908174e-06, "loss": 0.0425, "step": 2540 }, { "epoch": 2.2566607460035524, "grad_norm": 0.37888562507067025, "learning_rate": 4.753996307534009e-06, "loss": 0.0387, "step": 2541 }, { "epoch": 2.257548845470693, "grad_norm": 0.37789579527129913, "learning_rate": 4.7501254369645105e-06, "loss": 0.0384, "step": 2542 }, { "epoch": 2.258436944937833, "grad_norm": 0.4358472658585724, "learning_rate": 4.746254716525388e-06, "loss": 0.0383, "step": 2543 }, { "epoch": 2.2593250444049735, "grad_norm": 0.40128540158761594, "learning_rate": 4.742384148542252e-06, "loss": 0.0477, "step": 2544 }, { "epoch": 2.2602131438721136, "grad_norm": 0.3799235401621402, "learning_rate": 4.738513735340634e-06, "loss": 0.0366, "step": 2545 }, { "epoch": 2.261101243339254, "grad_norm": 0.44371140206874726, "learning_rate": 4.734643479245967e-06, "loss": 0.0444, "step": 2546 }, { "epoch": 2.261989342806394, "grad_norm": 0.45476470201441677, "learning_rate": 4.730773382583589e-06, "loss": 0.0401, "step": 2547 }, { "epoch": 2.2628774422735347, "grad_norm": 0.41522269427289815, "learning_rate": 4.726903447678741e-06, "loss": 0.0421, "step": 2548 }, { "epoch": 2.263765541740675, "grad_norm": 0.45973427614502205, "learning_rate": 4.723033676856571e-06, "loss": 0.0419, "step": 2549 }, { "epoch": 2.2646536412078153, "grad_norm": 0.34047523807710073, "learning_rate": 4.719164072442125e-06, "loss": 0.0365, "step": 2550 }, { "epoch": 2.265541740674956, "grad_norm": 0.5434438568961559, "learning_rate": 4.715294636760352e-06, "loss": 0.0556, "step": 2551 }, { "epoch": 2.266429840142096, "grad_norm": 0.3987002813829804, "learning_rate": 4.711425372136095e-06, "loss": 0.0367, "step": 2552 }, { "epoch": 2.267317939609236, "grad_norm": 0.5178962986501356, "learning_rate": 4.707556280894099e-06, "loss": 0.0397, "step": 2553 }, { "epoch": 2.2682060390763765, "grad_norm": 0.4294164720104162, "learning_rate": 4.703687365359e-06, "loss": 0.0417, "step": 2554 }, { "epoch": 2.269094138543517, "grad_norm": 0.4796438938735421, "learning_rate": 4.6998186278553375e-06, "loss": 0.0472, "step": 2555 }, { "epoch": 2.269982238010657, "grad_norm": 0.4884580178535705, "learning_rate": 4.69595007070753e-06, "loss": 0.0331, "step": 2556 }, { "epoch": 2.2708703374777977, "grad_norm": 0.3881108467602305, "learning_rate": 4.6920816962399e-06, "loss": 0.0347, "step": 2557 }, { "epoch": 2.2717584369449377, "grad_norm": 0.39596463725067144, "learning_rate": 4.6882135067766555e-06, "loss": 0.0421, "step": 2558 }, { "epoch": 2.2726465364120783, "grad_norm": 0.4294002256652185, "learning_rate": 4.684345504641894e-06, "loss": 0.0401, "step": 2559 }, { "epoch": 2.2735346358792183, "grad_norm": 0.42363391918012416, "learning_rate": 4.680477692159597e-06, "loss": 0.0462, "step": 2560 }, { "epoch": 2.274422735346359, "grad_norm": 0.3582869755447265, "learning_rate": 4.676610071653638e-06, "loss": 0.0372, "step": 2561 }, { "epoch": 2.275310834813499, "grad_norm": 0.3916175494250579, "learning_rate": 4.67274264544777e-06, "loss": 0.0339, "step": 2562 }, { "epoch": 2.2761989342806395, "grad_norm": 0.5189670677965533, "learning_rate": 4.6688754158656335e-06, "loss": 0.0451, "step": 2563 }, { "epoch": 2.2770870337477795, "grad_norm": 0.41871587575728325, "learning_rate": 4.665008385230747e-06, "loss": 0.0398, "step": 2564 }, { "epoch": 2.27797513321492, "grad_norm": 0.39671935469464326, "learning_rate": 4.6611415558665115e-06, "loss": 0.0443, "step": 2565 }, { "epoch": 2.2788632326820606, "grad_norm": 0.41792405006136724, "learning_rate": 4.657274930096208e-06, "loss": 0.039, "step": 2566 }, { "epoch": 2.2797513321492007, "grad_norm": 0.34331716917412153, "learning_rate": 4.653408510242995e-06, "loss": 0.0327, "step": 2567 }, { "epoch": 2.280639431616341, "grad_norm": 0.39629268819743063, "learning_rate": 4.649542298629903e-06, "loss": 0.0487, "step": 2568 }, { "epoch": 2.2815275310834813, "grad_norm": 0.49315951881137177, "learning_rate": 4.645676297579841e-06, "loss": 0.0475, "step": 2569 }, { "epoch": 2.282415630550622, "grad_norm": 0.35594959843398993, "learning_rate": 4.641810509415594e-06, "loss": 0.0383, "step": 2570 }, { "epoch": 2.283303730017762, "grad_norm": 0.39177546065555535, "learning_rate": 4.6379449364598165e-06, "loss": 0.0366, "step": 2571 }, { "epoch": 2.2841918294849024, "grad_norm": 0.3850619559079712, "learning_rate": 4.634079581035029e-06, "loss": 0.0328, "step": 2572 }, { "epoch": 2.2850799289520425, "grad_norm": 0.4756070589111131, "learning_rate": 4.630214445463629e-06, "loss": 0.0428, "step": 2573 }, { "epoch": 2.285968028419183, "grad_norm": 0.43955577293826825, "learning_rate": 4.626349532067879e-06, "loss": 0.0431, "step": 2574 }, { "epoch": 2.286856127886323, "grad_norm": 0.34848039746895904, "learning_rate": 4.622484843169907e-06, "loss": 0.0388, "step": 2575 }, { "epoch": 2.2877442273534636, "grad_norm": 0.3472130105952495, "learning_rate": 4.618620381091707e-06, "loss": 0.0367, "step": 2576 }, { "epoch": 2.288632326820604, "grad_norm": 0.3634314214099599, "learning_rate": 4.614756148155135e-06, "loss": 0.0357, "step": 2577 }, { "epoch": 2.289520426287744, "grad_norm": 0.3563163781719383, "learning_rate": 4.610892146681913e-06, "loss": 0.0385, "step": 2578 }, { "epoch": 2.2904085257548847, "grad_norm": 0.36444852000028927, "learning_rate": 4.607028378993619e-06, "loss": 0.0418, "step": 2579 }, { "epoch": 2.291296625222025, "grad_norm": 0.3520780957388402, "learning_rate": 4.603164847411698e-06, "loss": 0.0346, "step": 2580 }, { "epoch": 2.2921847246891653, "grad_norm": 0.40450817499551517, "learning_rate": 4.599301554257444e-06, "loss": 0.0388, "step": 2581 }, { "epoch": 2.2930728241563054, "grad_norm": 0.4632521174305848, "learning_rate": 4.595438501852013e-06, "loss": 0.0466, "step": 2582 }, { "epoch": 2.293960923623446, "grad_norm": 0.5782949167988353, "learning_rate": 4.591575692516417e-06, "loss": 0.0419, "step": 2583 }, { "epoch": 2.294849023090586, "grad_norm": 0.3621161895075694, "learning_rate": 4.587713128571522e-06, "loss": 0.0342, "step": 2584 }, { "epoch": 2.2957371225577266, "grad_norm": 0.3729789315196646, "learning_rate": 4.583850812338041e-06, "loss": 0.0401, "step": 2585 }, { "epoch": 2.2966252220248666, "grad_norm": 0.3687046514181492, "learning_rate": 4.579988746136543e-06, "loss": 0.0413, "step": 2586 }, { "epoch": 2.297513321492007, "grad_norm": 0.4461284341597481, "learning_rate": 4.576126932287449e-06, "loss": 0.0442, "step": 2587 }, { "epoch": 2.2984014209591472, "grad_norm": 0.5508823297299353, "learning_rate": 4.572265373111024e-06, "loss": 0.0396, "step": 2588 }, { "epoch": 2.2992895204262878, "grad_norm": 0.49580490300154184, "learning_rate": 4.56840407092738e-06, "loss": 0.047, "step": 2589 }, { "epoch": 2.300177619893428, "grad_norm": 0.3805186811895452, "learning_rate": 4.5645430280564775e-06, "loss": 0.041, "step": 2590 }, { "epoch": 2.3010657193605684, "grad_norm": 0.3961612376062459, "learning_rate": 4.560682246818118e-06, "loss": 0.0391, "step": 2591 }, { "epoch": 2.301953818827709, "grad_norm": 0.369603894824233, "learning_rate": 4.556821729531951e-06, "loss": 0.0389, "step": 2592 }, { "epoch": 2.302841918294849, "grad_norm": 0.3388520145689632, "learning_rate": 4.5529614785174606e-06, "loss": 0.0353, "step": 2593 }, { "epoch": 2.3037300177619895, "grad_norm": 0.3970081601406595, "learning_rate": 4.549101496093974e-06, "loss": 0.0412, "step": 2594 }, { "epoch": 2.3046181172291296, "grad_norm": 0.5662660049321084, "learning_rate": 4.545241784580658e-06, "loss": 0.0472, "step": 2595 }, { "epoch": 2.30550621669627, "grad_norm": 0.37673003155222734, "learning_rate": 4.5413823462965195e-06, "loss": 0.0278, "step": 2596 }, { "epoch": 2.30639431616341, "grad_norm": 0.3526715067187626, "learning_rate": 4.537523183560392e-06, "loss": 0.0352, "step": 2597 }, { "epoch": 2.3072824156305507, "grad_norm": 0.3913933260665068, "learning_rate": 4.533664298690951e-06, "loss": 0.0363, "step": 2598 }, { "epoch": 2.308170515097691, "grad_norm": 0.3266126022029271, "learning_rate": 4.529805694006704e-06, "loss": 0.0348, "step": 2599 }, { "epoch": 2.3090586145648313, "grad_norm": 0.4409332935117264, "learning_rate": 4.52594737182599e-06, "loss": 0.0382, "step": 2600 }, { "epoch": 2.3099467140319714, "grad_norm": 0.38668057959744595, "learning_rate": 4.522089334466977e-06, "loss": 0.0358, "step": 2601 }, { "epoch": 2.310834813499112, "grad_norm": 0.4167138364334915, "learning_rate": 4.51823158424766e-06, "loss": 0.0347, "step": 2602 }, { "epoch": 2.3117229129662524, "grad_norm": 0.3971986715978222, "learning_rate": 4.5143741234858655e-06, "loss": 0.0347, "step": 2603 }, { "epoch": 2.3126110124333925, "grad_norm": 0.33372030175031625, "learning_rate": 4.510516954499246e-06, "loss": 0.0381, "step": 2604 }, { "epoch": 2.313499111900533, "grad_norm": 0.3933343502736182, "learning_rate": 4.5066600796052755e-06, "loss": 0.0449, "step": 2605 }, { "epoch": 2.314387211367673, "grad_norm": 0.45756384072037526, "learning_rate": 4.502803501121252e-06, "loss": 0.036, "step": 2606 }, { "epoch": 2.3152753108348136, "grad_norm": 0.3987698602657196, "learning_rate": 4.498947221364299e-06, "loss": 0.0425, "step": 2607 }, { "epoch": 2.3161634103019537, "grad_norm": 0.4512693637599679, "learning_rate": 4.495091242651356e-06, "loss": 0.0389, "step": 2608 }, { "epoch": 2.3170515097690942, "grad_norm": 0.3974871510688192, "learning_rate": 4.491235567299185e-06, "loss": 0.0428, "step": 2609 }, { "epoch": 2.3179396092362343, "grad_norm": 0.540763951633457, "learning_rate": 4.487380197624361e-06, "loss": 0.0448, "step": 2610 }, { "epoch": 2.318827708703375, "grad_norm": 0.5609297515451566, "learning_rate": 4.483525135943282e-06, "loss": 0.0472, "step": 2611 }, { "epoch": 2.319715808170515, "grad_norm": 0.561920406048222, "learning_rate": 4.479670384572159e-06, "loss": 0.0446, "step": 2612 }, { "epoch": 2.3206039076376554, "grad_norm": 0.3662670350130841, "learning_rate": 4.475815945827008e-06, "loss": 0.0372, "step": 2613 }, { "epoch": 2.321492007104796, "grad_norm": 0.4336635507906872, "learning_rate": 4.4719618220236715e-06, "loss": 0.0375, "step": 2614 }, { "epoch": 2.322380106571936, "grad_norm": 0.31236302035819924, "learning_rate": 4.468108015477792e-06, "loss": 0.0314, "step": 2615 }, { "epoch": 2.323268206039076, "grad_norm": 0.3779332333237963, "learning_rate": 4.464254528504827e-06, "loss": 0.0307, "step": 2616 }, { "epoch": 2.3241563055062167, "grad_norm": 0.36485662834350213, "learning_rate": 4.460401363420038e-06, "loss": 0.0376, "step": 2617 }, { "epoch": 2.325044404973357, "grad_norm": 0.3360456767434795, "learning_rate": 4.456548522538496e-06, "loss": 0.0329, "step": 2618 }, { "epoch": 2.3259325044404973, "grad_norm": 0.3792834149891537, "learning_rate": 4.4526960081750754e-06, "loss": 0.0299, "step": 2619 }, { "epoch": 2.326820603907638, "grad_norm": 0.4468078495755882, "learning_rate": 4.448843822644458e-06, "loss": 0.0358, "step": 2620 }, { "epoch": 2.327708703374778, "grad_norm": 0.5255619436694257, "learning_rate": 4.444991968261121e-06, "loss": 0.0498, "step": 2621 }, { "epoch": 2.3285968028419184, "grad_norm": 0.3463107043548755, "learning_rate": 4.441140447339349e-06, "loss": 0.0336, "step": 2622 }, { "epoch": 2.3294849023090585, "grad_norm": 0.46177759076054403, "learning_rate": 4.4372892621932235e-06, "loss": 0.0371, "step": 2623 }, { "epoch": 2.330373001776199, "grad_norm": 0.44754932979274265, "learning_rate": 4.433438415136629e-06, "loss": 0.0465, "step": 2624 }, { "epoch": 2.331261101243339, "grad_norm": 0.3744449625787688, "learning_rate": 4.429587908483237e-06, "loss": 0.0451, "step": 2625 }, { "epoch": 2.3321492007104796, "grad_norm": 0.38720151614565995, "learning_rate": 4.425737744546522e-06, "loss": 0.0341, "step": 2626 }, { "epoch": 2.3330373001776197, "grad_norm": 0.37858368183699215, "learning_rate": 4.421887925639753e-06, "loss": 0.042, "step": 2627 }, { "epoch": 2.33392539964476, "grad_norm": 0.3532938714042838, "learning_rate": 4.4180384540759905e-06, "loss": 0.0303, "step": 2628 }, { "epoch": 2.3348134991119007, "grad_norm": 0.5221524768696196, "learning_rate": 4.414189332168079e-06, "loss": 0.0399, "step": 2629 }, { "epoch": 2.335701598579041, "grad_norm": 0.40731896962907893, "learning_rate": 4.4103405622286645e-06, "loss": 0.036, "step": 2630 }, { "epoch": 2.3365896980461813, "grad_norm": 0.4795094424305429, "learning_rate": 4.406492146570176e-06, "loss": 0.0445, "step": 2631 }, { "epoch": 2.3374777975133214, "grad_norm": 0.3875058274637457, "learning_rate": 4.4026440875048295e-06, "loss": 0.0383, "step": 2632 }, { "epoch": 2.338365896980462, "grad_norm": 0.4483653823017389, "learning_rate": 4.398796387344626e-06, "loss": 0.0476, "step": 2633 }, { "epoch": 2.339253996447602, "grad_norm": 0.4718006742914674, "learning_rate": 4.394949048401354e-06, "loss": 0.0502, "step": 2634 }, { "epoch": 2.3401420959147425, "grad_norm": 0.49651496653938887, "learning_rate": 4.391102072986581e-06, "loss": 0.046, "step": 2635 }, { "epoch": 2.3410301953818826, "grad_norm": 0.3709334633768202, "learning_rate": 4.387255463411658e-06, "loss": 0.0369, "step": 2636 }, { "epoch": 2.341918294849023, "grad_norm": 0.4068319491201398, "learning_rate": 4.3834092219877214e-06, "loss": 0.0461, "step": 2637 }, { "epoch": 2.342806394316163, "grad_norm": 0.5168445023408296, "learning_rate": 4.3795633510256745e-06, "loss": 0.0418, "step": 2638 }, { "epoch": 2.3436944937833037, "grad_norm": 0.36258257240961933, "learning_rate": 4.375717852836207e-06, "loss": 0.0382, "step": 2639 }, { "epoch": 2.3445825932504443, "grad_norm": 0.3772719221437402, "learning_rate": 4.371872729729784e-06, "loss": 0.0352, "step": 2640 }, { "epoch": 2.3454706927175843, "grad_norm": 0.4179596388727361, "learning_rate": 4.3680279840166444e-06, "loss": 0.0403, "step": 2641 }, { "epoch": 2.346358792184725, "grad_norm": 0.4114879162383246, "learning_rate": 4.364183618006798e-06, "loss": 0.0379, "step": 2642 }, { "epoch": 2.347246891651865, "grad_norm": 0.3749628004987579, "learning_rate": 4.360339634010027e-06, "loss": 0.0357, "step": 2643 }, { "epoch": 2.3481349911190055, "grad_norm": 0.400806823046906, "learning_rate": 4.356496034335887e-06, "loss": 0.0467, "step": 2644 }, { "epoch": 2.3490230905861456, "grad_norm": 0.37358836479591945, "learning_rate": 4.352652821293702e-06, "loss": 0.0435, "step": 2645 }, { "epoch": 2.349911190053286, "grad_norm": 0.3382574641349636, "learning_rate": 4.34880999719256e-06, "loss": 0.0401, "step": 2646 }, { "epoch": 2.350799289520426, "grad_norm": 0.3725024069218942, "learning_rate": 4.344967564341318e-06, "loss": 0.0404, "step": 2647 }, { "epoch": 2.3516873889875667, "grad_norm": 0.3095419968941293, "learning_rate": 4.341125525048599e-06, "loss": 0.0371, "step": 2648 }, { "epoch": 2.3525754884547068, "grad_norm": 0.5392748938131307, "learning_rate": 4.3372838816227905e-06, "loss": 0.0611, "step": 2649 }, { "epoch": 2.3534635879218473, "grad_norm": 0.4635937921067071, "learning_rate": 4.333442636372036e-06, "loss": 0.0417, "step": 2650 }, { "epoch": 2.354351687388988, "grad_norm": 0.3110201674381657, "learning_rate": 4.329601791604246e-06, "loss": 0.0351, "step": 2651 }, { "epoch": 2.355239786856128, "grad_norm": 0.34607897834966267, "learning_rate": 4.325761349627088e-06, "loss": 0.0424, "step": 2652 }, { "epoch": 2.356127886323268, "grad_norm": 0.3449482506256782, "learning_rate": 4.321921312747989e-06, "loss": 0.0382, "step": 2653 }, { "epoch": 2.3570159857904085, "grad_norm": 0.45013139098831834, "learning_rate": 4.318081683274128e-06, "loss": 0.0358, "step": 2654 }, { "epoch": 2.357904085257549, "grad_norm": 0.3780648972123435, "learning_rate": 4.3142424635124445e-06, "loss": 0.0352, "step": 2655 }, { "epoch": 2.358792184724689, "grad_norm": 0.5219853618866549, "learning_rate": 4.310403655769629e-06, "loss": 0.0521, "step": 2656 }, { "epoch": 2.3596802841918296, "grad_norm": 0.3383305725899369, "learning_rate": 4.3065652623521285e-06, "loss": 0.0384, "step": 2657 }, { "epoch": 2.3605683836589697, "grad_norm": 0.40145436774201537, "learning_rate": 4.302727285566134e-06, "loss": 0.044, "step": 2658 }, { "epoch": 2.3614564831261102, "grad_norm": 0.39845663541383897, "learning_rate": 4.298889727717592e-06, "loss": 0.0433, "step": 2659 }, { "epoch": 2.3623445825932503, "grad_norm": 0.3778807856315741, "learning_rate": 4.295052591112194e-06, "loss": 0.0366, "step": 2660 }, { "epoch": 2.363232682060391, "grad_norm": 0.38902474709123536, "learning_rate": 4.291215878055382e-06, "loss": 0.031, "step": 2661 }, { "epoch": 2.364120781527531, "grad_norm": 0.4161428012640726, "learning_rate": 4.2873795908523384e-06, "loss": 0.0429, "step": 2662 }, { "epoch": 2.3650088809946714, "grad_norm": 0.38191207983459374, "learning_rate": 4.283543731807994e-06, "loss": 0.0445, "step": 2663 }, { "epoch": 2.3658969804618115, "grad_norm": 0.42078712896487014, "learning_rate": 4.27970830322702e-06, "loss": 0.0375, "step": 2664 }, { "epoch": 2.366785079928952, "grad_norm": 0.8424960126890609, "learning_rate": 4.275873307413835e-06, "loss": 0.0496, "step": 2665 }, { "epoch": 2.3676731793960926, "grad_norm": 0.4396625943845716, "learning_rate": 4.272038746672586e-06, "loss": 0.0467, "step": 2666 }, { "epoch": 2.3685612788632326, "grad_norm": 0.536416973392681, "learning_rate": 4.268204623307166e-06, "loss": 0.0471, "step": 2667 }, { "epoch": 2.369449378330373, "grad_norm": 0.35724190190476407, "learning_rate": 4.264370939621207e-06, "loss": 0.0331, "step": 2668 }, { "epoch": 2.3703374777975132, "grad_norm": 0.3474386013978734, "learning_rate": 4.260537697918076e-06, "loss": 0.0301, "step": 2669 }, { "epoch": 2.3712255772646538, "grad_norm": 0.4181100995540747, "learning_rate": 4.256704900500866e-06, "loss": 0.0468, "step": 2670 }, { "epoch": 2.372113676731794, "grad_norm": 1.172328043611854, "learning_rate": 4.2528725496724135e-06, "loss": 0.052, "step": 2671 }, { "epoch": 2.3730017761989344, "grad_norm": 0.4189048331830737, "learning_rate": 4.249040647735284e-06, "loss": 0.0414, "step": 2672 }, { "epoch": 2.3738898756660745, "grad_norm": 0.3545069090635621, "learning_rate": 4.245209196991771e-06, "loss": 0.0352, "step": 2673 }, { "epoch": 2.374777975133215, "grad_norm": 0.6105691217865397, "learning_rate": 4.2413781997438965e-06, "loss": 0.0508, "step": 2674 }, { "epoch": 2.375666074600355, "grad_norm": 0.3603993506401413, "learning_rate": 4.2375476582934134e-06, "loss": 0.0371, "step": 2675 }, { "epoch": 2.3765541740674956, "grad_norm": 0.3793291502154685, "learning_rate": 4.233717574941799e-06, "loss": 0.0451, "step": 2676 }, { "epoch": 2.377442273534636, "grad_norm": 0.4999983500982288, "learning_rate": 4.229887951990255e-06, "loss": 0.0352, "step": 2677 }, { "epoch": 2.378330373001776, "grad_norm": 0.3801942526748397, "learning_rate": 4.2260587917397064e-06, "loss": 0.0381, "step": 2678 }, { "epoch": 2.3792184724689167, "grad_norm": 0.4072780423292999, "learning_rate": 4.2222300964908e-06, "loss": 0.0405, "step": 2679 }, { "epoch": 2.380106571936057, "grad_norm": 0.5037803829193044, "learning_rate": 4.218401868543903e-06, "loss": 0.0365, "step": 2680 }, { "epoch": 2.3809946714031973, "grad_norm": 0.44276942538337943, "learning_rate": 4.214574110199108e-06, "loss": 0.0417, "step": 2681 }, { "epoch": 2.3818827708703374, "grad_norm": 0.45279219712738855, "learning_rate": 4.2107468237562135e-06, "loss": 0.0424, "step": 2682 }, { "epoch": 2.382770870337478, "grad_norm": 0.2778588750278949, "learning_rate": 4.206920011514744e-06, "loss": 0.0288, "step": 2683 }, { "epoch": 2.383658969804618, "grad_norm": 0.3847579877166627, "learning_rate": 4.203093675773936e-06, "loss": 0.0418, "step": 2684 }, { "epoch": 2.3845470692717585, "grad_norm": 0.3662947708633666, "learning_rate": 4.199267818832741e-06, "loss": 0.0353, "step": 2685 }, { "epoch": 2.3854351687388986, "grad_norm": 0.3462681279029546, "learning_rate": 4.195442442989819e-06, "loss": 0.0405, "step": 2686 }, { "epoch": 2.386323268206039, "grad_norm": 0.2827109752420791, "learning_rate": 4.1916175505435454e-06, "loss": 0.0339, "step": 2687 }, { "epoch": 2.387211367673179, "grad_norm": 0.38974521298192905, "learning_rate": 4.187793143792003e-06, "loss": 0.0358, "step": 2688 }, { "epoch": 2.3880994671403197, "grad_norm": 0.46842705755348635, "learning_rate": 4.183969225032984e-06, "loss": 0.0484, "step": 2689 }, { "epoch": 2.38898756660746, "grad_norm": 0.4501375814597474, "learning_rate": 4.180145796563985e-06, "loss": 0.0467, "step": 2690 }, { "epoch": 2.3898756660746003, "grad_norm": 0.3869689008612881, "learning_rate": 4.176322860682209e-06, "loss": 0.0407, "step": 2691 }, { "epoch": 2.390763765541741, "grad_norm": 0.39653085599217086, "learning_rate": 4.172500419684566e-06, "loss": 0.0426, "step": 2692 }, { "epoch": 2.391651865008881, "grad_norm": 0.4559469979477046, "learning_rate": 4.168678475867667e-06, "loss": 0.0518, "step": 2693 }, { "epoch": 2.3925399644760215, "grad_norm": 0.4844137660745055, "learning_rate": 4.1648570315278195e-06, "loss": 0.041, "step": 2694 }, { "epoch": 2.3934280639431615, "grad_norm": 0.5687974521056933, "learning_rate": 4.161036088961037e-06, "loss": 0.0376, "step": 2695 }, { "epoch": 2.394316163410302, "grad_norm": 0.3966335434023539, "learning_rate": 4.15721565046303e-06, "loss": 0.044, "step": 2696 }, { "epoch": 2.395204262877442, "grad_norm": 0.3828405283624342, "learning_rate": 4.153395718329206e-06, "loss": 0.039, "step": 2697 }, { "epoch": 2.3960923623445827, "grad_norm": 0.3982785775450126, "learning_rate": 4.149576294854668e-06, "loss": 0.0398, "step": 2698 }, { "epoch": 2.3969804618117228, "grad_norm": 0.3918824184327849, "learning_rate": 4.145757382334212e-06, "loss": 0.0355, "step": 2699 }, { "epoch": 2.3978685612788633, "grad_norm": 0.4785647829306533, "learning_rate": 4.141938983062329e-06, "loss": 0.0539, "step": 2700 }, { "epoch": 2.3987566607460034, "grad_norm": 0.5174157755413326, "learning_rate": 4.1381210993332e-06, "loss": 0.0398, "step": 2701 }, { "epoch": 2.399644760213144, "grad_norm": 0.37733907417145135, "learning_rate": 4.1343037334407e-06, "loss": 0.051, "step": 2702 }, { "epoch": 2.4005328596802844, "grad_norm": 0.39113603647838174, "learning_rate": 4.130486887678386e-06, "loss": 0.0449, "step": 2703 }, { "epoch": 2.4014209591474245, "grad_norm": 0.4295292054426644, "learning_rate": 4.12667056433951e-06, "loss": 0.0595, "step": 2704 }, { "epoch": 2.402309058614565, "grad_norm": 0.3390281146932373, "learning_rate": 4.1228547657170025e-06, "loss": 0.0344, "step": 2705 }, { "epoch": 2.403197158081705, "grad_norm": 0.30059656981610866, "learning_rate": 4.11903949410349e-06, "loss": 0.0296, "step": 2706 }, { "epoch": 2.4040852575488456, "grad_norm": 0.38955577602654734, "learning_rate": 4.115224751791269e-06, "loss": 0.0509, "step": 2707 }, { "epoch": 2.4049733570159857, "grad_norm": 0.5506825358029819, "learning_rate": 4.111410541072325e-06, "loss": 0.0431, "step": 2708 }, { "epoch": 2.405861456483126, "grad_norm": 0.3805251173156188, "learning_rate": 4.107596864238325e-06, "loss": 0.035, "step": 2709 }, { "epoch": 2.4067495559502663, "grad_norm": 0.42385169806694045, "learning_rate": 4.103783723580616e-06, "loss": 0.0493, "step": 2710 }, { "epoch": 2.407637655417407, "grad_norm": 0.3754169360294611, "learning_rate": 4.099971121390214e-06, "loss": 0.0386, "step": 2711 }, { "epoch": 2.408525754884547, "grad_norm": 0.5621179327428543, "learning_rate": 4.096159059957822e-06, "loss": 0.04, "step": 2712 }, { "epoch": 2.4094138543516874, "grad_norm": 0.44413035112259003, "learning_rate": 4.092347541573814e-06, "loss": 0.0359, "step": 2713 }, { "epoch": 2.410301953818828, "grad_norm": 0.6688079934738826, "learning_rate": 4.088536568528239e-06, "loss": 0.0336, "step": 2714 }, { "epoch": 2.411190053285968, "grad_norm": 0.43700648671433046, "learning_rate": 4.084726143110813e-06, "loss": 0.0437, "step": 2715 }, { "epoch": 2.412078152753108, "grad_norm": 0.3094797733618257, "learning_rate": 4.08091626761093e-06, "loss": 0.0316, "step": 2716 }, { "epoch": 2.4129662522202486, "grad_norm": 0.48276689637977, "learning_rate": 4.077106944317649e-06, "loss": 0.0456, "step": 2717 }, { "epoch": 2.413854351687389, "grad_norm": 0.447972694388429, "learning_rate": 4.0732981755197024e-06, "loss": 0.0336, "step": 2718 }, { "epoch": 2.4147424511545292, "grad_norm": 0.33650977191476394, "learning_rate": 4.069489963505482e-06, "loss": 0.0311, "step": 2719 }, { "epoch": 2.4156305506216698, "grad_norm": 0.4843850307505809, "learning_rate": 4.065682310563049e-06, "loss": 0.0434, "step": 2720 }, { "epoch": 2.41651865008881, "grad_norm": 0.3218363169067914, "learning_rate": 4.061875218980131e-06, "loss": 0.0337, "step": 2721 }, { "epoch": 2.4174067495559504, "grad_norm": 0.38755539137071665, "learning_rate": 4.058068691044117e-06, "loss": 0.0356, "step": 2722 }, { "epoch": 2.4182948490230904, "grad_norm": 0.29684929361619666, "learning_rate": 4.054262729042052e-06, "loss": 0.0324, "step": 2723 }, { "epoch": 2.419182948490231, "grad_norm": 0.49865711099936944, "learning_rate": 4.050457335260648e-06, "loss": 0.0493, "step": 2724 }, { "epoch": 2.420071047957371, "grad_norm": 0.33211617378906494, "learning_rate": 4.046652511986273e-06, "loss": 0.0344, "step": 2725 }, { "epoch": 2.4209591474245116, "grad_norm": 0.42675436849450327, "learning_rate": 4.042848261504954e-06, "loss": 0.0468, "step": 2726 }, { "epoch": 2.4218472468916517, "grad_norm": 0.5191041505079167, "learning_rate": 4.039044586102368e-06, "loss": 0.0453, "step": 2727 }, { "epoch": 2.422735346358792, "grad_norm": 0.34489502732281585, "learning_rate": 4.0352414880638515e-06, "loss": 0.0367, "step": 2728 }, { "epoch": 2.4236234458259327, "grad_norm": 0.36686567825122324, "learning_rate": 4.031438969674395e-06, "loss": 0.0345, "step": 2729 }, { "epoch": 2.424511545293073, "grad_norm": 0.6001448010579975, "learning_rate": 4.027637033218638e-06, "loss": 0.0633, "step": 2730 }, { "epoch": 2.4253996447602133, "grad_norm": 0.5891148472698688, "learning_rate": 4.023835680980871e-06, "loss": 0.0454, "step": 2731 }, { "epoch": 2.4262877442273534, "grad_norm": 0.36258957798737657, "learning_rate": 4.020034915245033e-06, "loss": 0.0382, "step": 2732 }, { "epoch": 2.427175843694494, "grad_norm": 0.33420820115356176, "learning_rate": 4.016234738294712e-06, "loss": 0.0345, "step": 2733 }, { "epoch": 2.428063943161634, "grad_norm": 0.3920069199046036, "learning_rate": 4.012435152413142e-06, "loss": 0.0379, "step": 2734 }, { "epoch": 2.4289520426287745, "grad_norm": 0.41356769396612303, "learning_rate": 4.008636159883202e-06, "loss": 0.0352, "step": 2735 }, { "epoch": 2.4298401420959146, "grad_norm": 0.4455515310459408, "learning_rate": 4.00483776298741e-06, "loss": 0.0399, "step": 2736 }, { "epoch": 2.430728241563055, "grad_norm": 0.44120035915825245, "learning_rate": 4.001039964007934e-06, "loss": 0.0489, "step": 2737 }, { "epoch": 2.431616341030195, "grad_norm": 0.5120214650549834, "learning_rate": 3.99724276522658e-06, "loss": 0.039, "step": 2738 }, { "epoch": 2.4325044404973357, "grad_norm": 0.38981652065605005, "learning_rate": 3.9934461689247875e-06, "loss": 0.0421, "step": 2739 }, { "epoch": 2.4333925399644762, "grad_norm": 0.568470672972159, "learning_rate": 3.989650177383641e-06, "loss": 0.0407, "step": 2740 }, { "epoch": 2.4342806394316163, "grad_norm": 0.40355470611435923, "learning_rate": 3.98585479288386e-06, "loss": 0.0356, "step": 2741 }, { "epoch": 2.435168738898757, "grad_norm": 0.4116171241446696, "learning_rate": 3.982060017705798e-06, "loss": 0.0382, "step": 2742 }, { "epoch": 2.436056838365897, "grad_norm": 0.40057688779101597, "learning_rate": 3.978265854129443e-06, "loss": 0.0328, "step": 2743 }, { "epoch": 2.4369449378330375, "grad_norm": 0.3094163402024519, "learning_rate": 3.974472304434415e-06, "loss": 0.0313, "step": 2744 }, { "epoch": 2.4378330373001775, "grad_norm": 0.4049276533515741, "learning_rate": 3.970679370899968e-06, "loss": 0.036, "step": 2745 }, { "epoch": 2.438721136767318, "grad_norm": 0.47667367548802525, "learning_rate": 3.966887055804982e-06, "loss": 0.0476, "step": 2746 }, { "epoch": 2.439609236234458, "grad_norm": 0.3766780299327582, "learning_rate": 3.963095361427966e-06, "loss": 0.0325, "step": 2747 }, { "epoch": 2.4404973357015987, "grad_norm": 0.4859547568778557, "learning_rate": 3.959304290047057e-06, "loss": 0.0363, "step": 2748 }, { "epoch": 2.4413854351687387, "grad_norm": 0.4869308975398042, "learning_rate": 3.9555138439400185e-06, "loss": 0.0428, "step": 2749 }, { "epoch": 2.4422735346358793, "grad_norm": 0.30245289236148404, "learning_rate": 3.95172402538424e-06, "loss": 0.0309, "step": 2750 }, { "epoch": 2.44316163410302, "grad_norm": 0.5199088586403358, "learning_rate": 3.947934836656728e-06, "loss": 0.0422, "step": 2751 }, { "epoch": 2.44404973357016, "grad_norm": 0.5265924905344541, "learning_rate": 3.944146280034114e-06, "loss": 0.0486, "step": 2752 }, { "epoch": 2.4449378330373, "grad_norm": 0.4834636258748378, "learning_rate": 3.940358357792651e-06, "loss": 0.0568, "step": 2753 }, { "epoch": 2.4458259325044405, "grad_norm": 0.44039544087123245, "learning_rate": 3.9365710722082115e-06, "loss": 0.0406, "step": 2754 }, { "epoch": 2.446714031971581, "grad_norm": 0.6374575947435365, "learning_rate": 3.93278442555628e-06, "loss": 0.0487, "step": 2755 }, { "epoch": 2.447602131438721, "grad_norm": 0.3804706890254963, "learning_rate": 3.928998420111962e-06, "loss": 0.0357, "step": 2756 }, { "epoch": 2.4484902309058616, "grad_norm": 0.4251012836402806, "learning_rate": 3.925213058149978e-06, "loss": 0.0367, "step": 2757 }, { "epoch": 2.4493783303730017, "grad_norm": 0.3798698706885474, "learning_rate": 3.92142834194466e-06, "loss": 0.0476, "step": 2758 }, { "epoch": 2.450266429840142, "grad_norm": 0.37773651740704356, "learning_rate": 3.917644273769951e-06, "loss": 0.0333, "step": 2759 }, { "epoch": 2.4511545293072823, "grad_norm": 0.4482599796707847, "learning_rate": 3.913860855899406e-06, "loss": 0.0365, "step": 2760 }, { "epoch": 2.452042628774423, "grad_norm": 0.2763609593786991, "learning_rate": 3.91007809060619e-06, "loss": 0.0267, "step": 2761 }, { "epoch": 2.452930728241563, "grad_norm": 0.4978321758468063, "learning_rate": 3.906295980163074e-06, "loss": 0.0416, "step": 2762 }, { "epoch": 2.4538188277087034, "grad_norm": 0.31397778808375126, "learning_rate": 3.902514526842441e-06, "loss": 0.033, "step": 2763 }, { "epoch": 2.4547069271758435, "grad_norm": 0.3851362092370547, "learning_rate": 3.89873373291627e-06, "loss": 0.0401, "step": 2764 }, { "epoch": 2.455595026642984, "grad_norm": 0.438629903479192, "learning_rate": 3.894953600656149e-06, "loss": 0.037, "step": 2765 }, { "epoch": 2.4564831261101245, "grad_norm": 0.46741530520208524, "learning_rate": 3.891174132333272e-06, "loss": 0.0438, "step": 2766 }, { "epoch": 2.4573712255772646, "grad_norm": 0.3797562200330637, "learning_rate": 3.887395330218429e-06, "loss": 0.0342, "step": 2767 }, { "epoch": 2.458259325044405, "grad_norm": 0.3857543496406655, "learning_rate": 3.883617196582009e-06, "loss": 0.0424, "step": 2768 }, { "epoch": 2.459147424511545, "grad_norm": 0.4812522645092898, "learning_rate": 3.879839733694002e-06, "loss": 0.0532, "step": 2769 }, { "epoch": 2.4600355239786857, "grad_norm": 0.3485268002136728, "learning_rate": 3.876062943823996e-06, "loss": 0.0347, "step": 2770 }, { "epoch": 2.460923623445826, "grad_norm": 0.4411614487930039, "learning_rate": 3.872286829241173e-06, "loss": 0.0421, "step": 2771 }, { "epoch": 2.4618117229129663, "grad_norm": 0.48810326046545194, "learning_rate": 3.868511392214307e-06, "loss": 0.0422, "step": 2772 }, { "epoch": 2.4626998223801064, "grad_norm": 0.37294810337952194, "learning_rate": 3.864736635011769e-06, "loss": 0.037, "step": 2773 }, { "epoch": 2.463587921847247, "grad_norm": 0.3728646324306839, "learning_rate": 3.8609625599015185e-06, "loss": 0.0366, "step": 2774 }, { "epoch": 2.464476021314387, "grad_norm": 0.38711426996452747, "learning_rate": 3.85718916915111e-06, "loss": 0.0444, "step": 2775 }, { "epoch": 2.4653641207815276, "grad_norm": 0.5261969291466544, "learning_rate": 3.853416465027679e-06, "loss": 0.0416, "step": 2776 }, { "epoch": 2.466252220248668, "grad_norm": 0.2970242562064065, "learning_rate": 3.849644449797953e-06, "loss": 0.0361, "step": 2777 }, { "epoch": 2.467140319715808, "grad_norm": 0.32613001679980447, "learning_rate": 3.845873125728248e-06, "loss": 0.0371, "step": 2778 }, { "epoch": 2.4680284191829482, "grad_norm": 0.42788023619013316, "learning_rate": 3.842102495084463e-06, "loss": 0.0343, "step": 2779 }, { "epoch": 2.4689165186500888, "grad_norm": 0.4399667867015698, "learning_rate": 3.838332560132073e-06, "loss": 0.0401, "step": 2780 }, { "epoch": 2.4698046181172293, "grad_norm": 0.7905770446460482, "learning_rate": 3.834563323136148e-06, "loss": 0.0412, "step": 2781 }, { "epoch": 2.4706927175843694, "grad_norm": 0.3454084982147548, "learning_rate": 3.83079478636133e-06, "loss": 0.0335, "step": 2782 }, { "epoch": 2.47158081705151, "grad_norm": 0.41736050082198234, "learning_rate": 3.827026952071843e-06, "loss": 0.0398, "step": 2783 }, { "epoch": 2.47246891651865, "grad_norm": 0.414440722711596, "learning_rate": 3.823259822531486e-06, "loss": 0.039, "step": 2784 }, { "epoch": 2.4733570159857905, "grad_norm": 0.4383871393630624, "learning_rate": 3.819493400003639e-06, "loss": 0.0497, "step": 2785 }, { "epoch": 2.4742451154529306, "grad_norm": 0.38402241311475643, "learning_rate": 3.8157276867512534e-06, "loss": 0.0444, "step": 2786 }, { "epoch": 2.475133214920071, "grad_norm": 0.3591487235416993, "learning_rate": 3.8119626850368596e-06, "loss": 0.0336, "step": 2787 }, { "epoch": 2.476021314387211, "grad_norm": 0.4649138044371827, "learning_rate": 3.808198397122553e-06, "loss": 0.0411, "step": 2788 }, { "epoch": 2.4769094138543517, "grad_norm": 0.30891025330418503, "learning_rate": 3.804434825270007e-06, "loss": 0.029, "step": 2789 }, { "epoch": 2.477797513321492, "grad_norm": 0.4057066028505945, "learning_rate": 3.800671971740459e-06, "loss": 0.0373, "step": 2790 }, { "epoch": 2.4786856127886323, "grad_norm": 0.4485134118853294, "learning_rate": 3.7969098387947227e-06, "loss": 0.046, "step": 2791 }, { "epoch": 2.479573712255773, "grad_norm": 0.3741271494789935, "learning_rate": 3.793148428693169e-06, "loss": 0.0373, "step": 2792 }, { "epoch": 2.480461811722913, "grad_norm": 0.3351458516011296, "learning_rate": 3.7893877436957404e-06, "loss": 0.0402, "step": 2793 }, { "epoch": 2.4813499111900534, "grad_norm": 0.3170907636503459, "learning_rate": 3.7856277860619444e-06, "loss": 0.0352, "step": 2794 }, { "epoch": 2.4822380106571935, "grad_norm": 0.3753494565551761, "learning_rate": 3.7818685580508514e-06, "loss": 0.0302, "step": 2795 }, { "epoch": 2.483126110124334, "grad_norm": 0.4120030499457759, "learning_rate": 3.7781100619210863e-06, "loss": 0.0515, "step": 2796 }, { "epoch": 2.484014209591474, "grad_norm": 0.3368195602403163, "learning_rate": 3.7743522999308444e-06, "loss": 0.034, "step": 2797 }, { "epoch": 2.4849023090586146, "grad_norm": 0.41438216257087224, "learning_rate": 3.770595274337874e-06, "loss": 0.0334, "step": 2798 }, { "epoch": 2.4857904085257547, "grad_norm": 0.33862474385802527, "learning_rate": 3.7668389873994838e-06, "loss": 0.0321, "step": 2799 }, { "epoch": 2.4866785079928952, "grad_norm": 0.3582555091253264, "learning_rate": 3.763083441372535e-06, "loss": 0.0349, "step": 2800 }, { "epoch": 2.4875666074600353, "grad_norm": 0.40475396015514215, "learning_rate": 3.7593286385134465e-06, "loss": 0.0494, "step": 2801 }, { "epoch": 2.488454706927176, "grad_norm": 0.38091393825789654, "learning_rate": 3.7555745810781897e-06, "loss": 0.0344, "step": 2802 }, { "epoch": 2.4893428063943164, "grad_norm": 0.339786667559056, "learning_rate": 3.7518212713222905e-06, "loss": 0.0345, "step": 2803 }, { "epoch": 2.4902309058614565, "grad_norm": 0.4273878372700921, "learning_rate": 3.7480687115008208e-06, "loss": 0.0443, "step": 2804 }, { "epoch": 2.491119005328597, "grad_norm": 0.500913255920978, "learning_rate": 3.744316903868407e-06, "loss": 0.0458, "step": 2805 }, { "epoch": 2.492007104795737, "grad_norm": 0.321788951593254, "learning_rate": 3.7405658506792173e-06, "loss": 0.0318, "step": 2806 }, { "epoch": 2.4928952042628776, "grad_norm": 0.3168269059365531, "learning_rate": 3.736815554186978e-06, "loss": 0.0343, "step": 2807 }, { "epoch": 2.4937833037300177, "grad_norm": 0.34346931724268553, "learning_rate": 3.7330660166449456e-06, "loss": 0.0379, "step": 2808 }, { "epoch": 2.494671403197158, "grad_norm": 0.3804179544568355, "learning_rate": 3.729317240305932e-06, "loss": 0.0374, "step": 2809 }, { "epoch": 2.4955595026642983, "grad_norm": 0.3949637629728905, "learning_rate": 3.7255692274222877e-06, "loss": 0.0347, "step": 2810 }, { "epoch": 2.496447602131439, "grad_norm": 0.3392338453183855, "learning_rate": 3.7218219802459066e-06, "loss": 0.0364, "step": 2811 }, { "epoch": 2.497335701598579, "grad_norm": 0.3686146599576966, "learning_rate": 3.7180755010282168e-06, "loss": 0.0441, "step": 2812 }, { "epoch": 2.4982238010657194, "grad_norm": 0.34774061639894743, "learning_rate": 3.7143297920201914e-06, "loss": 0.0366, "step": 2813 }, { "epoch": 2.49911190053286, "grad_norm": 0.49083038717662253, "learning_rate": 3.710584855472339e-06, "loss": 0.0393, "step": 2814 }, { "epoch": 2.5, "grad_norm": 0.3572366891807521, "learning_rate": 3.706840693634703e-06, "loss": 0.0367, "step": 2815 }, { "epoch": 2.50088809946714, "grad_norm": 0.4455845738306417, "learning_rate": 3.70309730875686e-06, "loss": 0.0461, "step": 2816 }, { "epoch": 2.5017761989342806, "grad_norm": 0.5198819104211502, "learning_rate": 3.699354703087923e-06, "loss": 0.0376, "step": 2817 }, { "epoch": 2.502664298401421, "grad_norm": 0.4422211669001438, "learning_rate": 3.6956128788765344e-06, "loss": 0.0403, "step": 2818 }, { "epoch": 2.503552397868561, "grad_norm": 0.46793559864748135, "learning_rate": 3.6918718383708724e-06, "loss": 0.0417, "step": 2819 }, { "epoch": 2.5044404973357017, "grad_norm": 0.34059852627396947, "learning_rate": 3.6881315838186343e-06, "loss": 0.0338, "step": 2820 }, { "epoch": 2.505328596802842, "grad_norm": 0.4051312569334247, "learning_rate": 3.684392117467052e-06, "loss": 0.0388, "step": 2821 }, { "epoch": 2.5062166962699823, "grad_norm": 0.42607681600785546, "learning_rate": 3.680653441562885e-06, "loss": 0.0377, "step": 2822 }, { "epoch": 2.5071047957371224, "grad_norm": 0.355778634899678, "learning_rate": 3.6769155583524146e-06, "loss": 0.0322, "step": 2823 }, { "epoch": 2.507992895204263, "grad_norm": 0.3925274860510811, "learning_rate": 3.6731784700814476e-06, "loss": 0.0345, "step": 2824 }, { "epoch": 2.5088809946714035, "grad_norm": 0.3848164029146997, "learning_rate": 3.6694421789953106e-06, "loss": 0.0403, "step": 2825 }, { "epoch": 2.5097690941385435, "grad_norm": 0.3576375902415348, "learning_rate": 3.6657066873388535e-06, "loss": 0.0396, "step": 2826 }, { "epoch": 2.5106571936056836, "grad_norm": 0.40225398181238375, "learning_rate": 3.6619719973564474e-06, "loss": 0.0464, "step": 2827 }, { "epoch": 2.511545293072824, "grad_norm": 0.43150829076536945, "learning_rate": 3.658238111291977e-06, "loss": 0.0485, "step": 2828 }, { "epoch": 2.5124333925399647, "grad_norm": 0.5158332048639124, "learning_rate": 3.654505031388849e-06, "loss": 0.0435, "step": 2829 }, { "epoch": 2.5133214920071048, "grad_norm": 0.5704487254885319, "learning_rate": 3.6507727598899824e-06, "loss": 0.0308, "step": 2830 }, { "epoch": 2.5142095914742453, "grad_norm": 0.37261650251003003, "learning_rate": 3.6470412990378094e-06, "loss": 0.0401, "step": 2831 }, { "epoch": 2.5150976909413854, "grad_norm": 0.4524888623403166, "learning_rate": 3.643310651074283e-06, "loss": 0.0365, "step": 2832 }, { "epoch": 2.515985790408526, "grad_norm": 0.36724890281451805, "learning_rate": 3.6395808182408576e-06, "loss": 0.0335, "step": 2833 }, { "epoch": 2.516873889875666, "grad_norm": 0.3470388730002607, "learning_rate": 3.635851802778502e-06, "loss": 0.0312, "step": 2834 }, { "epoch": 2.5177619893428065, "grad_norm": 0.36925857817379404, "learning_rate": 3.6321236069276974e-06, "loss": 0.041, "step": 2835 }, { "epoch": 2.5186500888099466, "grad_norm": 0.42091887871820133, "learning_rate": 3.6283962329284296e-06, "loss": 0.0385, "step": 2836 }, { "epoch": 2.519538188277087, "grad_norm": 0.39060575848952295, "learning_rate": 3.6246696830201857e-06, "loss": 0.0346, "step": 2837 }, { "epoch": 2.520426287744227, "grad_norm": 0.41001251654868726, "learning_rate": 3.620943959441966e-06, "loss": 0.0407, "step": 2838 }, { "epoch": 2.5213143872113677, "grad_norm": 0.6384879319000327, "learning_rate": 3.61721906443227e-06, "loss": 0.0399, "step": 2839 }, { "epoch": 2.522202486678508, "grad_norm": 0.6358890048725068, "learning_rate": 3.6134950002291024e-06, "loss": 0.0458, "step": 2840 }, { "epoch": 2.5230905861456483, "grad_norm": 0.41654454959931, "learning_rate": 3.609771769069963e-06, "loss": 0.0484, "step": 2841 }, { "epoch": 2.5239786856127884, "grad_norm": 0.41026870628096135, "learning_rate": 3.6060493731918556e-06, "loss": 0.0529, "step": 2842 }, { "epoch": 2.524866785079929, "grad_norm": 0.36026343844245623, "learning_rate": 3.6023278148312825e-06, "loss": 0.0277, "step": 2843 }, { "epoch": 2.5257548845470694, "grad_norm": 0.6081041401548583, "learning_rate": 3.59860709622424e-06, "loss": 0.0378, "step": 2844 }, { "epoch": 2.5266429840142095, "grad_norm": 0.4201293121477265, "learning_rate": 3.594887219606221e-06, "loss": 0.0381, "step": 2845 }, { "epoch": 2.52753108348135, "grad_norm": 0.32886891777410415, "learning_rate": 3.5911681872122134e-06, "loss": 0.0338, "step": 2846 }, { "epoch": 2.52841918294849, "grad_norm": 0.37576580295724116, "learning_rate": 3.587450001276696e-06, "loss": 0.0382, "step": 2847 }, { "epoch": 2.5293072824156306, "grad_norm": 0.5649131229696506, "learning_rate": 3.5837326640336446e-06, "loss": 0.038, "step": 2848 }, { "epoch": 2.5301953818827707, "grad_norm": 0.5380441758868038, "learning_rate": 3.5800161777165142e-06, "loss": 0.0505, "step": 2849 }, { "epoch": 2.5310834813499112, "grad_norm": 0.3520682042354716, "learning_rate": 3.5763005445582598e-06, "loss": 0.0327, "step": 2850 }, { "epoch": 2.5319715808170518, "grad_norm": 0.46086011318357123, "learning_rate": 3.572585766791318e-06, "loss": 0.0503, "step": 2851 }, { "epoch": 2.532859680284192, "grad_norm": 0.333696984351875, "learning_rate": 3.5688718466476126e-06, "loss": 0.0321, "step": 2852 }, { "epoch": 2.533747779751332, "grad_norm": 0.37443283983859443, "learning_rate": 3.565158786358551e-06, "loss": 0.0324, "step": 2853 }, { "epoch": 2.5346358792184724, "grad_norm": 0.387891504117701, "learning_rate": 3.561446588155026e-06, "loss": 0.0368, "step": 2854 }, { "epoch": 2.535523978685613, "grad_norm": 0.35003652913451, "learning_rate": 3.557735254267411e-06, "loss": 0.0374, "step": 2855 }, { "epoch": 2.536412078152753, "grad_norm": 0.3848763312813055, "learning_rate": 3.554024786925562e-06, "loss": 0.0373, "step": 2856 }, { "epoch": 2.5373001776198936, "grad_norm": 0.5812958824030889, "learning_rate": 3.5503151883588105e-06, "loss": 0.0407, "step": 2857 }, { "epoch": 2.5381882770870337, "grad_norm": 0.3621047327198333, "learning_rate": 3.5466064607959706e-06, "loss": 0.042, "step": 2858 }, { "epoch": 2.539076376554174, "grad_norm": 0.37045340152949985, "learning_rate": 3.5428986064653292e-06, "loss": 0.0388, "step": 2859 }, { "epoch": 2.5399644760213143, "grad_norm": 0.32207390348490944, "learning_rate": 3.5391916275946524e-06, "loss": 0.0305, "step": 2860 }, { "epoch": 2.540852575488455, "grad_norm": 0.394066356311656, "learning_rate": 3.5354855264111766e-06, "loss": 0.0358, "step": 2861 }, { "epoch": 2.5417406749555953, "grad_norm": 0.34432415195908195, "learning_rate": 3.531780305141611e-06, "loss": 0.0397, "step": 2862 }, { "epoch": 2.5426287744227354, "grad_norm": 0.36581966863870385, "learning_rate": 3.528075966012141e-06, "loss": 0.038, "step": 2863 }, { "epoch": 2.5435168738898755, "grad_norm": 0.5032688099131521, "learning_rate": 3.5243725112484195e-06, "loss": 0.0379, "step": 2864 }, { "epoch": 2.544404973357016, "grad_norm": 0.38488727674835654, "learning_rate": 3.5206699430755608e-06, "loss": 0.0409, "step": 2865 }, { "epoch": 2.5452930728241565, "grad_norm": 0.37304866547132043, "learning_rate": 3.516968263718159e-06, "loss": 0.0339, "step": 2866 }, { "epoch": 2.5461811722912966, "grad_norm": 0.4307250413119771, "learning_rate": 3.5132674754002647e-06, "loss": 0.0397, "step": 2867 }, { "epoch": 2.5470692717584367, "grad_norm": 0.34368311854340644, "learning_rate": 3.509567580345399e-06, "loss": 0.0375, "step": 2868 }, { "epoch": 2.547957371225577, "grad_norm": 0.37156540513181685, "learning_rate": 3.505868580776541e-06, "loss": 0.035, "step": 2869 }, { "epoch": 2.5488454706927177, "grad_norm": 0.36206664437516267, "learning_rate": 3.502170478916136e-06, "loss": 0.035, "step": 2870 }, { "epoch": 2.549733570159858, "grad_norm": 0.343392725047896, "learning_rate": 3.498473276986088e-06, "loss": 0.0329, "step": 2871 }, { "epoch": 2.5506216696269983, "grad_norm": 0.3900332030331758, "learning_rate": 3.494776977207762e-06, "loss": 0.0336, "step": 2872 }, { "epoch": 2.5515097690941384, "grad_norm": 0.3745245008953631, "learning_rate": 3.4910815818019783e-06, "loss": 0.0394, "step": 2873 }, { "epoch": 2.552397868561279, "grad_norm": 0.43092854439295397, "learning_rate": 3.4873870929890165e-06, "loss": 0.0519, "step": 2874 }, { "epoch": 2.553285968028419, "grad_norm": 0.33144216286454115, "learning_rate": 3.4836935129886073e-06, "loss": 0.0403, "step": 2875 }, { "epoch": 2.5541740674955595, "grad_norm": 0.44095855076671014, "learning_rate": 3.4800008440199445e-06, "loss": 0.0363, "step": 2876 }, { "epoch": 2.5550621669627, "grad_norm": 0.36678200335324146, "learning_rate": 3.4763090883016627e-06, "loss": 0.042, "step": 2877 }, { "epoch": 2.55595026642984, "grad_norm": 0.4160810683582419, "learning_rate": 3.4726182480518534e-06, "loss": 0.0417, "step": 2878 }, { "epoch": 2.55683836589698, "grad_norm": 0.2806849700019478, "learning_rate": 3.4689283254880617e-06, "loss": 0.0321, "step": 2879 }, { "epoch": 2.5577264653641207, "grad_norm": 0.35270937093838695, "learning_rate": 3.465239322827277e-06, "loss": 0.0495, "step": 2880 }, { "epoch": 2.5586145648312613, "grad_norm": 0.3625520593586307, "learning_rate": 3.4615512422859354e-06, "loss": 0.042, "step": 2881 }, { "epoch": 2.5595026642984013, "grad_norm": 0.32298762004616643, "learning_rate": 3.4578640860799204e-06, "loss": 0.0317, "step": 2882 }, { "epoch": 2.560390763765542, "grad_norm": 0.36807535325436513, "learning_rate": 3.454177856424561e-06, "loss": 0.0376, "step": 2883 }, { "epoch": 2.561278863232682, "grad_norm": 0.3473177487556965, "learning_rate": 3.450492555534628e-06, "loss": 0.0383, "step": 2884 }, { "epoch": 2.5621669626998225, "grad_norm": 0.36229498204501787, "learning_rate": 3.446808185624335e-06, "loss": 0.0411, "step": 2885 }, { "epoch": 2.5630550621669625, "grad_norm": 0.37695665100036135, "learning_rate": 3.443124748907336e-06, "loss": 0.0407, "step": 2886 }, { "epoch": 2.563943161634103, "grad_norm": 0.4413943400042454, "learning_rate": 3.439442247596724e-06, "loss": 0.042, "step": 2887 }, { "epoch": 2.5648312611012436, "grad_norm": 0.28336163559055605, "learning_rate": 3.4357606839050293e-06, "loss": 0.0332, "step": 2888 }, { "epoch": 2.5657193605683837, "grad_norm": 0.366834742739253, "learning_rate": 3.4320800600442242e-06, "loss": 0.0298, "step": 2889 }, { "epoch": 2.5666074600355238, "grad_norm": 0.4107521682052408, "learning_rate": 3.4284003782257076e-06, "loss": 0.0436, "step": 2890 }, { "epoch": 2.5674955595026643, "grad_norm": 0.36825952705836196, "learning_rate": 3.424721640660316e-06, "loss": 0.0436, "step": 2891 }, { "epoch": 2.568383658969805, "grad_norm": 0.4261051183471473, "learning_rate": 3.421043849558323e-06, "loss": 0.0418, "step": 2892 }, { "epoch": 2.569271758436945, "grad_norm": 0.34718143598343243, "learning_rate": 3.41736700712943e-06, "loss": 0.0455, "step": 2893 }, { "epoch": 2.5701598579040854, "grad_norm": 0.34991386925439155, "learning_rate": 3.4136911155827655e-06, "loss": 0.0398, "step": 2894 }, { "epoch": 2.5710479573712255, "grad_norm": 0.35068336409204465, "learning_rate": 3.4100161771268903e-06, "loss": 0.0342, "step": 2895 }, { "epoch": 2.571936056838366, "grad_norm": 0.37072320692567096, "learning_rate": 3.4063421939697925e-06, "loss": 0.0284, "step": 2896 }, { "epoch": 2.572824156305506, "grad_norm": 0.4521511118608954, "learning_rate": 3.4026691683188863e-06, "loss": 0.0435, "step": 2897 }, { "epoch": 2.5737122557726466, "grad_norm": 0.374071820358087, "learning_rate": 3.3989971023810075e-06, "loss": 0.0372, "step": 2898 }, { "epoch": 2.5746003552397867, "grad_norm": 0.40806820394640164, "learning_rate": 3.39532599836242e-06, "loss": 0.0361, "step": 2899 }, { "epoch": 2.575488454706927, "grad_norm": 0.45178010575771294, "learning_rate": 3.3916558584688055e-06, "loss": 0.0383, "step": 2900 }, { "epoch": 2.5763765541740673, "grad_norm": 0.3505220127517853, "learning_rate": 3.3879866849052694e-06, "loss": 0.0361, "step": 2901 }, { "epoch": 2.577264653641208, "grad_norm": 0.30114347233551425, "learning_rate": 3.3843184798763345e-06, "loss": 0.0295, "step": 2902 }, { "epoch": 2.5781527531083483, "grad_norm": 0.36488150352891885, "learning_rate": 3.380651245585942e-06, "loss": 0.0451, "step": 2903 }, { "epoch": 2.5790408525754884, "grad_norm": 0.3484295377210476, "learning_rate": 3.376984984237453e-06, "loss": 0.0359, "step": 2904 }, { "epoch": 2.5799289520426285, "grad_norm": 0.3194184868830932, "learning_rate": 3.373319698033642e-06, "loss": 0.0364, "step": 2905 }, { "epoch": 2.580817051509769, "grad_norm": 0.3669837511743523, "learning_rate": 3.3696553891766915e-06, "loss": 0.0347, "step": 2906 }, { "epoch": 2.5817051509769096, "grad_norm": 0.3339126721912081, "learning_rate": 3.3659920598682076e-06, "loss": 0.0363, "step": 2907 }, { "epoch": 2.5825932504440496, "grad_norm": 0.34543586647078134, "learning_rate": 3.3623297123092007e-06, "loss": 0.0371, "step": 2908 }, { "epoch": 2.58348134991119, "grad_norm": 0.3732936486347056, "learning_rate": 3.358668348700095e-06, "loss": 0.0296, "step": 2909 }, { "epoch": 2.5843694493783302, "grad_norm": 0.298841796117749, "learning_rate": 3.355007971240719e-06, "loss": 0.0311, "step": 2910 }, { "epoch": 2.5852575488454708, "grad_norm": 0.4702763133687615, "learning_rate": 3.3513485821303133e-06, "loss": 0.0484, "step": 2911 }, { "epoch": 2.586145648312611, "grad_norm": 0.3860602131308114, "learning_rate": 3.3476901835675225e-06, "loss": 0.0317, "step": 2912 }, { "epoch": 2.5870337477797514, "grad_norm": 0.384854003859277, "learning_rate": 3.3440327777503965e-06, "loss": 0.0357, "step": 2913 }, { "epoch": 2.587921847246892, "grad_norm": 0.4383993297060523, "learning_rate": 3.340376366876389e-06, "loss": 0.0336, "step": 2914 }, { "epoch": 2.588809946714032, "grad_norm": 0.356911280912349, "learning_rate": 3.336720953142354e-06, "loss": 0.0306, "step": 2915 }, { "epoch": 2.589698046181172, "grad_norm": 0.33852240382143295, "learning_rate": 3.333066538744548e-06, "loss": 0.0346, "step": 2916 }, { "epoch": 2.5905861456483126, "grad_norm": 0.38118336262773833, "learning_rate": 3.3294131258786323e-06, "loss": 0.0386, "step": 2917 }, { "epoch": 2.591474245115453, "grad_norm": 0.4241710970512753, "learning_rate": 3.3257607167396544e-06, "loss": 0.0424, "step": 2918 }, { "epoch": 2.592362344582593, "grad_norm": 0.3569774734083075, "learning_rate": 3.322109313522067e-06, "loss": 0.0368, "step": 2919 }, { "epoch": 2.5932504440497337, "grad_norm": 0.5444549033396232, "learning_rate": 3.3184589184197196e-06, "loss": 0.0493, "step": 2920 }, { "epoch": 2.594138543516874, "grad_norm": 0.2617625106184296, "learning_rate": 3.314809533625853e-06, "loss": 0.025, "step": 2921 }, { "epoch": 2.5950266429840143, "grad_norm": 0.363478572714634, "learning_rate": 3.3111611613330997e-06, "loss": 0.0362, "step": 2922 }, { "epoch": 2.5959147424511544, "grad_norm": 0.3203471913208762, "learning_rate": 3.307513803733485e-06, "loss": 0.031, "step": 2923 }, { "epoch": 2.596802841918295, "grad_norm": 0.594434496755036, "learning_rate": 3.3038674630184277e-06, "loss": 0.0484, "step": 2924 }, { "epoch": 2.5976909413854354, "grad_norm": 0.5404762678979643, "learning_rate": 3.3002221413787312e-06, "loss": 0.0488, "step": 2925 }, { "epoch": 2.5985790408525755, "grad_norm": 0.389023684590373, "learning_rate": 3.296577841004588e-06, "loss": 0.0396, "step": 2926 }, { "epoch": 2.5994671403197156, "grad_norm": 0.3645946034336746, "learning_rate": 3.2929345640855786e-06, "loss": 0.0407, "step": 2927 }, { "epoch": 2.600355239786856, "grad_norm": 0.3698995047619411, "learning_rate": 3.289292312810667e-06, "loss": 0.0313, "step": 2928 }, { "epoch": 2.6012433392539966, "grad_norm": 0.3785277559545529, "learning_rate": 3.285651089368202e-06, "loss": 0.0308, "step": 2929 }, { "epoch": 2.6021314387211367, "grad_norm": 0.42401408231598636, "learning_rate": 3.282010895945913e-06, "loss": 0.0399, "step": 2930 }, { "epoch": 2.6030195381882772, "grad_norm": 0.39439939566153936, "learning_rate": 3.278371734730912e-06, "loss": 0.0317, "step": 2931 }, { "epoch": 2.6039076376554173, "grad_norm": 0.31028852178930894, "learning_rate": 3.274733607909689e-06, "loss": 0.0369, "step": 2932 }, { "epoch": 2.604795737122558, "grad_norm": 0.37252961773648, "learning_rate": 3.2710965176681204e-06, "loss": 0.035, "step": 2933 }, { "epoch": 2.605683836589698, "grad_norm": 0.4535072794437051, "learning_rate": 3.2674604661914455e-06, "loss": 0.0371, "step": 2934 }, { "epoch": 2.6065719360568385, "grad_norm": 0.3969832457752023, "learning_rate": 3.2638254556642925e-06, "loss": 0.0325, "step": 2935 }, { "epoch": 2.6074600355239785, "grad_norm": 0.4283737347708285, "learning_rate": 3.2601914882706564e-06, "loss": 0.0374, "step": 2936 }, { "epoch": 2.608348134991119, "grad_norm": 0.4130381818412859, "learning_rate": 3.256558566193912e-06, "loss": 0.0376, "step": 2937 }, { "epoch": 2.609236234458259, "grad_norm": 0.42132721189588773, "learning_rate": 3.2529266916167986e-06, "loss": 0.0425, "step": 2938 }, { "epoch": 2.6101243339253997, "grad_norm": 0.43400390122551263, "learning_rate": 3.2492958667214307e-06, "loss": 0.0446, "step": 2939 }, { "epoch": 2.61101243339254, "grad_norm": 0.3466476212168075, "learning_rate": 3.2456660936892915e-06, "loss": 0.0325, "step": 2940 }, { "epoch": 2.6119005328596803, "grad_norm": 0.34737023769869063, "learning_rate": 3.2420373747012335e-06, "loss": 0.0364, "step": 2941 }, { "epoch": 2.6127886323268203, "grad_norm": 0.4005533408704453, "learning_rate": 3.238409711937472e-06, "loss": 0.0413, "step": 2942 }, { "epoch": 2.613676731793961, "grad_norm": 0.3947731548238819, "learning_rate": 3.2347831075775902e-06, "loss": 0.04, "step": 2943 }, { "epoch": 2.6145648312611014, "grad_norm": 0.3995688788988381, "learning_rate": 3.231157563800536e-06, "loss": 0.0391, "step": 2944 }, { "epoch": 2.6154529307282415, "grad_norm": 0.4067894054610786, "learning_rate": 3.227533082784622e-06, "loss": 0.0352, "step": 2945 }, { "epoch": 2.616341030195382, "grad_norm": 0.3548331867833077, "learning_rate": 3.2239096667075146e-06, "loss": 0.0339, "step": 2946 }, { "epoch": 2.617229129662522, "grad_norm": 0.36989131123859503, "learning_rate": 3.220287317746247e-06, "loss": 0.0372, "step": 2947 }, { "epoch": 2.6181172291296626, "grad_norm": 0.37035928762402337, "learning_rate": 3.216666038077211e-06, "loss": 0.033, "step": 2948 }, { "epoch": 2.6190053285968027, "grad_norm": 0.4617474395181904, "learning_rate": 3.2130458298761545e-06, "loss": 0.0346, "step": 2949 }, { "epoch": 2.619893428063943, "grad_norm": 0.29683093274114897, "learning_rate": 3.2094266953181817e-06, "loss": 0.039, "step": 2950 }, { "epoch": 2.6207815275310837, "grad_norm": 0.3736082637973777, "learning_rate": 3.2058086365777496e-06, "loss": 0.043, "step": 2951 }, { "epoch": 2.621669626998224, "grad_norm": 0.33112148916076817, "learning_rate": 3.2021916558286726e-06, "loss": 0.0411, "step": 2952 }, { "epoch": 2.622557726465364, "grad_norm": 0.3930293105597273, "learning_rate": 3.1985757552441153e-06, "loss": 0.0359, "step": 2953 }, { "epoch": 2.6234458259325044, "grad_norm": 0.3862605851823903, "learning_rate": 3.194960936996596e-06, "loss": 0.0289, "step": 2954 }, { "epoch": 2.624333925399645, "grad_norm": 0.3151600324706895, "learning_rate": 3.1913472032579767e-06, "loss": 0.0329, "step": 2955 }, { "epoch": 2.625222024866785, "grad_norm": 0.3524541625651139, "learning_rate": 3.1877345561994733e-06, "loss": 0.0352, "step": 2956 }, { "epoch": 2.6261101243339255, "grad_norm": 0.337047036097464, "learning_rate": 3.1841229979916465e-06, "loss": 0.0339, "step": 2957 }, { "epoch": 2.6269982238010656, "grad_norm": 0.35556623893984984, "learning_rate": 3.180512530804407e-06, "loss": 0.0344, "step": 2958 }, { "epoch": 2.627886323268206, "grad_norm": 0.3193329513631799, "learning_rate": 3.176903156807001e-06, "loss": 0.0317, "step": 2959 }, { "epoch": 2.6287744227353462, "grad_norm": 0.3298905491531614, "learning_rate": 3.173294878168025e-06, "loss": 0.0376, "step": 2960 }, { "epoch": 2.6296625222024868, "grad_norm": 0.39272454144962626, "learning_rate": 3.1696876970554168e-06, "loss": 0.0394, "step": 2961 }, { "epoch": 2.630550621669627, "grad_norm": 0.3627808441763758, "learning_rate": 3.1660816156364554e-06, "loss": 0.0324, "step": 2962 }, { "epoch": 2.6314387211367674, "grad_norm": 0.43259092133880245, "learning_rate": 3.162476636077752e-06, "loss": 0.0501, "step": 2963 }, { "epoch": 2.6323268206039074, "grad_norm": 0.3483421661518904, "learning_rate": 3.158872760545265e-06, "loss": 0.0394, "step": 2964 }, { "epoch": 2.633214920071048, "grad_norm": 0.4716050015458488, "learning_rate": 3.1552699912042838e-06, "loss": 0.0482, "step": 2965 }, { "epoch": 2.6341030195381885, "grad_norm": 0.4481975800780285, "learning_rate": 3.151668330219438e-06, "loss": 0.045, "step": 2966 }, { "epoch": 2.6349911190053286, "grad_norm": 0.3278300926385035, "learning_rate": 3.1480677797546844e-06, "loss": 0.032, "step": 2967 }, { "epoch": 2.6358792184724686, "grad_norm": 0.33690863420780326, "learning_rate": 3.144468341973317e-06, "loss": 0.0336, "step": 2968 }, { "epoch": 2.636767317939609, "grad_norm": 0.3783747932133116, "learning_rate": 3.140870019037963e-06, "loss": 0.0359, "step": 2969 }, { "epoch": 2.6376554174067497, "grad_norm": 0.425392831471685, "learning_rate": 3.1372728131105755e-06, "loss": 0.0446, "step": 2970 }, { "epoch": 2.6385435168738898, "grad_norm": 0.7784890754673194, "learning_rate": 3.133676726352438e-06, "loss": 0.0479, "step": 2971 }, { "epoch": 2.6394316163410303, "grad_norm": 0.5111252232332689, "learning_rate": 3.130081760924163e-06, "loss": 0.0384, "step": 2972 }, { "epoch": 2.6403197158081704, "grad_norm": 0.3171858484190841, "learning_rate": 3.1264879189856872e-06, "loss": 0.0276, "step": 2973 }, { "epoch": 2.641207815275311, "grad_norm": 0.33107180168551675, "learning_rate": 3.1228952026962767e-06, "loss": 0.0355, "step": 2974 }, { "epoch": 2.642095914742451, "grad_norm": 0.4442599606911684, "learning_rate": 3.119303614214513e-06, "loss": 0.0385, "step": 2975 }, { "epoch": 2.6429840142095915, "grad_norm": 0.3316176091595877, "learning_rate": 3.115713155698308e-06, "loss": 0.0327, "step": 2976 }, { "epoch": 2.643872113676732, "grad_norm": 0.3363556403586757, "learning_rate": 3.1121238293048905e-06, "loss": 0.0387, "step": 2977 }, { "epoch": 2.644760213143872, "grad_norm": 0.4465577388620344, "learning_rate": 3.1085356371908115e-06, "loss": 0.043, "step": 2978 }, { "epoch": 2.645648312611012, "grad_norm": 0.3669758155338096, "learning_rate": 3.1049485815119375e-06, "loss": 0.0392, "step": 2979 }, { "epoch": 2.6465364120781527, "grad_norm": 0.46008973055391744, "learning_rate": 3.1013626644234544e-06, "loss": 0.0554, "step": 2980 }, { "epoch": 2.6474245115452932, "grad_norm": 0.4244444704806108, "learning_rate": 3.097777888079864e-06, "loss": 0.0342, "step": 2981 }, { "epoch": 2.6483126110124333, "grad_norm": 0.367731556869747, "learning_rate": 3.0941942546349826e-06, "loss": 0.0439, "step": 2982 }, { "epoch": 2.649200710479574, "grad_norm": 0.4656929380593179, "learning_rate": 3.090611766241938e-06, "loss": 0.0472, "step": 2983 }, { "epoch": 2.650088809946714, "grad_norm": 0.6473793402715186, "learning_rate": 3.087030425053172e-06, "loss": 0.0414, "step": 2984 }, { "epoch": 2.6509769094138544, "grad_norm": 0.3793462821056831, "learning_rate": 3.0834502332204357e-06, "loss": 0.0453, "step": 2985 }, { "epoch": 2.6518650088809945, "grad_norm": 0.37597507486313714, "learning_rate": 3.0798711928947942e-06, "loss": 0.0352, "step": 2986 }, { "epoch": 2.652753108348135, "grad_norm": 0.4146752423449461, "learning_rate": 3.0762933062266137e-06, "loss": 0.0367, "step": 2987 }, { "epoch": 2.6536412078152756, "grad_norm": 0.36351325144612573, "learning_rate": 3.0727165753655696e-06, "loss": 0.0405, "step": 2988 }, { "epoch": 2.6545293072824157, "grad_norm": 0.35421094882711285, "learning_rate": 3.0691410024606473e-06, "loss": 0.0354, "step": 2989 }, { "epoch": 2.6554174067495557, "grad_norm": 0.4496461727993196, "learning_rate": 3.0655665896601328e-06, "loss": 0.0415, "step": 2990 }, { "epoch": 2.6563055062166963, "grad_norm": 0.36727983724418684, "learning_rate": 3.061993339111611e-06, "loss": 0.0407, "step": 2991 }, { "epoch": 2.657193605683837, "grad_norm": 0.41815552831259034, "learning_rate": 3.0584212529619777e-06, "loss": 0.0339, "step": 2992 }, { "epoch": 2.658081705150977, "grad_norm": 0.32399233087319707, "learning_rate": 3.054850333357422e-06, "loss": 0.0286, "step": 2993 }, { "epoch": 2.6589698046181174, "grad_norm": 0.4371919155295159, "learning_rate": 3.051280582443436e-06, "loss": 0.0389, "step": 2994 }, { "epoch": 2.6598579040852575, "grad_norm": 0.4733948146254441, "learning_rate": 3.0477120023648054e-06, "loss": 0.038, "step": 2995 }, { "epoch": 2.660746003552398, "grad_norm": 0.4172994785595082, "learning_rate": 3.044144595265617e-06, "loss": 0.0408, "step": 2996 }, { "epoch": 2.661634103019538, "grad_norm": 0.4185737348870064, "learning_rate": 3.0405783632892504e-06, "loss": 0.0382, "step": 2997 }, { "epoch": 2.6625222024866786, "grad_norm": 0.43850163979209644, "learning_rate": 3.0370133085783802e-06, "loss": 0.0342, "step": 2998 }, { "epoch": 2.6634103019538187, "grad_norm": 0.36394452920851955, "learning_rate": 3.0334494332749716e-06, "loss": 0.038, "step": 2999 }, { "epoch": 2.664298401420959, "grad_norm": 0.8676387559738967, "learning_rate": 3.0298867395202835e-06, "loss": 0.0381, "step": 3000 }, { "epoch": 2.6651865008880993, "grad_norm": 0.3211973265446368, "learning_rate": 3.0263252294548627e-06, "loss": 0.0379, "step": 3001 }, { "epoch": 2.66607460035524, "grad_norm": 0.3287240144142483, "learning_rate": 3.0227649052185516e-06, "loss": 0.0333, "step": 3002 }, { "epoch": 2.6669626998223803, "grad_norm": 0.4447839208547026, "learning_rate": 3.019205768950468e-06, "loss": 0.036, "step": 3003 }, { "epoch": 2.6678507992895204, "grad_norm": 0.5902655962428176, "learning_rate": 3.0156478227890248e-06, "loss": 0.0375, "step": 3004 }, { "epoch": 2.6687388987566605, "grad_norm": 0.47690906179300235, "learning_rate": 3.0120910688719185e-06, "loss": 0.0415, "step": 3005 }, { "epoch": 2.669626998223801, "grad_norm": 0.3396342251266381, "learning_rate": 3.0085355093361302e-06, "loss": 0.0379, "step": 3006 }, { "epoch": 2.6705150976909415, "grad_norm": 0.5088779799265526, "learning_rate": 3.004981146317919e-06, "loss": 0.0367, "step": 3007 }, { "epoch": 2.6714031971580816, "grad_norm": 0.36418986487674176, "learning_rate": 3.001427981952828e-06, "loss": 0.0342, "step": 3008 }, { "epoch": 2.672291296625222, "grad_norm": 0.3954559374557829, "learning_rate": 2.9978760183756805e-06, "loss": 0.0354, "step": 3009 }, { "epoch": 2.673179396092362, "grad_norm": 0.3641799103563655, "learning_rate": 2.9943252577205784e-06, "loss": 0.0347, "step": 3010 }, { "epoch": 2.6740674955595027, "grad_norm": 0.3308059502230103, "learning_rate": 2.9907757021208995e-06, "loss": 0.0352, "step": 3011 }, { "epoch": 2.674955595026643, "grad_norm": 0.33698903752999726, "learning_rate": 2.987227353709298e-06, "loss": 0.0325, "step": 3012 }, { "epoch": 2.6758436944937833, "grad_norm": 0.36608490742160205, "learning_rate": 2.9836802146177034e-06, "loss": 0.037, "step": 3013 }, { "epoch": 2.676731793960924, "grad_norm": 0.3829778651837029, "learning_rate": 2.9801342869773178e-06, "loss": 0.0328, "step": 3014 }, { "epoch": 2.677619893428064, "grad_norm": 0.4413181391001877, "learning_rate": 2.976589572918619e-06, "loss": 0.0372, "step": 3015 }, { "epoch": 2.678507992895204, "grad_norm": 0.4096440798592964, "learning_rate": 2.9730460745713464e-06, "loss": 0.0351, "step": 3016 }, { "epoch": 2.6793960923623446, "grad_norm": 0.34925480702046746, "learning_rate": 2.9695037940645203e-06, "loss": 0.0361, "step": 3017 }, { "epoch": 2.680284191829485, "grad_norm": 0.3937293274808995, "learning_rate": 2.965962733526423e-06, "loss": 0.0362, "step": 3018 }, { "epoch": 2.681172291296625, "grad_norm": 0.4847093542004826, "learning_rate": 2.9624228950846046e-06, "loss": 0.0429, "step": 3019 }, { "epoch": 2.6820603907637657, "grad_norm": 0.43458423290311116, "learning_rate": 2.9588842808658814e-06, "loss": 0.0406, "step": 3020 }, { "epoch": 2.6829484902309058, "grad_norm": 0.3815783977598914, "learning_rate": 2.9553468929963324e-06, "loss": 0.0446, "step": 3021 }, { "epoch": 2.6838365896980463, "grad_norm": 0.4052260100848497, "learning_rate": 2.951810733601304e-06, "loss": 0.0342, "step": 3022 }, { "epoch": 2.6847246891651864, "grad_norm": 0.3753104263653454, "learning_rate": 2.948275804805402e-06, "loss": 0.0352, "step": 3023 }, { "epoch": 2.685612788632327, "grad_norm": 0.35865529778040856, "learning_rate": 2.94474210873249e-06, "loss": 0.0374, "step": 3024 }, { "epoch": 2.6865008880994674, "grad_norm": 0.2891758746772565, "learning_rate": 2.941209647505695e-06, "loss": 0.0318, "step": 3025 }, { "epoch": 2.6873889875666075, "grad_norm": 0.3522820379304216, "learning_rate": 2.937678423247402e-06, "loss": 0.0302, "step": 3026 }, { "epoch": 2.6882770870337476, "grad_norm": 0.4449943279570721, "learning_rate": 2.9341484380792507e-06, "loss": 0.0441, "step": 3027 }, { "epoch": 2.689165186500888, "grad_norm": 0.3424612494984106, "learning_rate": 2.9306196941221367e-06, "loss": 0.045, "step": 3028 }, { "epoch": 2.6900532859680286, "grad_norm": 0.3676593409513878, "learning_rate": 2.9270921934962104e-06, "loss": 0.038, "step": 3029 }, { "epoch": 2.6909413854351687, "grad_norm": 0.358705160343916, "learning_rate": 2.9235659383208757e-06, "loss": 0.0283, "step": 3030 }, { "epoch": 2.691829484902309, "grad_norm": 0.4925229944074189, "learning_rate": 2.9200409307147904e-06, "loss": 0.0494, "step": 3031 }, { "epoch": 2.6927175843694493, "grad_norm": 0.4111014949629559, "learning_rate": 2.916517172795854e-06, "loss": 0.0443, "step": 3032 }, { "epoch": 2.69360568383659, "grad_norm": 0.36681988928787235, "learning_rate": 2.912994666681225e-06, "loss": 0.0326, "step": 3033 }, { "epoch": 2.69449378330373, "grad_norm": 0.34229723815761903, "learning_rate": 2.9094734144873037e-06, "loss": 0.0355, "step": 3034 }, { "epoch": 2.6953818827708704, "grad_norm": 0.36229111471339054, "learning_rate": 2.9059534183297457e-06, "loss": 0.0325, "step": 3035 }, { "epoch": 2.6962699822380105, "grad_norm": 0.35222247668977646, "learning_rate": 2.902434680323436e-06, "loss": 0.0384, "step": 3036 }, { "epoch": 2.697158081705151, "grad_norm": 0.42622225834403915, "learning_rate": 2.8989172025825175e-06, "loss": 0.0428, "step": 3037 }, { "epoch": 2.698046181172291, "grad_norm": 0.41324685295948094, "learning_rate": 2.8954009872203744e-06, "loss": 0.0312, "step": 3038 }, { "epoch": 2.6989342806394316, "grad_norm": 0.3637683571924108, "learning_rate": 2.8918860363496264e-06, "loss": 0.0436, "step": 3039 }, { "epoch": 2.699822380106572, "grad_norm": 0.3824596679258426, "learning_rate": 2.888372352082136e-06, "loss": 0.0386, "step": 3040 }, { "epoch": 2.7007104795737122, "grad_norm": 0.4166251517678143, "learning_rate": 2.8848599365290077e-06, "loss": 0.0389, "step": 3041 }, { "epoch": 2.7015985790408523, "grad_norm": 0.33828199610338894, "learning_rate": 2.881348791800579e-06, "loss": 0.0362, "step": 3042 }, { "epoch": 2.702486678507993, "grad_norm": 0.4600832400912325, "learning_rate": 2.8778389200064293e-06, "loss": 0.0435, "step": 3043 }, { "epoch": 2.7033747779751334, "grad_norm": 0.43137721874031665, "learning_rate": 2.8743303232553687e-06, "loss": 0.0323, "step": 3044 }, { "epoch": 2.7042628774422734, "grad_norm": 0.3809669708160857, "learning_rate": 2.87082300365544e-06, "loss": 0.0428, "step": 3045 }, { "epoch": 2.705150976909414, "grad_norm": 0.43923888562920393, "learning_rate": 2.867316963313925e-06, "loss": 0.0366, "step": 3046 }, { "epoch": 2.706039076376554, "grad_norm": 0.4322001321322257, "learning_rate": 2.8638122043373332e-06, "loss": 0.049, "step": 3047 }, { "epoch": 2.7069271758436946, "grad_norm": 0.4024055032038764, "learning_rate": 2.8603087288314004e-06, "loss": 0.0503, "step": 3048 }, { "epoch": 2.7078152753108347, "grad_norm": 0.4738050977664571, "learning_rate": 2.8568065389010996e-06, "loss": 0.0464, "step": 3049 }, { "epoch": 2.708703374777975, "grad_norm": 0.32485361335492613, "learning_rate": 2.853305636650623e-06, "loss": 0.0356, "step": 3050 }, { "epoch": 2.7095914742451157, "grad_norm": 0.35248987030964635, "learning_rate": 2.849806024183397e-06, "loss": 0.0376, "step": 3051 }, { "epoch": 2.710479573712256, "grad_norm": 0.4176054705775532, "learning_rate": 2.846307703602065e-06, "loss": 0.0486, "step": 3052 }, { "epoch": 2.711367673179396, "grad_norm": 0.34139511690655033, "learning_rate": 2.842810677008502e-06, "loss": 0.0324, "step": 3053 }, { "epoch": 2.7122557726465364, "grad_norm": 0.31741424501326176, "learning_rate": 2.8393149465037985e-06, "loss": 0.0321, "step": 3054 }, { "epoch": 2.713143872113677, "grad_norm": 0.41725709223720847, "learning_rate": 2.8358205141882735e-06, "loss": 0.0315, "step": 3055 }, { "epoch": 2.714031971580817, "grad_norm": 0.4440463575338334, "learning_rate": 2.83232738216146e-06, "loss": 0.0391, "step": 3056 }, { "epoch": 2.7149200710479575, "grad_norm": 0.3400860185121632, "learning_rate": 2.8288355525221095e-06, "loss": 0.0344, "step": 3057 }, { "epoch": 2.7158081705150976, "grad_norm": 0.39726409592181433, "learning_rate": 2.8253450273681983e-06, "loss": 0.0337, "step": 3058 }, { "epoch": 2.716696269982238, "grad_norm": 0.3542869938190121, "learning_rate": 2.8218558087969113e-06, "loss": 0.0385, "step": 3059 }, { "epoch": 2.717584369449378, "grad_norm": 0.42471033370910666, "learning_rate": 2.818367898904649e-06, "loss": 0.0455, "step": 3060 }, { "epoch": 2.7184724689165187, "grad_norm": 0.38325242452874897, "learning_rate": 2.8148812997870325e-06, "loss": 0.0425, "step": 3061 }, { "epoch": 2.719360568383659, "grad_norm": 0.35984571631548623, "learning_rate": 2.8113960135388863e-06, "loss": 0.0387, "step": 3062 }, { "epoch": 2.7202486678507993, "grad_norm": 0.34441000717761105, "learning_rate": 2.807912042254254e-06, "loss": 0.0383, "step": 3063 }, { "epoch": 2.7211367673179394, "grad_norm": 0.3205683697649063, "learning_rate": 2.804429388026383e-06, "loss": 0.0334, "step": 3064 }, { "epoch": 2.72202486678508, "grad_norm": 0.34943096582410743, "learning_rate": 2.8009480529477316e-06, "loss": 0.0384, "step": 3065 }, { "epoch": 2.7229129662522205, "grad_norm": 0.3463969882660535, "learning_rate": 2.797468039109966e-06, "loss": 0.0381, "step": 3066 }, { "epoch": 2.7238010657193605, "grad_norm": 0.3969513229946938, "learning_rate": 2.793989348603964e-06, "loss": 0.0357, "step": 3067 }, { "epoch": 2.7246891651865006, "grad_norm": 0.4109948891742574, "learning_rate": 2.7905119835197934e-06, "loss": 0.0362, "step": 3068 }, { "epoch": 2.725577264653641, "grad_norm": 0.37030197164004486, "learning_rate": 2.78703594594674e-06, "loss": 0.0438, "step": 3069 }, { "epoch": 2.7264653641207817, "grad_norm": 0.3286524903853804, "learning_rate": 2.783561237973289e-06, "loss": 0.0337, "step": 3070 }, { "epoch": 2.7273534635879217, "grad_norm": 0.34347142029845, "learning_rate": 2.7800878616871197e-06, "loss": 0.0317, "step": 3071 }, { "epoch": 2.7282415630550623, "grad_norm": 0.4061876988817035, "learning_rate": 2.7766158191751204e-06, "loss": 0.0406, "step": 3072 }, { "epoch": 2.7291296625222023, "grad_norm": 0.35728304118602083, "learning_rate": 2.7731451125233723e-06, "loss": 0.0372, "step": 3073 }, { "epoch": 2.730017761989343, "grad_norm": 0.49859679294884096, "learning_rate": 2.769675743817154e-06, "loss": 0.0369, "step": 3074 }, { "epoch": 2.730905861456483, "grad_norm": 0.3541473907240671, "learning_rate": 2.7662077151409433e-06, "loss": 0.0368, "step": 3075 }, { "epoch": 2.7317939609236235, "grad_norm": 0.33061854975846117, "learning_rate": 2.7627410285784164e-06, "loss": 0.0292, "step": 3076 }, { "epoch": 2.732682060390764, "grad_norm": 0.3590679714754361, "learning_rate": 2.7592756862124283e-06, "loss": 0.0365, "step": 3077 }, { "epoch": 2.733570159857904, "grad_norm": 0.3032153657146553, "learning_rate": 2.755811690125042e-06, "loss": 0.0281, "step": 3078 }, { "epoch": 2.734458259325044, "grad_norm": 0.3483171659209983, "learning_rate": 2.7523490423975068e-06, "loss": 0.0343, "step": 3079 }, { "epoch": 2.7353463587921847, "grad_norm": 0.4175432531434774, "learning_rate": 2.7488877451102595e-06, "loss": 0.0422, "step": 3080 }, { "epoch": 2.736234458259325, "grad_norm": 0.38632308931942116, "learning_rate": 2.7454278003429247e-06, "loss": 0.031, "step": 3081 }, { "epoch": 2.7371225577264653, "grad_norm": 0.4132379943308144, "learning_rate": 2.741969210174321e-06, "loss": 0.0389, "step": 3082 }, { "epoch": 2.738010657193606, "grad_norm": 0.3592539348863347, "learning_rate": 2.7385119766824442e-06, "loss": 0.0354, "step": 3083 }, { "epoch": 2.738898756660746, "grad_norm": 0.3517297203952675, "learning_rate": 2.7350561019444843e-06, "loss": 0.0299, "step": 3084 }, { "epoch": 2.7397868561278864, "grad_norm": 0.3338412132014452, "learning_rate": 2.7316015880368075e-06, "loss": 0.0364, "step": 3085 }, { "epoch": 2.7406749555950265, "grad_norm": 0.33894518891668773, "learning_rate": 2.728148437034963e-06, "loss": 0.0278, "step": 3086 }, { "epoch": 2.741563055062167, "grad_norm": 0.37497061261448794, "learning_rate": 2.7246966510136874e-06, "loss": 0.0361, "step": 3087 }, { "epoch": 2.7424511545293075, "grad_norm": 0.3731636362628687, "learning_rate": 2.721246232046891e-06, "loss": 0.0362, "step": 3088 }, { "epoch": 2.7433392539964476, "grad_norm": 0.3752080198911008, "learning_rate": 2.717797182207663e-06, "loss": 0.0401, "step": 3089 }, { "epoch": 2.7442273534635877, "grad_norm": 0.31885751006488333, "learning_rate": 2.7143495035682758e-06, "loss": 0.0313, "step": 3090 }, { "epoch": 2.7451154529307282, "grad_norm": 0.4366593177861865, "learning_rate": 2.7109031982001693e-06, "loss": 0.0401, "step": 3091 }, { "epoch": 2.7460035523978688, "grad_norm": 0.36538923873440216, "learning_rate": 2.707458268173967e-06, "loss": 0.038, "step": 3092 }, { "epoch": 2.746891651865009, "grad_norm": 0.43090700112460756, "learning_rate": 2.7040147155594597e-06, "loss": 0.0353, "step": 3093 }, { "epoch": 2.7477797513321494, "grad_norm": 0.3470842529256806, "learning_rate": 2.7005725424256113e-06, "loss": 0.0409, "step": 3094 }, { "epoch": 2.7486678507992894, "grad_norm": 0.4966612143178959, "learning_rate": 2.6971317508405605e-06, "loss": 0.0443, "step": 3095 }, { "epoch": 2.74955595026643, "grad_norm": 0.401111051805152, "learning_rate": 2.6936923428716156e-06, "loss": 0.0407, "step": 3096 }, { "epoch": 2.75044404973357, "grad_norm": 0.3436742483620814, "learning_rate": 2.6902543205852496e-06, "loss": 0.037, "step": 3097 }, { "epoch": 2.7513321492007106, "grad_norm": 0.42055982354584304, "learning_rate": 2.686817686047104e-06, "loss": 0.0432, "step": 3098 }, { "epoch": 2.7522202486678506, "grad_norm": 0.32998544501435473, "learning_rate": 2.6833824413219913e-06, "loss": 0.0344, "step": 3099 }, { "epoch": 2.753108348134991, "grad_norm": 0.33682246330946336, "learning_rate": 2.6799485884738828e-06, "loss": 0.0315, "step": 3100 }, { "epoch": 2.7539964476021312, "grad_norm": 0.3852709983882135, "learning_rate": 2.6765161295659147e-06, "loss": 0.0363, "step": 3101 }, { "epoch": 2.7548845470692718, "grad_norm": 0.41192177696509297, "learning_rate": 2.6730850666603915e-06, "loss": 0.0394, "step": 3102 }, { "epoch": 2.7557726465364123, "grad_norm": 0.35081987490990374, "learning_rate": 2.669655401818769e-06, "loss": 0.0343, "step": 3103 }, { "epoch": 2.7566607460035524, "grad_norm": 0.3161120571571943, "learning_rate": 2.6662271371016745e-06, "loss": 0.0318, "step": 3104 }, { "epoch": 2.7575488454706925, "grad_norm": 0.4278308516005874, "learning_rate": 2.662800274568885e-06, "loss": 0.0356, "step": 3105 }, { "epoch": 2.758436944937833, "grad_norm": 0.3136397724341847, "learning_rate": 2.659374816279337e-06, "loss": 0.0278, "step": 3106 }, { "epoch": 2.7593250444049735, "grad_norm": 0.4531262771201052, "learning_rate": 2.6559507642911254e-06, "loss": 0.0388, "step": 3107 }, { "epoch": 2.7602131438721136, "grad_norm": 0.3630650763261708, "learning_rate": 2.652528120661504e-06, "loss": 0.0364, "step": 3108 }, { "epoch": 2.761101243339254, "grad_norm": 0.29369108790662546, "learning_rate": 2.6491068874468673e-06, "loss": 0.0306, "step": 3109 }, { "epoch": 2.761989342806394, "grad_norm": 0.4405601872703178, "learning_rate": 2.6456870667027746e-06, "loss": 0.0378, "step": 3110 }, { "epoch": 2.7628774422735347, "grad_norm": 0.3741620119668413, "learning_rate": 2.6422686604839352e-06, "loss": 0.0353, "step": 3111 }, { "epoch": 2.763765541740675, "grad_norm": 0.3505583318842318, "learning_rate": 2.6388516708442035e-06, "loss": 0.036, "step": 3112 }, { "epoch": 2.7646536412078153, "grad_norm": 0.38289287641724995, "learning_rate": 2.635436099836584e-06, "loss": 0.0322, "step": 3113 }, { "epoch": 2.765541740674956, "grad_norm": 0.3203790920698382, "learning_rate": 2.6320219495132336e-06, "loss": 0.0308, "step": 3114 }, { "epoch": 2.766429840142096, "grad_norm": 0.4995950897896086, "learning_rate": 2.628609221925448e-06, "loss": 0.0541, "step": 3115 }, { "epoch": 2.767317939609236, "grad_norm": 0.39833917217254344, "learning_rate": 2.625197919123677e-06, "loss": 0.0357, "step": 3116 }, { "epoch": 2.7682060390763765, "grad_norm": 0.40304209922427403, "learning_rate": 2.621788043157507e-06, "loss": 0.0389, "step": 3117 }, { "epoch": 2.769094138543517, "grad_norm": 0.4903203936952132, "learning_rate": 2.618379596075668e-06, "loss": 0.0336, "step": 3118 }, { "epoch": 2.769982238010657, "grad_norm": 0.3312631123935787, "learning_rate": 2.614972579926035e-06, "loss": 0.0315, "step": 3119 }, { "epoch": 2.7708703374777977, "grad_norm": 0.6283735428100229, "learning_rate": 2.611566996755626e-06, "loss": 0.0306, "step": 3120 }, { "epoch": 2.7717584369449377, "grad_norm": 0.5439363621147506, "learning_rate": 2.608162848610586e-06, "loss": 0.0418, "step": 3121 }, { "epoch": 2.7726465364120783, "grad_norm": 0.2961699441768386, "learning_rate": 2.6047601375362076e-06, "loss": 0.0318, "step": 3122 }, { "epoch": 2.7735346358792183, "grad_norm": 0.39078320127905963, "learning_rate": 2.6013588655769204e-06, "loss": 0.0419, "step": 3123 }, { "epoch": 2.774422735346359, "grad_norm": 0.3529278737597956, "learning_rate": 2.597959034776286e-06, "loss": 0.0264, "step": 3124 }, { "epoch": 2.7753108348134994, "grad_norm": 0.3599449620069869, "learning_rate": 2.594560647176997e-06, "loss": 0.0335, "step": 3125 }, { "epoch": 2.7761989342806395, "grad_norm": 0.3800935534712962, "learning_rate": 2.5911637048208883e-06, "loss": 0.0339, "step": 3126 }, { "epoch": 2.7770870337477795, "grad_norm": 0.4045657631427908, "learning_rate": 2.5877682097489154e-06, "loss": 0.0427, "step": 3127 }, { "epoch": 2.77797513321492, "grad_norm": 0.3408945249183258, "learning_rate": 2.5843741640011736e-06, "loss": 0.0327, "step": 3128 }, { "epoch": 2.7788632326820606, "grad_norm": 0.321873706472909, "learning_rate": 2.5809815696168834e-06, "loss": 0.0346, "step": 3129 }, { "epoch": 2.7797513321492007, "grad_norm": 0.34804314898483796, "learning_rate": 2.5775904286343897e-06, "loss": 0.0401, "step": 3130 }, { "epoch": 2.780639431616341, "grad_norm": 0.3084675333042666, "learning_rate": 2.5742007430911718e-06, "loss": 0.0289, "step": 3131 }, { "epoch": 2.7815275310834813, "grad_norm": 0.39465989111686417, "learning_rate": 2.5708125150238294e-06, "loss": 0.0415, "step": 3132 }, { "epoch": 2.782415630550622, "grad_norm": 0.3386636046446292, "learning_rate": 2.567425746468086e-06, "loss": 0.034, "step": 3133 }, { "epoch": 2.783303730017762, "grad_norm": 0.6566379417707975, "learning_rate": 2.5640404394587926e-06, "loss": 0.0507, "step": 3134 }, { "epoch": 2.7841918294849024, "grad_norm": 0.394947389510546, "learning_rate": 2.5606565960299155e-06, "loss": 0.0459, "step": 3135 }, { "epoch": 2.7850799289520425, "grad_norm": 0.5038306712498116, "learning_rate": 2.5572742182145487e-06, "loss": 0.0332, "step": 3136 }, { "epoch": 2.785968028419183, "grad_norm": 0.7023618024773217, "learning_rate": 2.5538933080449034e-06, "loss": 0.0399, "step": 3137 }, { "epoch": 2.786856127886323, "grad_norm": 0.4351684729605337, "learning_rate": 2.5505138675523065e-06, "loss": 0.0433, "step": 3138 }, { "epoch": 2.7877442273534636, "grad_norm": 0.3612418971693695, "learning_rate": 2.547135898767202e-06, "loss": 0.0321, "step": 3139 }, { "epoch": 2.788632326820604, "grad_norm": 0.34691162957898175, "learning_rate": 2.5437594037191537e-06, "loss": 0.037, "step": 3140 }, { "epoch": 2.789520426287744, "grad_norm": 0.33204491770084243, "learning_rate": 2.540384384436836e-06, "loss": 0.0358, "step": 3141 }, { "epoch": 2.7904085257548843, "grad_norm": 0.3800840497245024, "learning_rate": 2.5370108429480363e-06, "loss": 0.0373, "step": 3142 }, { "epoch": 2.791296625222025, "grad_norm": 0.39018940775699407, "learning_rate": 2.533638781279659e-06, "loss": 0.0345, "step": 3143 }, { "epoch": 2.7921847246891653, "grad_norm": 0.5326300879857159, "learning_rate": 2.5302682014577128e-06, "loss": 0.0413, "step": 3144 }, { "epoch": 2.7930728241563054, "grad_norm": 0.3926874549576116, "learning_rate": 2.526899105507323e-06, "loss": 0.0345, "step": 3145 }, { "epoch": 2.793960923623446, "grad_norm": 0.5331303675935396, "learning_rate": 2.523531495452718e-06, "loss": 0.0366, "step": 3146 }, { "epoch": 2.794849023090586, "grad_norm": 0.5328948201512393, "learning_rate": 2.520165373317234e-06, "loss": 0.0472, "step": 3147 }, { "epoch": 2.7957371225577266, "grad_norm": 0.6570392083128864, "learning_rate": 2.5168007411233143e-06, "loss": 0.046, "step": 3148 }, { "epoch": 2.7966252220248666, "grad_norm": 0.5039413424872988, "learning_rate": 2.513437600892513e-06, "loss": 0.0524, "step": 3149 }, { "epoch": 2.797513321492007, "grad_norm": 0.4504170060756764, "learning_rate": 2.510075954645474e-06, "loss": 0.0406, "step": 3150 }, { "epoch": 2.7984014209591477, "grad_norm": 0.47097439585522943, "learning_rate": 2.5067158044019546e-06, "loss": 0.0493, "step": 3151 }, { "epoch": 2.7992895204262878, "grad_norm": 0.2949652998777908, "learning_rate": 2.5033571521808123e-06, "loss": 0.0345, "step": 3152 }, { "epoch": 2.800177619893428, "grad_norm": 0.44072704377014293, "learning_rate": 2.5000000000000015e-06, "loss": 0.0355, "step": 3153 }, { "epoch": 2.8010657193605684, "grad_norm": 0.4206901717156175, "learning_rate": 2.496644349876573e-06, "loss": 0.0343, "step": 3154 }, { "epoch": 2.801953818827709, "grad_norm": 0.37458478856790284, "learning_rate": 2.4932902038266826e-06, "loss": 0.0371, "step": 3155 }, { "epoch": 2.802841918294849, "grad_norm": 0.33662647621758907, "learning_rate": 2.489937563865575e-06, "loss": 0.0316, "step": 3156 }, { "epoch": 2.8037300177619895, "grad_norm": 0.5403540396925937, "learning_rate": 2.4865864320075966e-06, "loss": 0.0529, "step": 3157 }, { "epoch": 2.8046181172291296, "grad_norm": 0.3307046888382853, "learning_rate": 2.4832368102661823e-06, "loss": 0.0275, "step": 3158 }, { "epoch": 2.80550621669627, "grad_norm": 0.3849223780314751, "learning_rate": 2.4798887006538606e-06, "loss": 0.0349, "step": 3159 }, { "epoch": 2.80639431616341, "grad_norm": 0.48250628245999927, "learning_rate": 2.476542105182254e-06, "loss": 0.0428, "step": 3160 }, { "epoch": 2.8072824156305507, "grad_norm": 0.37306916992082356, "learning_rate": 2.4731970258620775e-06, "loss": 0.0378, "step": 3161 }, { "epoch": 2.808170515097691, "grad_norm": 0.41142639692118227, "learning_rate": 2.469853464703124e-06, "loss": 0.0433, "step": 3162 }, { "epoch": 2.8090586145648313, "grad_norm": 0.35651442833603836, "learning_rate": 2.4665114237142856e-06, "loss": 0.0303, "step": 3163 }, { "epoch": 2.8099467140319714, "grad_norm": 0.40861213770415533, "learning_rate": 2.4631709049035397e-06, "loss": 0.0335, "step": 3164 }, { "epoch": 2.810834813499112, "grad_norm": 0.8840489724240785, "learning_rate": 2.4598319102779438e-06, "loss": 0.0377, "step": 3165 }, { "epoch": 2.8117229129662524, "grad_norm": 0.4532714630908367, "learning_rate": 2.4564944418436416e-06, "loss": 0.044, "step": 3166 }, { "epoch": 2.8126110124333925, "grad_norm": 0.3614593805074122, "learning_rate": 2.453158501605864e-06, "loss": 0.0425, "step": 3167 }, { "epoch": 2.8134991119005326, "grad_norm": 0.31680155247348535, "learning_rate": 2.4498240915689165e-06, "loss": 0.0358, "step": 3168 }, { "epoch": 2.814387211367673, "grad_norm": 0.4318446130593219, "learning_rate": 2.4464912137361936e-06, "loss": 0.0368, "step": 3169 }, { "epoch": 2.8152753108348136, "grad_norm": 0.4250897496319355, "learning_rate": 2.443159870110162e-06, "loss": 0.0441, "step": 3170 }, { "epoch": 2.8161634103019537, "grad_norm": 0.3643571527710475, "learning_rate": 2.439830062692368e-06, "loss": 0.0388, "step": 3171 }, { "epoch": 2.8170515097690942, "grad_norm": 0.3505699957865153, "learning_rate": 2.4365017934834406e-06, "loss": 0.0313, "step": 3172 }, { "epoch": 2.8179396092362343, "grad_norm": 0.4029893161185415, "learning_rate": 2.433175064483078e-06, "loss": 0.037, "step": 3173 }, { "epoch": 2.818827708703375, "grad_norm": 0.34238881678764294, "learning_rate": 2.429849877690053e-06, "loss": 0.0307, "step": 3174 }, { "epoch": 2.819715808170515, "grad_norm": 0.3872472148779724, "learning_rate": 2.426526235102219e-06, "loss": 0.034, "step": 3175 }, { "epoch": 2.8206039076376554, "grad_norm": 0.37087670496994585, "learning_rate": 2.4232041387164924e-06, "loss": 0.0364, "step": 3176 }, { "epoch": 2.821492007104796, "grad_norm": 0.3979646727542978, "learning_rate": 2.4198835905288686e-06, "loss": 0.0366, "step": 3177 }, { "epoch": 2.822380106571936, "grad_norm": 0.3653071700008589, "learning_rate": 2.416564592534405e-06, "loss": 0.0355, "step": 3178 }, { "epoch": 2.823268206039076, "grad_norm": 0.32659833966423785, "learning_rate": 2.413247146727237e-06, "loss": 0.0349, "step": 3179 }, { "epoch": 2.8241563055062167, "grad_norm": 0.44083296008742695, "learning_rate": 2.4099312551005568e-06, "loss": 0.04, "step": 3180 }, { "epoch": 2.825044404973357, "grad_norm": 0.40793836680442663, "learning_rate": 2.4066169196466326e-06, "loss": 0.036, "step": 3181 }, { "epoch": 2.8259325044404973, "grad_norm": 0.3582256078871051, "learning_rate": 2.4033041423567914e-06, "loss": 0.0351, "step": 3182 }, { "epoch": 2.826820603907638, "grad_norm": 0.36582260805964584, "learning_rate": 2.3999929252214245e-06, "loss": 0.036, "step": 3183 }, { "epoch": 2.827708703374778, "grad_norm": 0.38339914278379317, "learning_rate": 2.39668327022999e-06, "loss": 0.0384, "step": 3184 }, { "epoch": 2.8285968028419184, "grad_norm": 0.8048974461847316, "learning_rate": 2.393375179371003e-06, "loss": 0.0558, "step": 3185 }, { "epoch": 2.8294849023090585, "grad_norm": 0.5919715371103896, "learning_rate": 2.390068654632039e-06, "loss": 0.034, "step": 3186 }, { "epoch": 2.830373001776199, "grad_norm": 0.5241335446677382, "learning_rate": 2.386763697999737e-06, "loss": 0.0574, "step": 3187 }, { "epoch": 2.8312611012433395, "grad_norm": 0.4277430815376803, "learning_rate": 2.3834603114597875e-06, "loss": 0.034, "step": 3188 }, { "epoch": 2.8321492007104796, "grad_norm": 0.3525031394477271, "learning_rate": 2.380158496996945e-06, "loss": 0.0373, "step": 3189 }, { "epoch": 2.8330373001776197, "grad_norm": 0.3858913624846048, "learning_rate": 2.376858256595012e-06, "loss": 0.0363, "step": 3190 }, { "epoch": 2.83392539964476, "grad_norm": 0.33604324286970155, "learning_rate": 2.3735595922368495e-06, "loss": 0.0389, "step": 3191 }, { "epoch": 2.8348134991119007, "grad_norm": 0.39935929326742753, "learning_rate": 2.3702625059043697e-06, "loss": 0.0401, "step": 3192 }, { "epoch": 2.835701598579041, "grad_norm": 0.4730886258127644, "learning_rate": 2.3669669995785436e-06, "loss": 0.0353, "step": 3193 }, { "epoch": 2.8365896980461813, "grad_norm": 0.42154361099056825, "learning_rate": 2.363673075239379e-06, "loss": 0.0381, "step": 3194 }, { "epoch": 2.8374777975133214, "grad_norm": 0.3934859872948006, "learning_rate": 2.3603807348659447e-06, "loss": 0.0346, "step": 3195 }, { "epoch": 2.838365896980462, "grad_norm": 0.34488289545283485, "learning_rate": 2.357089980436356e-06, "loss": 0.0324, "step": 3196 }, { "epoch": 2.839253996447602, "grad_norm": 0.48559060269746396, "learning_rate": 2.3538008139277705e-06, "loss": 0.0386, "step": 3197 }, { "epoch": 2.8401420959147425, "grad_norm": 0.4909439166088392, "learning_rate": 2.3505132373163984e-06, "loss": 0.0377, "step": 3198 }, { "epoch": 2.8410301953818826, "grad_norm": 0.40415548474459967, "learning_rate": 2.3472272525774905e-06, "loss": 0.0421, "step": 3199 }, { "epoch": 2.841918294849023, "grad_norm": 0.31081432267044307, "learning_rate": 2.343942861685338e-06, "loss": 0.0337, "step": 3200 }, { "epoch": 2.842806394316163, "grad_norm": 0.32784077186237176, "learning_rate": 2.340660066613281e-06, "loss": 0.0237, "step": 3201 }, { "epoch": 2.8436944937833037, "grad_norm": 0.5013646416173178, "learning_rate": 2.3373788693337024e-06, "loss": 0.0455, "step": 3202 }, { "epoch": 2.8445825932504443, "grad_norm": 0.45898845240520664, "learning_rate": 2.3340992718180126e-06, "loss": 0.044, "step": 3203 }, { "epoch": 2.8454706927175843, "grad_norm": 0.38690156844100004, "learning_rate": 2.3308212760366734e-06, "loss": 0.0428, "step": 3204 }, { "epoch": 2.8463587921847244, "grad_norm": 0.38094134759029563, "learning_rate": 2.3275448839591814e-06, "loss": 0.0356, "step": 3205 }, { "epoch": 2.847246891651865, "grad_norm": 0.5811129908265699, "learning_rate": 2.3242700975540665e-06, "loss": 0.0362, "step": 3206 }, { "epoch": 2.8481349911190055, "grad_norm": 0.40116093066446995, "learning_rate": 2.320996918788894e-06, "loss": 0.045, "step": 3207 }, { "epoch": 2.8490230905861456, "grad_norm": 0.3362104213982332, "learning_rate": 2.3177253496302677e-06, "loss": 0.0355, "step": 3208 }, { "epoch": 2.849911190053286, "grad_norm": 0.3805397457710735, "learning_rate": 2.3144553920438183e-06, "loss": 0.0364, "step": 3209 }, { "epoch": 2.850799289520426, "grad_norm": 0.35351437634179955, "learning_rate": 2.3111870479942156e-06, "loss": 0.0351, "step": 3210 }, { "epoch": 2.8516873889875667, "grad_norm": 0.32550272153517773, "learning_rate": 2.3079203194451527e-06, "loss": 0.0343, "step": 3211 }, { "epoch": 2.8525754884547068, "grad_norm": 0.3303439701522621, "learning_rate": 2.3046552083593536e-06, "loss": 0.035, "step": 3212 }, { "epoch": 2.8534635879218473, "grad_norm": 0.367616218813943, "learning_rate": 2.3013917166985766e-06, "loss": 0.0399, "step": 3213 }, { "epoch": 2.854351687388988, "grad_norm": 0.33662839356963586, "learning_rate": 2.298129846423599e-06, "loss": 0.0404, "step": 3214 }, { "epoch": 2.855239786856128, "grad_norm": 0.5840277130030126, "learning_rate": 2.294869599494227e-06, "loss": 0.049, "step": 3215 }, { "epoch": 2.856127886323268, "grad_norm": 0.4443593809675, "learning_rate": 2.2916109778692948e-06, "loss": 0.0466, "step": 3216 }, { "epoch": 2.8570159857904085, "grad_norm": 0.3367573564426318, "learning_rate": 2.288353983506653e-06, "loss": 0.0383, "step": 3217 }, { "epoch": 2.857904085257549, "grad_norm": 0.5190183025532943, "learning_rate": 2.285098618363183e-06, "loss": 0.0377, "step": 3218 }, { "epoch": 2.858792184724689, "grad_norm": 0.2958611938020286, "learning_rate": 2.281844884394781e-06, "loss": 0.0306, "step": 3219 }, { "epoch": 2.8596802841918296, "grad_norm": 0.3452665952641175, "learning_rate": 2.2785927835563624e-06, "loss": 0.0343, "step": 3220 }, { "epoch": 2.8605683836589697, "grad_norm": 0.37239810186921896, "learning_rate": 2.2753423178018667e-06, "loss": 0.0327, "step": 3221 }, { "epoch": 2.8614564831261102, "grad_norm": 0.3540942291981282, "learning_rate": 2.27209348908425e-06, "loss": 0.0384, "step": 3222 }, { "epoch": 2.8623445825932503, "grad_norm": 0.36505716497548985, "learning_rate": 2.268846299355481e-06, "loss": 0.0367, "step": 3223 }, { "epoch": 2.863232682060391, "grad_norm": 0.408787810640581, "learning_rate": 2.265600750566544e-06, "loss": 0.0399, "step": 3224 }, { "epoch": 2.8641207815275314, "grad_norm": 0.3638252630312671, "learning_rate": 2.262356844667443e-06, "loss": 0.0362, "step": 3225 }, { "epoch": 2.8650088809946714, "grad_norm": 0.4133926248591744, "learning_rate": 2.25911458360719e-06, "loss": 0.0366, "step": 3226 }, { "epoch": 2.8658969804618115, "grad_norm": 0.3062802727335299, "learning_rate": 2.255873969333807e-06, "loss": 0.0277, "step": 3227 }, { "epoch": 2.866785079928952, "grad_norm": 0.43524843682150793, "learning_rate": 2.252635003794334e-06, "loss": 0.0351, "step": 3228 }, { "epoch": 2.8676731793960926, "grad_norm": 0.3426621209283039, "learning_rate": 2.249397688934811e-06, "loss": 0.0389, "step": 3229 }, { "epoch": 2.8685612788632326, "grad_norm": 0.3370006175503084, "learning_rate": 2.246162026700296e-06, "loss": 0.0346, "step": 3230 }, { "epoch": 2.869449378330373, "grad_norm": 0.36902644678834845, "learning_rate": 2.2429280190348463e-06, "loss": 0.0367, "step": 3231 }, { "epoch": 2.8703374777975132, "grad_norm": 0.3372975556473261, "learning_rate": 2.2396956678815266e-06, "loss": 0.0327, "step": 3232 }, { "epoch": 2.8712255772646538, "grad_norm": 0.4067092763332751, "learning_rate": 2.2364649751824096e-06, "loss": 0.0336, "step": 3233 }, { "epoch": 2.872113676731794, "grad_norm": 0.41264789259217366, "learning_rate": 2.2332359428785733e-06, "loss": 0.0453, "step": 3234 }, { "epoch": 2.8730017761989344, "grad_norm": 0.38972966777230966, "learning_rate": 2.230008572910087e-06, "loss": 0.0382, "step": 3235 }, { "epoch": 2.8738898756660745, "grad_norm": 0.3445557467587843, "learning_rate": 2.2267828672160317e-06, "loss": 0.0348, "step": 3236 }, { "epoch": 2.874777975133215, "grad_norm": 0.34872288625593595, "learning_rate": 2.223558827734489e-06, "loss": 0.0328, "step": 3237 }, { "epoch": 2.875666074600355, "grad_norm": 0.3655919876619049, "learning_rate": 2.2203364564025315e-06, "loss": 0.0378, "step": 3238 }, { "epoch": 2.8765541740674956, "grad_norm": 0.40558629003143404, "learning_rate": 2.217115755156234e-06, "loss": 0.0422, "step": 3239 }, { "epoch": 2.877442273534636, "grad_norm": 0.364095436499667, "learning_rate": 2.2138967259306702e-06, "loss": 0.0365, "step": 3240 }, { "epoch": 2.878330373001776, "grad_norm": 0.4287254451004397, "learning_rate": 2.2106793706599045e-06, "loss": 0.046, "step": 3241 }, { "epoch": 2.8792184724689163, "grad_norm": 0.37982949088249746, "learning_rate": 2.2074636912770003e-06, "loss": 0.0408, "step": 3242 }, { "epoch": 2.880106571936057, "grad_norm": 0.33271622421670694, "learning_rate": 2.20424968971401e-06, "loss": 0.0306, "step": 3243 }, { "epoch": 2.8809946714031973, "grad_norm": 0.42343377317989356, "learning_rate": 2.2010373679019773e-06, "loss": 0.0378, "step": 3244 }, { "epoch": 2.8818827708703374, "grad_norm": 0.4123469336539692, "learning_rate": 2.1978267277709415e-06, "loss": 0.0415, "step": 3245 }, { "epoch": 2.882770870337478, "grad_norm": 0.32210500457700036, "learning_rate": 2.1946177712499322e-06, "loss": 0.0297, "step": 3246 }, { "epoch": 2.883658969804618, "grad_norm": 0.358708910741767, "learning_rate": 2.1914105002669567e-06, "loss": 0.04, "step": 3247 }, { "epoch": 2.8845470692717585, "grad_norm": 0.5864879669784532, "learning_rate": 2.188204916749021e-06, "loss": 0.0579, "step": 3248 }, { "epoch": 2.8854351687388986, "grad_norm": 0.3731123103237032, "learning_rate": 2.1850010226221147e-06, "loss": 0.0376, "step": 3249 }, { "epoch": 2.886323268206039, "grad_norm": 0.32710070054568563, "learning_rate": 2.1817988198112095e-06, "loss": 0.0303, "step": 3250 }, { "epoch": 2.8872113676731797, "grad_norm": 0.5116015572070288, "learning_rate": 2.17859831024026e-06, "loss": 0.0497, "step": 3251 }, { "epoch": 2.8880994671403197, "grad_norm": 0.536659787763374, "learning_rate": 2.17539949583221e-06, "loss": 0.0405, "step": 3252 }, { "epoch": 2.88898756660746, "grad_norm": 0.5229345175348173, "learning_rate": 2.1722023785089763e-06, "loss": 0.0361, "step": 3253 }, { "epoch": 2.8898756660746003, "grad_norm": 0.364097366496522, "learning_rate": 2.1690069601914648e-06, "loss": 0.0336, "step": 3254 }, { "epoch": 2.890763765541741, "grad_norm": 0.6525345476050085, "learning_rate": 2.1658132427995543e-06, "loss": 0.0456, "step": 3255 }, { "epoch": 2.891651865008881, "grad_norm": 0.4305618530941698, "learning_rate": 2.162621228252101e-06, "loss": 0.0403, "step": 3256 }, { "epoch": 2.8925399644760215, "grad_norm": 0.311600428359317, "learning_rate": 2.159430918466945e-06, "loss": 0.0345, "step": 3257 }, { "epoch": 2.8934280639431615, "grad_norm": 0.3132058591814133, "learning_rate": 2.156242315360893e-06, "loss": 0.0294, "step": 3258 }, { "epoch": 2.894316163410302, "grad_norm": 0.4910046166944337, "learning_rate": 2.1530554208497357e-06, "loss": 0.0394, "step": 3259 }, { "epoch": 2.895204262877442, "grad_norm": 0.3085842334505055, "learning_rate": 2.1498702368482294e-06, "loss": 0.0291, "step": 3260 }, { "epoch": 2.8960923623445827, "grad_norm": 0.36125305999687723, "learning_rate": 2.146686765270105e-06, "loss": 0.0304, "step": 3261 }, { "epoch": 2.8969804618117228, "grad_norm": 0.5352727077255195, "learning_rate": 2.143505008028067e-06, "loss": 0.0407, "step": 3262 }, { "epoch": 2.8978685612788633, "grad_norm": 0.533765241330171, "learning_rate": 2.1403249670337895e-06, "loss": 0.04, "step": 3263 }, { "epoch": 2.8987566607460034, "grad_norm": 0.3154911523836893, "learning_rate": 2.1371466441979127e-06, "loss": 0.0347, "step": 3264 }, { "epoch": 2.899644760213144, "grad_norm": 0.3210808049942224, "learning_rate": 2.133970041430044e-06, "loss": 0.0329, "step": 3265 }, { "epoch": 2.9005328596802844, "grad_norm": 0.3309739089232218, "learning_rate": 2.1307951606387626e-06, "loss": 0.0303, "step": 3266 }, { "epoch": 2.9014209591474245, "grad_norm": 0.458161854847797, "learning_rate": 2.1276220037316087e-06, "loss": 0.0402, "step": 3267 }, { "epoch": 2.9023090586145646, "grad_norm": 0.4046184614183439, "learning_rate": 2.1244505726150853e-06, "loss": 0.0352, "step": 3268 }, { "epoch": 2.903197158081705, "grad_norm": 0.3900637322521146, "learning_rate": 2.1212808691946646e-06, "loss": 0.0374, "step": 3269 }, { "epoch": 2.9040852575488456, "grad_norm": 0.3843284507800547, "learning_rate": 2.1181128953747735e-06, "loss": 0.044, "step": 3270 }, { "epoch": 2.9049733570159857, "grad_norm": 0.33483239462559267, "learning_rate": 2.1149466530588082e-06, "loss": 0.0306, "step": 3271 }, { "epoch": 2.905861456483126, "grad_norm": 0.44035760639827976, "learning_rate": 2.1117821441491166e-06, "loss": 0.0439, "step": 3272 }, { "epoch": 2.9067495559502663, "grad_norm": 0.5724245552044548, "learning_rate": 2.1086193705470064e-06, "loss": 0.0352, "step": 3273 }, { "epoch": 2.907637655417407, "grad_norm": 0.4610758244552123, "learning_rate": 2.105458334152748e-06, "loss": 0.0373, "step": 3274 }, { "epoch": 2.908525754884547, "grad_norm": 0.3746823307953264, "learning_rate": 2.102299036865566e-06, "loss": 0.0371, "step": 3275 }, { "epoch": 2.9094138543516874, "grad_norm": 0.573966111334757, "learning_rate": 2.0991414805836337e-06, "loss": 0.044, "step": 3276 }, { "epoch": 2.910301953818828, "grad_norm": 0.30787883438799263, "learning_rate": 2.095985667204085e-06, "loss": 0.0349, "step": 3277 }, { "epoch": 2.911190053285968, "grad_norm": 0.340274332990767, "learning_rate": 2.092831598623008e-06, "loss": 0.0315, "step": 3278 }, { "epoch": 2.912078152753108, "grad_norm": 0.3513766217297932, "learning_rate": 2.089679276735438e-06, "loss": 0.0301, "step": 3279 }, { "epoch": 2.9129662522202486, "grad_norm": 0.42793403329507357, "learning_rate": 2.08652870343536e-06, "loss": 0.0376, "step": 3280 }, { "epoch": 2.913854351687389, "grad_norm": 0.45718656349728554, "learning_rate": 2.0833798806157134e-06, "loss": 0.0356, "step": 3281 }, { "epoch": 2.9147424511545292, "grad_norm": 0.3311931727511675, "learning_rate": 2.0802328101683815e-06, "loss": 0.0312, "step": 3282 }, { "epoch": 2.9156305506216698, "grad_norm": 0.3506580566673122, "learning_rate": 2.0770874939841987e-06, "loss": 0.0412, "step": 3283 }, { "epoch": 2.91651865008881, "grad_norm": 0.3534652802800643, "learning_rate": 2.0739439339529423e-06, "loss": 0.0338, "step": 3284 }, { "epoch": 2.9174067495559504, "grad_norm": 0.36758672924618585, "learning_rate": 2.0708021319633326e-06, "loss": 0.0409, "step": 3285 }, { "epoch": 2.9182948490230904, "grad_norm": 0.37238723422318215, "learning_rate": 2.0676620899030393e-06, "loss": 0.0322, "step": 3286 }, { "epoch": 2.919182948490231, "grad_norm": 0.35946999846969785, "learning_rate": 2.0645238096586737e-06, "loss": 0.032, "step": 3287 }, { "epoch": 2.9200710479573715, "grad_norm": 0.5037500981505074, "learning_rate": 2.0613872931157808e-06, "loss": 0.0348, "step": 3288 }, { "epoch": 2.9209591474245116, "grad_norm": 0.41678628046767463, "learning_rate": 2.058252542158855e-06, "loss": 0.04, "step": 3289 }, { "epoch": 2.9218472468916517, "grad_norm": 0.34845412270064835, "learning_rate": 2.055119558671328e-06, "loss": 0.0367, "step": 3290 }, { "epoch": 2.922735346358792, "grad_norm": 0.33363190061050635, "learning_rate": 2.051988344535567e-06, "loss": 0.0334, "step": 3291 }, { "epoch": 2.9236234458259327, "grad_norm": 0.41667074837663, "learning_rate": 2.0488589016328754e-06, "loss": 0.0383, "step": 3292 }, { "epoch": 2.924511545293073, "grad_norm": 0.34657318515762536, "learning_rate": 2.0457312318434975e-06, "loss": 0.0359, "step": 3293 }, { "epoch": 2.9253996447602133, "grad_norm": 0.45150283166425603, "learning_rate": 2.0426053370466058e-06, "loss": 0.042, "step": 3294 }, { "epoch": 2.9262877442273534, "grad_norm": 0.4025553185092821, "learning_rate": 2.0394812191203133e-06, "loss": 0.043, "step": 3295 }, { "epoch": 2.927175843694494, "grad_norm": 0.3517187659910425, "learning_rate": 2.0363588799416594e-06, "loss": 0.0354, "step": 3296 }, { "epoch": 2.928063943161634, "grad_norm": 0.31683729402917576, "learning_rate": 2.033238321386616e-06, "loss": 0.0289, "step": 3297 }, { "epoch": 2.9289520426287745, "grad_norm": 0.3530534662804317, "learning_rate": 2.0301195453300903e-06, "loss": 0.0337, "step": 3298 }, { "epoch": 2.9298401420959146, "grad_norm": 0.33259384039497863, "learning_rate": 2.027002553645912e-06, "loss": 0.0351, "step": 3299 }, { "epoch": 2.930728241563055, "grad_norm": 0.33132222524162663, "learning_rate": 2.0238873482068396e-06, "loss": 0.0361, "step": 3300 }, { "epoch": 2.931616341030195, "grad_norm": 0.2932215710891699, "learning_rate": 2.020773930884563e-06, "loss": 0.033, "step": 3301 }, { "epoch": 2.9325044404973357, "grad_norm": 0.40884860043118654, "learning_rate": 2.017662303549693e-06, "loss": 0.0357, "step": 3302 }, { "epoch": 2.9333925399644762, "grad_norm": 0.3631064484842973, "learning_rate": 2.0145524680717684e-06, "loss": 0.0389, "step": 3303 }, { "epoch": 2.9342806394316163, "grad_norm": 0.40038919515485394, "learning_rate": 2.0114444263192466e-06, "loss": 0.0298, "step": 3304 }, { "epoch": 2.9351687388987564, "grad_norm": 0.3944735488852371, "learning_rate": 2.0083381801595153e-06, "loss": 0.039, "step": 3305 }, { "epoch": 2.936056838365897, "grad_norm": 0.3749654214627426, "learning_rate": 2.0052337314588733e-06, "loss": 0.0323, "step": 3306 }, { "epoch": 2.9369449378330375, "grad_norm": 0.36119220854664474, "learning_rate": 2.002131082082549e-06, "loss": 0.035, "step": 3307 }, { "epoch": 2.9378330373001775, "grad_norm": 0.41339812862832803, "learning_rate": 1.9990302338946838e-06, "loss": 0.0338, "step": 3308 }, { "epoch": 2.938721136767318, "grad_norm": 0.433266835637927, "learning_rate": 1.995931188758336e-06, "loss": 0.0372, "step": 3309 }, { "epoch": 2.939609236234458, "grad_norm": 0.36692614273163887, "learning_rate": 1.992833948535487e-06, "loss": 0.0374, "step": 3310 }, { "epoch": 2.9404973357015987, "grad_norm": 0.40590300122226025, "learning_rate": 1.9897385150870274e-06, "loss": 0.0408, "step": 3311 }, { "epoch": 2.9413854351687387, "grad_norm": 0.4029544927871777, "learning_rate": 1.9866448902727643e-06, "loss": 0.0398, "step": 3312 }, { "epoch": 2.9422735346358793, "grad_norm": 0.33721157266618157, "learning_rate": 1.983553075951421e-06, "loss": 0.0256, "step": 3313 }, { "epoch": 2.94316163410302, "grad_norm": 0.32779352352741326, "learning_rate": 1.9804630739806267e-06, "loss": 0.0342, "step": 3314 }, { "epoch": 2.94404973357016, "grad_norm": 0.3129523937087835, "learning_rate": 1.97737488621693e-06, "loss": 0.0322, "step": 3315 }, { "epoch": 2.9449378330373, "grad_norm": 0.494246292301225, "learning_rate": 1.974288514515783e-06, "loss": 0.044, "step": 3316 }, { "epoch": 2.9458259325044405, "grad_norm": 0.3109120361380824, "learning_rate": 1.9712039607315466e-06, "loss": 0.0326, "step": 3317 }, { "epoch": 2.946714031971581, "grad_norm": 0.47054441810154746, "learning_rate": 1.968121226717493e-06, "loss": 0.0489, "step": 3318 }, { "epoch": 2.947602131438721, "grad_norm": 0.431348841788879, "learning_rate": 1.9650403143258033e-06, "loss": 0.0445, "step": 3319 }, { "epoch": 2.9484902309058616, "grad_norm": 0.42005226880470004, "learning_rate": 1.9619612254075536e-06, "loss": 0.0424, "step": 3320 }, { "epoch": 2.9493783303730017, "grad_norm": 0.3859104877206694, "learning_rate": 1.9588839618127355e-06, "loss": 0.0406, "step": 3321 }, { "epoch": 2.950266429840142, "grad_norm": 0.3768473207600592, "learning_rate": 1.9558085253902394e-06, "loss": 0.033, "step": 3322 }, { "epoch": 2.9511545293072823, "grad_norm": 0.326175984617022, "learning_rate": 1.952734917987856e-06, "loss": 0.0324, "step": 3323 }, { "epoch": 2.952042628774423, "grad_norm": 0.40927173186547333, "learning_rate": 1.949663141452282e-06, "loss": 0.0346, "step": 3324 }, { "epoch": 2.9529307282415633, "grad_norm": 0.48013759818574125, "learning_rate": 1.9465931976291103e-06, "loss": 0.0342, "step": 3325 }, { "epoch": 2.9538188277087034, "grad_norm": 0.3345920652259473, "learning_rate": 1.943525088362831e-06, "loss": 0.0319, "step": 3326 }, { "epoch": 2.9547069271758435, "grad_norm": 0.400664709623364, "learning_rate": 1.9404588154968366e-06, "loss": 0.0357, "step": 3327 }, { "epoch": 2.955595026642984, "grad_norm": 0.3141920159671145, "learning_rate": 1.937394380873418e-06, "loss": 0.0269, "step": 3328 }, { "epoch": 2.9564831261101245, "grad_norm": 0.39910849610319554, "learning_rate": 1.9343317863337506e-06, "loss": 0.0346, "step": 3329 }, { "epoch": 2.9573712255772646, "grad_norm": 0.3900489384340357, "learning_rate": 1.931271033717916e-06, "loss": 0.0325, "step": 3330 }, { "epoch": 2.9582593250444047, "grad_norm": 0.3642120034133262, "learning_rate": 1.9282121248648845e-06, "loss": 0.0358, "step": 3331 }, { "epoch": 2.959147424511545, "grad_norm": 0.5146655274099231, "learning_rate": 1.925155061612518e-06, "loss": 0.0329, "step": 3332 }, { "epoch": 2.9600355239786857, "grad_norm": 0.33719231204509664, "learning_rate": 1.9220998457975687e-06, "loss": 0.0321, "step": 3333 }, { "epoch": 2.960923623445826, "grad_norm": 0.3771256970179299, "learning_rate": 1.9190464792556836e-06, "loss": 0.0313, "step": 3334 }, { "epoch": 2.9618117229129663, "grad_norm": 0.36201294188026895, "learning_rate": 1.915994963821392e-06, "loss": 0.04, "step": 3335 }, { "epoch": 2.9626998223801064, "grad_norm": 0.42619272931162727, "learning_rate": 1.912945301328118e-06, "loss": 0.0348, "step": 3336 }, { "epoch": 2.963587921847247, "grad_norm": 0.3649419226641657, "learning_rate": 1.9098974936081665e-06, "loss": 0.0359, "step": 3337 }, { "epoch": 2.964476021314387, "grad_norm": 0.7621193429331412, "learning_rate": 1.9068515424927297e-06, "loss": 0.0402, "step": 3338 }, { "epoch": 2.9653641207815276, "grad_norm": 0.4018314333188861, "learning_rate": 1.9038074498118875e-06, "loss": 0.0288, "step": 3339 }, { "epoch": 2.966252220248668, "grad_norm": 0.35306495582431857, "learning_rate": 1.9007652173945994e-06, "loss": 0.0324, "step": 3340 }, { "epoch": 2.967140319715808, "grad_norm": 0.3183837603978152, "learning_rate": 1.8977248470687065e-06, "loss": 0.0328, "step": 3341 }, { "epoch": 2.9680284191829482, "grad_norm": 0.38075989944376254, "learning_rate": 1.8946863406609367e-06, "loss": 0.0376, "step": 3342 }, { "epoch": 2.9689165186500888, "grad_norm": 0.3235529230598592, "learning_rate": 1.8916496999968909e-06, "loss": 0.036, "step": 3343 }, { "epoch": 2.9698046181172293, "grad_norm": 0.39967918964708427, "learning_rate": 1.8886149269010556e-06, "loss": 0.0323, "step": 3344 }, { "epoch": 2.9706927175843694, "grad_norm": 0.39920867111551245, "learning_rate": 1.8855820231967892e-06, "loss": 0.0333, "step": 3345 }, { "epoch": 2.97158081705151, "grad_norm": 0.3668464971121366, "learning_rate": 1.8825509907063328e-06, "loss": 0.0338, "step": 3346 }, { "epoch": 2.97246891651865, "grad_norm": 0.36580422811700214, "learning_rate": 1.8795218312507974e-06, "loss": 0.0374, "step": 3347 }, { "epoch": 2.9733570159857905, "grad_norm": 0.2954628970559159, "learning_rate": 1.876494546650175e-06, "loss": 0.0401, "step": 3348 }, { "epoch": 2.9742451154529306, "grad_norm": 0.3273296166549773, "learning_rate": 1.873469138723325e-06, "loss": 0.0354, "step": 3349 }, { "epoch": 2.975133214920071, "grad_norm": 0.4107714869843828, "learning_rate": 1.8704456092879813e-06, "loss": 0.0451, "step": 3350 }, { "epoch": 2.9760213143872116, "grad_norm": 0.3882844503989167, "learning_rate": 1.867423960160753e-06, "loss": 0.0306, "step": 3351 }, { "epoch": 2.9769094138543517, "grad_norm": 0.31073438444954216, "learning_rate": 1.8644041931571138e-06, "loss": 0.0332, "step": 3352 }, { "epoch": 2.977797513321492, "grad_norm": 0.37944811166255143, "learning_rate": 1.8613863100914082e-06, "loss": 0.0429, "step": 3353 }, { "epoch": 2.9786856127886323, "grad_norm": 0.5957741696978557, "learning_rate": 1.8583703127768526e-06, "loss": 0.0327, "step": 3354 }, { "epoch": 2.979573712255773, "grad_norm": 0.37335510195578836, "learning_rate": 1.8553562030255246e-06, "loss": 0.038, "step": 3355 }, { "epoch": 2.980461811722913, "grad_norm": 0.28581665267498463, "learning_rate": 1.852343982648373e-06, "loss": 0.0352, "step": 3356 }, { "epoch": 2.9813499111900534, "grad_norm": 0.34624317000878835, "learning_rate": 1.8493336534552076e-06, "loss": 0.0384, "step": 3357 }, { "epoch": 2.9822380106571935, "grad_norm": 0.3182975122449055, "learning_rate": 1.8463252172547019e-06, "loss": 0.0315, "step": 3358 }, { "epoch": 2.983126110124334, "grad_norm": 0.6737271139227827, "learning_rate": 1.8433186758543948e-06, "loss": 0.048, "step": 3359 }, { "epoch": 2.984014209591474, "grad_norm": 0.36804671209761464, "learning_rate": 1.8403140310606888e-06, "loss": 0.037, "step": 3360 }, { "epoch": 2.9849023090586146, "grad_norm": 0.36278004185633717, "learning_rate": 1.8373112846788366e-06, "loss": 0.0364, "step": 3361 }, { "epoch": 2.9857904085257547, "grad_norm": 0.34080469269804775, "learning_rate": 1.8343104385129612e-06, "loss": 0.0378, "step": 3362 }, { "epoch": 2.9866785079928952, "grad_norm": 0.2905327881702447, "learning_rate": 1.8313114943660403e-06, "loss": 0.0338, "step": 3363 }, { "epoch": 2.9875666074600353, "grad_norm": 0.5281539360329269, "learning_rate": 1.8283144540399078e-06, "loss": 0.0458, "step": 3364 }, { "epoch": 2.988454706927176, "grad_norm": 0.29317350057512703, "learning_rate": 1.8253193193352525e-06, "loss": 0.0358, "step": 3365 }, { "epoch": 2.9893428063943164, "grad_norm": 0.3757051365198419, "learning_rate": 1.8223260920516227e-06, "loss": 0.0456, "step": 3366 }, { "epoch": 2.9902309058614565, "grad_norm": 0.35032849686712814, "learning_rate": 1.8193347739874157e-06, "loss": 0.0342, "step": 3367 }, { "epoch": 2.9911190053285965, "grad_norm": 0.43634666739329175, "learning_rate": 1.8163453669398867e-06, "loss": 0.0327, "step": 3368 }, { "epoch": 2.992007104795737, "grad_norm": 0.35950605478419084, "learning_rate": 1.8133578727051388e-06, "loss": 0.0411, "step": 3369 }, { "epoch": 2.9928952042628776, "grad_norm": 0.3609794901562061, "learning_rate": 1.8103722930781249e-06, "loss": 0.0345, "step": 3370 }, { "epoch": 2.9937833037300177, "grad_norm": 0.35013950214744116, "learning_rate": 1.8073886298526522e-06, "loss": 0.0317, "step": 3371 }, { "epoch": 2.994671403197158, "grad_norm": 0.6651815939906269, "learning_rate": 1.8044068848213763e-06, "loss": 0.0493, "step": 3372 }, { "epoch": 2.9955595026642983, "grad_norm": 0.5234108356175042, "learning_rate": 1.8014270597757926e-06, "loss": 0.0441, "step": 3373 }, { "epoch": 2.996447602131439, "grad_norm": 0.4496691461596499, "learning_rate": 1.7984491565062513e-06, "loss": 0.0401, "step": 3374 }, { "epoch": 2.997335701598579, "grad_norm": 0.37057822260084683, "learning_rate": 1.7954731768019461e-06, "loss": 0.039, "step": 3375 }, { "epoch": 2.9982238010657194, "grad_norm": 0.2938240171494723, "learning_rate": 1.792499122450913e-06, "loss": 0.0282, "step": 3376 }, { "epoch": 2.99911190053286, "grad_norm": 0.3108398912701676, "learning_rate": 1.7895269952400303e-06, "loss": 0.0297, "step": 3377 }, { "epoch": 3.0, "grad_norm": 0.533545404175948, "learning_rate": 1.7865567969550235e-06, "loss": 0.0363, "step": 3378 }, { "epoch": 3.0, "eval_loss": 0.041787039488554, "eval_runtime": 78.1532, "eval_samples_per_second": 194.055, "eval_steps_per_second": 3.033, "step": 3378 }, { "epoch": 3.0008880994671405, "grad_norm": 0.32020596950233093, "learning_rate": 1.7835885293804522e-06, "loss": 0.0302, "step": 3379 }, { "epoch": 3.0017761989342806, "grad_norm": 0.3088566077675897, "learning_rate": 1.7806221942997237e-06, "loss": 0.0273, "step": 3380 }, { "epoch": 3.002664298401421, "grad_norm": 0.3730031254504042, "learning_rate": 1.7776577934950783e-06, "loss": 0.0324, "step": 3381 }, { "epoch": 3.003552397868561, "grad_norm": 0.5123057571119825, "learning_rate": 1.7746953287475944e-06, "loss": 0.0434, "step": 3382 }, { "epoch": 3.0044404973357017, "grad_norm": 0.36113545525063273, "learning_rate": 1.7717348018371915e-06, "loss": 0.0268, "step": 3383 }, { "epoch": 3.005328596802842, "grad_norm": 0.37460870996734363, "learning_rate": 1.7687762145426197e-06, "loss": 0.032, "step": 3384 }, { "epoch": 3.0062166962699823, "grad_norm": 0.3876240880290442, "learning_rate": 1.7658195686414692e-06, "loss": 0.0297, "step": 3385 }, { "epoch": 3.0071047957371224, "grad_norm": 0.3267199322576471, "learning_rate": 1.7628648659101587e-06, "loss": 0.0251, "step": 3386 }, { "epoch": 3.007992895204263, "grad_norm": 0.32689525299773015, "learning_rate": 1.7599121081239406e-06, "loss": 0.0286, "step": 3387 }, { "epoch": 3.008880994671403, "grad_norm": 0.4240848754876425, "learning_rate": 1.7569612970569e-06, "loss": 0.0334, "step": 3388 }, { "epoch": 3.0097690941385435, "grad_norm": 0.36396117051931987, "learning_rate": 1.7540124344819548e-06, "loss": 0.0347, "step": 3389 }, { "epoch": 3.0106571936056836, "grad_norm": 0.35951825126530074, "learning_rate": 1.7510655221708467e-06, "loss": 0.0289, "step": 3390 }, { "epoch": 3.011545293072824, "grad_norm": 0.6340363117228983, "learning_rate": 1.7481205618941472e-06, "loss": 0.0355, "step": 3391 }, { "epoch": 3.0124333925399647, "grad_norm": 0.35804834224058507, "learning_rate": 1.7451775554212592e-06, "loss": 0.0347, "step": 3392 }, { "epoch": 3.0133214920071048, "grad_norm": 0.38625825431415106, "learning_rate": 1.742236504520407e-06, "loss": 0.0307, "step": 3393 }, { "epoch": 3.0142095914742453, "grad_norm": 0.3860254175150108, "learning_rate": 1.7392974109586396e-06, "loss": 0.0262, "step": 3394 }, { "epoch": 3.0150976909413854, "grad_norm": 0.30432603098065447, "learning_rate": 1.736360276501835e-06, "loss": 0.0278, "step": 3395 }, { "epoch": 3.015985790408526, "grad_norm": 0.38427685779628756, "learning_rate": 1.7334251029146882e-06, "loss": 0.042, "step": 3396 }, { "epoch": 3.016873889875666, "grad_norm": 0.453636926549333, "learning_rate": 1.7304918919607216e-06, "loss": 0.0319, "step": 3397 }, { "epoch": 3.0177619893428065, "grad_norm": 0.34657786188265227, "learning_rate": 1.7275606454022748e-06, "loss": 0.037, "step": 3398 }, { "epoch": 3.0186500888099466, "grad_norm": 0.35445950107686525, "learning_rate": 1.7246313650005058e-06, "loss": 0.0324, "step": 3399 }, { "epoch": 3.019538188277087, "grad_norm": 0.3822732277179491, "learning_rate": 1.721704052515395e-06, "loss": 0.0292, "step": 3400 }, { "epoch": 3.020426287744227, "grad_norm": 0.33562751195986795, "learning_rate": 1.7187787097057429e-06, "loss": 0.0316, "step": 3401 }, { "epoch": 3.0213143872113677, "grad_norm": 0.3115479805509103, "learning_rate": 1.7158553383291555e-06, "loss": 0.0317, "step": 3402 }, { "epoch": 3.022202486678508, "grad_norm": 0.3635008070986166, "learning_rate": 1.7129339401420648e-06, "loss": 0.0335, "step": 3403 }, { "epoch": 3.0230905861456483, "grad_norm": 0.36851852787930095, "learning_rate": 1.7100145168997157e-06, "loss": 0.0339, "step": 3404 }, { "epoch": 3.023978685612789, "grad_norm": 0.35430945415155546, "learning_rate": 1.7070970703561624e-06, "loss": 0.0313, "step": 3405 }, { "epoch": 3.024866785079929, "grad_norm": 0.3615631365194653, "learning_rate": 1.7041816022642731e-06, "loss": 0.0356, "step": 3406 }, { "epoch": 3.0257548845470694, "grad_norm": 0.3749744762401782, "learning_rate": 1.701268114375731e-06, "loss": 0.036, "step": 3407 }, { "epoch": 3.0266429840142095, "grad_norm": 0.32980877647347795, "learning_rate": 1.6983566084410224e-06, "loss": 0.0369, "step": 3408 }, { "epoch": 3.02753108348135, "grad_norm": 0.34045371592944046, "learning_rate": 1.6954470862094514e-06, "loss": 0.0314, "step": 3409 }, { "epoch": 3.02841918294849, "grad_norm": 0.44703484677785676, "learning_rate": 1.692539549429123e-06, "loss": 0.0386, "step": 3410 }, { "epoch": 3.0293072824156306, "grad_norm": 0.3865599181591045, "learning_rate": 1.6896339998469509e-06, "loss": 0.0356, "step": 3411 }, { "epoch": 3.0301953818827707, "grad_norm": 0.32527492227949584, "learning_rate": 1.6867304392086575e-06, "loss": 0.034, "step": 3412 }, { "epoch": 3.0310834813499112, "grad_norm": 0.416752950219048, "learning_rate": 1.6838288692587723e-06, "loss": 0.0368, "step": 3413 }, { "epoch": 3.0319715808170513, "grad_norm": 0.3452232934606978, "learning_rate": 1.6809292917406178e-06, "loss": 0.035, "step": 3414 }, { "epoch": 3.032859680284192, "grad_norm": 0.3489591850168152, "learning_rate": 1.678031708396331e-06, "loss": 0.0272, "step": 3415 }, { "epoch": 3.0337477797513324, "grad_norm": 0.40648659249767033, "learning_rate": 1.6751361209668477e-06, "loss": 0.0413, "step": 3416 }, { "epoch": 3.0346358792184724, "grad_norm": 0.39593533001990056, "learning_rate": 1.6722425311919015e-06, "loss": 0.0321, "step": 3417 }, { "epoch": 3.035523978685613, "grad_norm": 0.363604393933297, "learning_rate": 1.669350940810026e-06, "loss": 0.0369, "step": 3418 }, { "epoch": 3.036412078152753, "grad_norm": 0.3925221458198163, "learning_rate": 1.6664613515585582e-06, "loss": 0.0309, "step": 3419 }, { "epoch": 3.0373001776198936, "grad_norm": 0.36561598577530785, "learning_rate": 1.6635737651736266e-06, "loss": 0.0279, "step": 3420 }, { "epoch": 3.0381882770870337, "grad_norm": 0.43074556289531746, "learning_rate": 1.6606881833901627e-06, "loss": 0.038, "step": 3421 }, { "epoch": 3.039076376554174, "grad_norm": 0.3268413196577665, "learning_rate": 1.6578046079418885e-06, "loss": 0.0309, "step": 3422 }, { "epoch": 3.0399644760213143, "grad_norm": 0.38874874112487934, "learning_rate": 1.6549230405613209e-06, "loss": 0.0406, "step": 3423 }, { "epoch": 3.040852575488455, "grad_norm": 0.31768216944866806, "learning_rate": 1.6520434829797744e-06, "loss": 0.0343, "step": 3424 }, { "epoch": 3.041740674955595, "grad_norm": 0.3260546316515089, "learning_rate": 1.6491659369273521e-06, "loss": 0.0311, "step": 3425 }, { "epoch": 3.0426287744227354, "grad_norm": 0.3319666895264348, "learning_rate": 1.6462904041329485e-06, "loss": 0.0283, "step": 3426 }, { "epoch": 3.0435168738898755, "grad_norm": 0.3430621413156639, "learning_rate": 1.6434168863242523e-06, "loss": 0.0335, "step": 3427 }, { "epoch": 3.044404973357016, "grad_norm": 0.3868401764722692, "learning_rate": 1.6405453852277358e-06, "loss": 0.0346, "step": 3428 }, { "epoch": 3.0452930728241565, "grad_norm": 0.3061954731198063, "learning_rate": 1.6376759025686662e-06, "loss": 0.0267, "step": 3429 }, { "epoch": 3.0461811722912966, "grad_norm": 0.40069176783599647, "learning_rate": 1.6348084400710912e-06, "loss": 0.0316, "step": 3430 }, { "epoch": 3.047069271758437, "grad_norm": 0.36681793584149164, "learning_rate": 1.6319429994578518e-06, "loss": 0.026, "step": 3431 }, { "epoch": 3.047957371225577, "grad_norm": 0.39810557057369755, "learning_rate": 1.629079582450566e-06, "loss": 0.0291, "step": 3432 }, { "epoch": 3.0488454706927177, "grad_norm": 0.3300323779556455, "learning_rate": 1.6262181907696456e-06, "loss": 0.0347, "step": 3433 }, { "epoch": 3.049733570159858, "grad_norm": 0.37920489565568494, "learning_rate": 1.6233588261342769e-06, "loss": 0.0273, "step": 3434 }, { "epoch": 3.0506216696269983, "grad_norm": 0.3149149943652191, "learning_rate": 1.620501490262431e-06, "loss": 0.026, "step": 3435 }, { "epoch": 3.0515097690941384, "grad_norm": 0.4071584210435631, "learning_rate": 1.6176461848708647e-06, "loss": 0.0481, "step": 3436 }, { "epoch": 3.052397868561279, "grad_norm": 0.5098304758250216, "learning_rate": 1.6147929116751081e-06, "loss": 0.0358, "step": 3437 }, { "epoch": 3.053285968028419, "grad_norm": 0.3488799001580875, "learning_rate": 1.611941672389473e-06, "loss": 0.03, "step": 3438 }, { "epoch": 3.0541740674955595, "grad_norm": 0.35323693058259226, "learning_rate": 1.6090924687270515e-06, "loss": 0.0298, "step": 3439 }, { "epoch": 3.0550621669626996, "grad_norm": 0.347601327837638, "learning_rate": 1.6062453023997083e-06, "loss": 0.0316, "step": 3440 }, { "epoch": 3.05595026642984, "grad_norm": 0.4492026480839529, "learning_rate": 1.603400175118089e-06, "loss": 0.0424, "step": 3441 }, { "epoch": 3.0568383658969807, "grad_norm": 0.3047539539857411, "learning_rate": 1.6005570885916095e-06, "loss": 0.0333, "step": 3442 }, { "epoch": 3.0577264653641207, "grad_norm": 0.46885412379655417, "learning_rate": 1.59771604452846e-06, "loss": 0.0366, "step": 3443 }, { "epoch": 3.0586145648312613, "grad_norm": 0.44731834331586895, "learning_rate": 1.5948770446356065e-06, "loss": 0.0357, "step": 3444 }, { "epoch": 3.0595026642984013, "grad_norm": 0.3054708287316336, "learning_rate": 1.5920400906187878e-06, "loss": 0.0244, "step": 3445 }, { "epoch": 3.060390763765542, "grad_norm": 0.44178108992771253, "learning_rate": 1.5892051841825084e-06, "loss": 0.0407, "step": 3446 }, { "epoch": 3.061278863232682, "grad_norm": 0.4012261005684536, "learning_rate": 1.5863723270300441e-06, "loss": 0.0394, "step": 3447 }, { "epoch": 3.0621669626998225, "grad_norm": 0.3769259339107391, "learning_rate": 1.583541520863443e-06, "loss": 0.0318, "step": 3448 }, { "epoch": 3.0630550621669625, "grad_norm": 0.37362646028931695, "learning_rate": 1.580712767383516e-06, "loss": 0.0319, "step": 3449 }, { "epoch": 3.063943161634103, "grad_norm": 0.30727083447610515, "learning_rate": 1.5778860682898456e-06, "loss": 0.0277, "step": 3450 }, { "epoch": 3.064831261101243, "grad_norm": 0.29004399809092374, "learning_rate": 1.5750614252807767e-06, "loss": 0.0285, "step": 3451 }, { "epoch": 3.0657193605683837, "grad_norm": 0.3830849061399326, "learning_rate": 1.5722388400534177e-06, "loss": 0.035, "step": 3452 }, { "epoch": 3.0666074600355238, "grad_norm": 0.39043040622784064, "learning_rate": 1.569418314303644e-06, "loss": 0.0325, "step": 3453 }, { "epoch": 3.0674955595026643, "grad_norm": 0.3514013874663258, "learning_rate": 1.5665998497260959e-06, "loss": 0.034, "step": 3454 }, { "epoch": 3.068383658969805, "grad_norm": 0.3894652781072553, "learning_rate": 1.5637834480141646e-06, "loss": 0.04, "step": 3455 }, { "epoch": 3.069271758436945, "grad_norm": 0.5953228076292884, "learning_rate": 1.5609691108600128e-06, "loss": 0.0404, "step": 3456 }, { "epoch": 3.0701598579040854, "grad_norm": 0.3809853596944762, "learning_rate": 1.5581568399545593e-06, "loss": 0.0311, "step": 3457 }, { "epoch": 3.0710479573712255, "grad_norm": 0.3513474082125203, "learning_rate": 1.555346636987481e-06, "loss": 0.0323, "step": 3458 }, { "epoch": 3.071936056838366, "grad_norm": 0.4450240937216318, "learning_rate": 1.552538503647209e-06, "loss": 0.0313, "step": 3459 }, { "epoch": 3.072824156305506, "grad_norm": 0.3584246496100684, "learning_rate": 1.5497324416209391e-06, "loss": 0.033, "step": 3460 }, { "epoch": 3.0737122557726466, "grad_norm": 0.43791254251751843, "learning_rate": 1.5469284525946138e-06, "loss": 0.0375, "step": 3461 }, { "epoch": 3.0746003552397867, "grad_norm": 0.34501952519306195, "learning_rate": 1.5441265382529368e-06, "loss": 0.029, "step": 3462 }, { "epoch": 3.075488454706927, "grad_norm": 0.33819998442252974, "learning_rate": 1.5413267002793619e-06, "loss": 0.0311, "step": 3463 }, { "epoch": 3.0763765541740673, "grad_norm": 0.364941288060177, "learning_rate": 1.5385289403560938e-06, "loss": 0.0308, "step": 3464 }, { "epoch": 3.077264653641208, "grad_norm": 0.37049804392147895, "learning_rate": 1.5357332601640945e-06, "loss": 0.0279, "step": 3465 }, { "epoch": 3.0781527531083483, "grad_norm": 0.35224684811311013, "learning_rate": 1.532939661383071e-06, "loss": 0.0274, "step": 3466 }, { "epoch": 3.0790408525754884, "grad_norm": 0.33960419330735814, "learning_rate": 1.5301481456914802e-06, "loss": 0.0322, "step": 3467 }, { "epoch": 3.079928952042629, "grad_norm": 0.3514171333499501, "learning_rate": 1.5273587147665315e-06, "loss": 0.0313, "step": 3468 }, { "epoch": 3.080817051509769, "grad_norm": 0.38806742277357753, "learning_rate": 1.5245713702841758e-06, "loss": 0.0345, "step": 3469 }, { "epoch": 3.0817051509769096, "grad_norm": 0.3434592806521134, "learning_rate": 1.5217861139191175e-06, "loss": 0.029, "step": 3470 }, { "epoch": 3.0825932504440496, "grad_norm": 0.3581274216121454, "learning_rate": 1.5190029473447987e-06, "loss": 0.0322, "step": 3471 }, { "epoch": 3.08348134991119, "grad_norm": 0.324637577500593, "learning_rate": 1.5162218722334138e-06, "loss": 0.0273, "step": 3472 }, { "epoch": 3.0843694493783302, "grad_norm": 0.47742319524449367, "learning_rate": 1.5134428902558924e-06, "loss": 0.0348, "step": 3473 }, { "epoch": 3.0852575488454708, "grad_norm": 0.3498761689207883, "learning_rate": 1.5106660030819149e-06, "loss": 0.0271, "step": 3474 }, { "epoch": 3.086145648312611, "grad_norm": 0.3624470718730505, "learning_rate": 1.507891212379896e-06, "loss": 0.0268, "step": 3475 }, { "epoch": 3.0870337477797514, "grad_norm": 0.4346417954949858, "learning_rate": 1.5051185198169933e-06, "loss": 0.0377, "step": 3476 }, { "epoch": 3.0879218472468914, "grad_norm": 0.33645009045652774, "learning_rate": 1.5023479270591063e-06, "loss": 0.0275, "step": 3477 }, { "epoch": 3.088809946714032, "grad_norm": 0.40173607645775294, "learning_rate": 1.4995794357708693e-06, "loss": 0.0409, "step": 3478 }, { "epoch": 3.0896980461811725, "grad_norm": 0.40010418491350547, "learning_rate": 1.4968130476156545e-06, "loss": 0.0325, "step": 3479 }, { "epoch": 3.0905861456483126, "grad_norm": 0.35042283762753534, "learning_rate": 1.494048764255574e-06, "loss": 0.0314, "step": 3480 }, { "epoch": 3.091474245115453, "grad_norm": 0.4682559608994265, "learning_rate": 1.4912865873514697e-06, "loss": 0.0366, "step": 3481 }, { "epoch": 3.092362344582593, "grad_norm": 0.3374770521113918, "learning_rate": 1.4885265185629244e-06, "loss": 0.0271, "step": 3482 }, { "epoch": 3.0932504440497337, "grad_norm": 0.37484643713753174, "learning_rate": 1.4857685595482484e-06, "loss": 0.0323, "step": 3483 }, { "epoch": 3.094138543516874, "grad_norm": 0.34481804532192073, "learning_rate": 1.483012711964486e-06, "loss": 0.0287, "step": 3484 }, { "epoch": 3.0950266429840143, "grad_norm": 0.4734565209167152, "learning_rate": 1.4802589774674148e-06, "loss": 0.0352, "step": 3485 }, { "epoch": 3.0959147424511544, "grad_norm": 0.37127966682290336, "learning_rate": 1.477507357711545e-06, "loss": 0.0332, "step": 3486 }, { "epoch": 3.096802841918295, "grad_norm": 0.3543911235539375, "learning_rate": 1.4747578543501063e-06, "loss": 0.0313, "step": 3487 }, { "epoch": 3.097690941385435, "grad_norm": 0.38039943600968423, "learning_rate": 1.4720104690350672e-06, "loss": 0.0349, "step": 3488 }, { "epoch": 3.0985790408525755, "grad_norm": 0.3245828792851648, "learning_rate": 1.4692652034171207e-06, "loss": 0.0287, "step": 3489 }, { "epoch": 3.0994671403197156, "grad_norm": 0.42861540701989975, "learning_rate": 1.4665220591456847e-06, "loss": 0.0331, "step": 3490 }, { "epoch": 3.100355239786856, "grad_norm": 0.32183958469817114, "learning_rate": 1.4637810378689004e-06, "loss": 0.0273, "step": 3491 }, { "epoch": 3.1012433392539966, "grad_norm": 0.45972692428559464, "learning_rate": 1.4610421412336413e-06, "loss": 0.0302, "step": 3492 }, { "epoch": 3.1021314387211367, "grad_norm": 0.3759598952389504, "learning_rate": 1.4583053708854945e-06, "loss": 0.0329, "step": 3493 }, { "epoch": 3.1030195381882772, "grad_norm": 0.36353279121847576, "learning_rate": 1.455570728468778e-06, "loss": 0.0335, "step": 3494 }, { "epoch": 3.1039076376554173, "grad_norm": 0.3290160590761642, "learning_rate": 1.4528382156265269e-06, "loss": 0.0299, "step": 3495 }, { "epoch": 3.104795737122558, "grad_norm": 0.3595669023727179, "learning_rate": 1.4501078340004954e-06, "loss": 0.0325, "step": 3496 }, { "epoch": 3.105683836589698, "grad_norm": 0.35434918469827165, "learning_rate": 1.4473795852311606e-06, "loss": 0.0316, "step": 3497 }, { "epoch": 3.1065719360568385, "grad_norm": 0.4031990945442707, "learning_rate": 1.4446534709577214e-06, "loss": 0.0344, "step": 3498 }, { "epoch": 3.1074600355239785, "grad_norm": 0.5073170920433319, "learning_rate": 1.4419294928180828e-06, "loss": 0.0337, "step": 3499 }, { "epoch": 3.108348134991119, "grad_norm": 0.33208651903203795, "learning_rate": 1.4392076524488764e-06, "loss": 0.0262, "step": 3500 }, { "epoch": 3.109236234458259, "grad_norm": 0.3717968372001129, "learning_rate": 1.4364879514854485e-06, "loss": 0.0344, "step": 3501 }, { "epoch": 3.1101243339253997, "grad_norm": 0.3808171070996276, "learning_rate": 1.4337703915618556e-06, "loss": 0.0317, "step": 3502 }, { "epoch": 3.11101243339254, "grad_norm": 0.3570466028704032, "learning_rate": 1.4310549743108688e-06, "loss": 0.0268, "step": 3503 }, { "epoch": 3.1119005328596803, "grad_norm": 0.3570749694866279, "learning_rate": 1.4283417013639767e-06, "loss": 0.0324, "step": 3504 }, { "epoch": 3.112788632326821, "grad_norm": 0.362176667644257, "learning_rate": 1.425630574351372e-06, "loss": 0.038, "step": 3505 }, { "epoch": 3.113676731793961, "grad_norm": 0.3251101417555494, "learning_rate": 1.422921594901966e-06, "loss": 0.0362, "step": 3506 }, { "epoch": 3.1145648312611014, "grad_norm": 0.3388937962327327, "learning_rate": 1.4202147646433734e-06, "loss": 0.0267, "step": 3507 }, { "epoch": 3.1154529307282415, "grad_norm": 0.36826712170779247, "learning_rate": 1.4175100852019185e-06, "loss": 0.0398, "step": 3508 }, { "epoch": 3.116341030195382, "grad_norm": 0.3517639196196876, "learning_rate": 1.4148075582026387e-06, "loss": 0.028, "step": 3509 }, { "epoch": 3.117229129662522, "grad_norm": 0.36309715329027414, "learning_rate": 1.4121071852692708e-06, "loss": 0.0254, "step": 3510 }, { "epoch": 3.1181172291296626, "grad_norm": 0.34317416863813455, "learning_rate": 1.4094089680242634e-06, "loss": 0.0277, "step": 3511 }, { "epoch": 3.1190053285968027, "grad_norm": 0.39704784916876024, "learning_rate": 1.4067129080887642e-06, "loss": 0.034, "step": 3512 }, { "epoch": 3.119893428063943, "grad_norm": 0.3447554710079465, "learning_rate": 1.4040190070826321e-06, "loss": 0.0311, "step": 3513 }, { "epoch": 3.1207815275310833, "grad_norm": 0.46479627750960195, "learning_rate": 1.4013272666244209e-06, "loss": 0.0345, "step": 3514 }, { "epoch": 3.121669626998224, "grad_norm": 0.3254104020571842, "learning_rate": 1.398637688331393e-06, "loss": 0.0295, "step": 3515 }, { "epoch": 3.122557726465364, "grad_norm": 0.40157747673258, "learning_rate": 1.395950273819507e-06, "loss": 0.0329, "step": 3516 }, { "epoch": 3.1234458259325044, "grad_norm": 0.3852837069169171, "learning_rate": 1.393265024703422e-06, "loss": 0.0308, "step": 3517 }, { "epoch": 3.124333925399645, "grad_norm": 0.3276630375695141, "learning_rate": 1.3905819425965005e-06, "loss": 0.0309, "step": 3518 }, { "epoch": 3.125222024866785, "grad_norm": 0.5304847611809405, "learning_rate": 1.3879010291107986e-06, "loss": 0.0532, "step": 3519 }, { "epoch": 3.1261101243339255, "grad_norm": 0.41905821351311295, "learning_rate": 1.3852222858570685e-06, "loss": 0.0309, "step": 3520 }, { "epoch": 3.1269982238010656, "grad_norm": 0.39103923686669484, "learning_rate": 1.382545714444764e-06, "loss": 0.029, "step": 3521 }, { "epoch": 3.127886323268206, "grad_norm": 0.3350588699618402, "learning_rate": 1.379871316482027e-06, "loss": 0.0271, "step": 3522 }, { "epoch": 3.1287744227353462, "grad_norm": 0.32239838791621245, "learning_rate": 1.3771990935757013e-06, "loss": 0.0279, "step": 3523 }, { "epoch": 3.1296625222024868, "grad_norm": 0.4123712401960722, "learning_rate": 1.3745290473313177e-06, "loss": 0.0314, "step": 3524 }, { "epoch": 3.130550621669627, "grad_norm": 0.38723740273270757, "learning_rate": 1.3718611793530995e-06, "loss": 0.0324, "step": 3525 }, { "epoch": 3.1314387211367674, "grad_norm": 0.32223296368530385, "learning_rate": 1.3691954912439648e-06, "loss": 0.0261, "step": 3526 }, { "epoch": 3.1323268206039074, "grad_norm": 0.39004639051812723, "learning_rate": 1.366531984605523e-06, "loss": 0.0274, "step": 3527 }, { "epoch": 3.133214920071048, "grad_norm": 0.3821103643848609, "learning_rate": 1.3638706610380648e-06, "loss": 0.039, "step": 3528 }, { "epoch": 3.1341030195381885, "grad_norm": 0.3200448555791571, "learning_rate": 1.3612115221405764e-06, "loss": 0.036, "step": 3529 }, { "epoch": 3.1349911190053286, "grad_norm": 0.4274217582113707, "learning_rate": 1.3585545695107317e-06, "loss": 0.0348, "step": 3530 }, { "epoch": 3.135879218472469, "grad_norm": 0.3815320210874098, "learning_rate": 1.3558998047448869e-06, "loss": 0.0405, "step": 3531 }, { "epoch": 3.136767317939609, "grad_norm": 0.3967245046942835, "learning_rate": 1.3532472294380843e-06, "loss": 0.0329, "step": 3532 }, { "epoch": 3.1376554174067497, "grad_norm": 0.3739682684199583, "learning_rate": 1.350596845184055e-06, "loss": 0.0254, "step": 3533 }, { "epoch": 3.1385435168738898, "grad_norm": 0.3162116602600176, "learning_rate": 1.3479486535752078e-06, "loss": 0.0307, "step": 3534 }, { "epoch": 3.1394316163410303, "grad_norm": 0.4208812448914787, "learning_rate": 1.34530265620264e-06, "loss": 0.0376, "step": 3535 }, { "epoch": 3.1403197158081704, "grad_norm": 0.28686341216107925, "learning_rate": 1.3426588546561264e-06, "loss": 0.0275, "step": 3536 }, { "epoch": 3.141207815275311, "grad_norm": 0.464660142600853, "learning_rate": 1.340017250524121e-06, "loss": 0.0362, "step": 3537 }, { "epoch": 3.142095914742451, "grad_norm": 0.5120325348928773, "learning_rate": 1.337377845393763e-06, "loss": 0.041, "step": 3538 }, { "epoch": 3.1429840142095915, "grad_norm": 0.3199015168528209, "learning_rate": 1.3347406408508695e-06, "loss": 0.0254, "step": 3539 }, { "epoch": 3.143872113676732, "grad_norm": 0.4338499140507683, "learning_rate": 1.3321056384799285e-06, "loss": 0.0331, "step": 3540 }, { "epoch": 3.144760213143872, "grad_norm": 0.41380574169602147, "learning_rate": 1.3294728398641127e-06, "loss": 0.0321, "step": 3541 }, { "epoch": 3.1456483126110126, "grad_norm": 0.369763445539379, "learning_rate": 1.3268422465852687e-06, "loss": 0.0269, "step": 3542 }, { "epoch": 3.1465364120781527, "grad_norm": 0.4148583659600249, "learning_rate": 1.3242138602239168e-06, "loss": 0.0353, "step": 3543 }, { "epoch": 3.1474245115452932, "grad_norm": 0.33109060873080703, "learning_rate": 1.3215876823592493e-06, "loss": 0.0304, "step": 3544 }, { "epoch": 3.1483126110124333, "grad_norm": 0.382132967449249, "learning_rate": 1.318963714569138e-06, "loss": 0.0327, "step": 3545 }, { "epoch": 3.149200710479574, "grad_norm": 0.3383325218121581, "learning_rate": 1.31634195843012e-06, "loss": 0.0299, "step": 3546 }, { "epoch": 3.150088809946714, "grad_norm": 0.40393250614316256, "learning_rate": 1.313722415517409e-06, "loss": 0.029, "step": 3547 }, { "epoch": 3.1509769094138544, "grad_norm": 0.4546541345002346, "learning_rate": 1.3111050874048853e-06, "loss": 0.042, "step": 3548 }, { "epoch": 3.1518650088809945, "grad_norm": 0.4499379743205146, "learning_rate": 1.3084899756650981e-06, "loss": 0.0344, "step": 3549 }, { "epoch": 3.152753108348135, "grad_norm": 0.35340031710547076, "learning_rate": 1.3058770818692701e-06, "loss": 0.0341, "step": 3550 }, { "epoch": 3.153641207815275, "grad_norm": 0.34575268590107633, "learning_rate": 1.303266407587286e-06, "loss": 0.0333, "step": 3551 }, { "epoch": 3.1545293072824157, "grad_norm": 0.3360853754282288, "learning_rate": 1.3006579543876963e-06, "loss": 0.0264, "step": 3552 }, { "epoch": 3.1554174067495557, "grad_norm": 0.34728357967406603, "learning_rate": 1.2980517238377243e-06, "loss": 0.0333, "step": 3553 }, { "epoch": 3.1563055062166963, "grad_norm": 0.4519787099256083, "learning_rate": 1.2954477175032493e-06, "loss": 0.0367, "step": 3554 }, { "epoch": 3.157193605683837, "grad_norm": 0.41641816261413234, "learning_rate": 1.292845936948821e-06, "loss": 0.0307, "step": 3555 }, { "epoch": 3.158081705150977, "grad_norm": 0.33086456984655066, "learning_rate": 1.290246383737646e-06, "loss": 0.0307, "step": 3556 }, { "epoch": 3.1589698046181174, "grad_norm": 0.45776048239984535, "learning_rate": 1.2876490594315988e-06, "loss": 0.0309, "step": 3557 }, { "epoch": 3.1598579040852575, "grad_norm": 0.3276100401593595, "learning_rate": 1.285053965591208e-06, "loss": 0.0218, "step": 3558 }, { "epoch": 3.160746003552398, "grad_norm": 0.3921587024554653, "learning_rate": 1.2824611037756686e-06, "loss": 0.0322, "step": 3559 }, { "epoch": 3.161634103019538, "grad_norm": 0.33801097072638053, "learning_rate": 1.2798704755428303e-06, "loss": 0.0282, "step": 3560 }, { "epoch": 3.1625222024866786, "grad_norm": 0.3276014010818129, "learning_rate": 1.2772820824492005e-06, "loss": 0.0338, "step": 3561 }, { "epoch": 3.1634103019538187, "grad_norm": 0.33107384453826255, "learning_rate": 1.2746959260499481e-06, "loss": 0.0285, "step": 3562 }, { "epoch": 3.164298401420959, "grad_norm": 0.5525909627135244, "learning_rate": 1.2721120078988935e-06, "loss": 0.0362, "step": 3563 }, { "epoch": 3.1651865008880993, "grad_norm": 0.3125784896110762, "learning_rate": 1.2695303295485128e-06, "loss": 0.0266, "step": 3564 }, { "epoch": 3.16607460035524, "grad_norm": 0.5033226516615978, "learning_rate": 1.2669508925499402e-06, "loss": 0.0294, "step": 3565 }, { "epoch": 3.1669626998223803, "grad_norm": 0.49455207444213073, "learning_rate": 1.2643736984529587e-06, "loss": 0.0314, "step": 3566 }, { "epoch": 3.1678507992895204, "grad_norm": 0.4549611309469651, "learning_rate": 1.2617987488060074e-06, "loss": 0.0351, "step": 3567 }, { "epoch": 3.168738898756661, "grad_norm": 0.5322615508545987, "learning_rate": 1.259226045156174e-06, "loss": 0.0294, "step": 3568 }, { "epoch": 3.169626998223801, "grad_norm": 0.4492201294052334, "learning_rate": 1.256655589049197e-06, "loss": 0.0299, "step": 3569 }, { "epoch": 3.1705150976909415, "grad_norm": 0.39680288347095183, "learning_rate": 1.2540873820294663e-06, "loss": 0.0322, "step": 3570 }, { "epoch": 3.1714031971580816, "grad_norm": 0.3126403079807289, "learning_rate": 1.2515214256400216e-06, "loss": 0.0261, "step": 3571 }, { "epoch": 3.172291296625222, "grad_norm": 0.2995851897727122, "learning_rate": 1.2489577214225467e-06, "loss": 0.025, "step": 3572 }, { "epoch": 3.173179396092362, "grad_norm": 0.4722294181818512, "learning_rate": 1.2463962709173727e-06, "loss": 0.0386, "step": 3573 }, { "epoch": 3.1740674955595027, "grad_norm": 0.4563629162787923, "learning_rate": 1.2438370756634798e-06, "loss": 0.0317, "step": 3574 }, { "epoch": 3.174955595026643, "grad_norm": 0.3541427562621811, "learning_rate": 1.2412801371984896e-06, "loss": 0.0298, "step": 3575 }, { "epoch": 3.1758436944937833, "grad_norm": 0.3596867300973276, "learning_rate": 1.2387254570586714e-06, "loss": 0.0335, "step": 3576 }, { "epoch": 3.1767317939609234, "grad_norm": 0.46727923275068173, "learning_rate": 1.2361730367789348e-06, "loss": 0.0391, "step": 3577 }, { "epoch": 3.177619893428064, "grad_norm": 0.41535192165244617, "learning_rate": 1.2336228778928312e-06, "loss": 0.0339, "step": 3578 }, { "epoch": 3.1785079928952045, "grad_norm": 0.3385146511794204, "learning_rate": 1.231074981932555e-06, "loss": 0.0298, "step": 3579 }, { "epoch": 3.1793960923623446, "grad_norm": 0.37876673668081506, "learning_rate": 1.2285293504289448e-06, "loss": 0.0379, "step": 3580 }, { "epoch": 3.180284191829485, "grad_norm": 0.2931565929018687, "learning_rate": 1.225985984911468e-06, "loss": 0.0258, "step": 3581 }, { "epoch": 3.181172291296625, "grad_norm": 0.37354509891361926, "learning_rate": 1.2234448869082393e-06, "loss": 0.0324, "step": 3582 }, { "epoch": 3.1820603907637657, "grad_norm": 0.37746159995857514, "learning_rate": 1.2209060579460114e-06, "loss": 0.0269, "step": 3583 }, { "epoch": 3.1829484902309058, "grad_norm": 0.4082476549440637, "learning_rate": 1.2183694995501687e-06, "loss": 0.0309, "step": 3584 }, { "epoch": 3.1838365896980463, "grad_norm": 0.3999879375086408, "learning_rate": 1.2158352132447326e-06, "loss": 0.0331, "step": 3585 }, { "epoch": 3.1847246891651864, "grad_norm": 0.44253878404960434, "learning_rate": 1.2133032005523632e-06, "loss": 0.0362, "step": 3586 }, { "epoch": 3.185612788632327, "grad_norm": 0.42602035316738923, "learning_rate": 1.2107734629943485e-06, "loss": 0.0348, "step": 3587 }, { "epoch": 3.186500888099467, "grad_norm": 0.3403485636547022, "learning_rate": 1.2082460020906168e-06, "loss": 0.0318, "step": 3588 }, { "epoch": 3.1873889875666075, "grad_norm": 0.3629239463382843, "learning_rate": 1.2057208193597226e-06, "loss": 0.0248, "step": 3589 }, { "epoch": 3.1882770870337476, "grad_norm": 0.3635748301532728, "learning_rate": 1.2031979163188523e-06, "loss": 0.0295, "step": 3590 }, { "epoch": 3.189165186500888, "grad_norm": 0.5736161809384074, "learning_rate": 1.200677294483827e-06, "loss": 0.0352, "step": 3591 }, { "epoch": 3.1900532859680286, "grad_norm": 0.41314686201750805, "learning_rate": 1.198158955369092e-06, "loss": 0.0294, "step": 3592 }, { "epoch": 3.1909413854351687, "grad_norm": 0.35802399099814725, "learning_rate": 1.1956429004877229e-06, "loss": 0.0297, "step": 3593 }, { "epoch": 3.191829484902309, "grad_norm": 0.45446775683013857, "learning_rate": 1.1931291313514255e-06, "loss": 0.0343, "step": 3594 }, { "epoch": 3.1927175843694493, "grad_norm": 0.3815567088530151, "learning_rate": 1.1906176494705269e-06, "loss": 0.0313, "step": 3595 }, { "epoch": 3.19360568383659, "grad_norm": 0.3338661128814227, "learning_rate": 1.1881084563539864e-06, "loss": 0.0294, "step": 3596 }, { "epoch": 3.19449378330373, "grad_norm": 0.39651084280681126, "learning_rate": 1.1856015535093818e-06, "loss": 0.0395, "step": 3597 }, { "epoch": 3.1953818827708704, "grad_norm": 0.30600919417760303, "learning_rate": 1.1830969424429206e-06, "loss": 0.0278, "step": 3598 }, { "epoch": 3.1962699822380105, "grad_norm": 0.3985018150523524, "learning_rate": 1.1805946246594274e-06, "loss": 0.0301, "step": 3599 }, { "epoch": 3.197158081705151, "grad_norm": 0.3330658386535985, "learning_rate": 1.1780946016623563e-06, "loss": 0.0288, "step": 3600 }, { "epoch": 3.198046181172291, "grad_norm": 0.38800269698605144, "learning_rate": 1.1755968749537755e-06, "loss": 0.0371, "step": 3601 }, { "epoch": 3.1989342806394316, "grad_norm": 0.3772262775640011, "learning_rate": 1.1731014460343764e-06, "loss": 0.0328, "step": 3602 }, { "epoch": 3.199822380106572, "grad_norm": 0.35801569723417365, "learning_rate": 1.170608316403472e-06, "loss": 0.0355, "step": 3603 }, { "epoch": 3.2007104795737122, "grad_norm": 0.540743442831892, "learning_rate": 1.1681174875589912e-06, "loss": 0.0306, "step": 3604 }, { "epoch": 3.2015985790408528, "grad_norm": 0.38639688907139, "learning_rate": 1.1656289609974786e-06, "loss": 0.0353, "step": 3605 }, { "epoch": 3.202486678507993, "grad_norm": 0.31038169012942846, "learning_rate": 1.1631427382141013e-06, "loss": 0.0268, "step": 3606 }, { "epoch": 3.2033747779751334, "grad_norm": 0.5787522461060308, "learning_rate": 1.160658820702637e-06, "loss": 0.0338, "step": 3607 }, { "epoch": 3.2042628774422734, "grad_norm": 0.4370600645764752, "learning_rate": 1.1581772099554828e-06, "loss": 0.0272, "step": 3608 }, { "epoch": 3.205150976909414, "grad_norm": 0.34153399006094054, "learning_rate": 1.1556979074636448e-06, "loss": 0.0277, "step": 3609 }, { "epoch": 3.206039076376554, "grad_norm": 0.39807487230475347, "learning_rate": 1.153220914716745e-06, "loss": 0.0257, "step": 3610 }, { "epoch": 3.2069271758436946, "grad_norm": 0.36300900520000173, "learning_rate": 1.1507462332030183e-06, "loss": 0.0277, "step": 3611 }, { "epoch": 3.2078152753108347, "grad_norm": 0.7205542852667735, "learning_rate": 1.1482738644093134e-06, "loss": 0.0434, "step": 3612 }, { "epoch": 3.208703374777975, "grad_norm": 0.34081759783713905, "learning_rate": 1.1458038098210795e-06, "loss": 0.0246, "step": 3613 }, { "epoch": 3.2095914742451153, "grad_norm": 0.41700014772652777, "learning_rate": 1.1433360709223855e-06, "loss": 0.0324, "step": 3614 }, { "epoch": 3.210479573712256, "grad_norm": 0.35544687024847266, "learning_rate": 1.1408706491959076e-06, "loss": 0.0342, "step": 3615 }, { "epoch": 3.211367673179396, "grad_norm": 0.45607285714623136, "learning_rate": 1.1384075461229255e-06, "loss": 0.0372, "step": 3616 }, { "epoch": 3.2122557726465364, "grad_norm": 0.3212607067426791, "learning_rate": 1.135946763183327e-06, "loss": 0.0244, "step": 3617 }, { "epoch": 3.213143872113677, "grad_norm": 0.33666676900365416, "learning_rate": 1.1334883018556103e-06, "loss": 0.0331, "step": 3618 }, { "epoch": 3.214031971580817, "grad_norm": 0.3985437893793705, "learning_rate": 1.1310321636168714e-06, "loss": 0.0332, "step": 3619 }, { "epoch": 3.2149200710479575, "grad_norm": 0.3132500002866778, "learning_rate": 1.1285783499428182e-06, "loss": 0.0314, "step": 3620 }, { "epoch": 3.2158081705150976, "grad_norm": 0.44934975259489535, "learning_rate": 1.126126862307757e-06, "loss": 0.0407, "step": 3621 }, { "epoch": 3.216696269982238, "grad_norm": 0.38076278712215256, "learning_rate": 1.1236777021845957e-06, "loss": 0.0315, "step": 3622 }, { "epoch": 3.217584369449378, "grad_norm": 0.3971270966244105, "learning_rate": 1.1212308710448477e-06, "loss": 0.0318, "step": 3623 }, { "epoch": 3.2184724689165187, "grad_norm": 0.32239116405325563, "learning_rate": 1.1187863703586283e-06, "loss": 0.0272, "step": 3624 }, { "epoch": 3.219360568383659, "grad_norm": 0.37343600464931165, "learning_rate": 1.1163442015946442e-06, "loss": 0.0308, "step": 3625 }, { "epoch": 3.2202486678507993, "grad_norm": 0.36166634491368127, "learning_rate": 1.1139043662202082e-06, "loss": 0.0293, "step": 3626 }, { "epoch": 3.2211367673179394, "grad_norm": 0.3654215711524558, "learning_rate": 1.1114668657012324e-06, "loss": 0.0314, "step": 3627 }, { "epoch": 3.22202486678508, "grad_norm": 0.3603595596939604, "learning_rate": 1.10903170150222e-06, "loss": 0.0263, "step": 3628 }, { "epoch": 3.2229129662522205, "grad_norm": 0.3347442955939939, "learning_rate": 1.1065988750862732e-06, "loss": 0.0259, "step": 3629 }, { "epoch": 3.2238010657193605, "grad_norm": 0.36054957342562477, "learning_rate": 1.1041683879150928e-06, "loss": 0.0268, "step": 3630 }, { "epoch": 3.224689165186501, "grad_norm": 0.4058356455460025, "learning_rate": 1.1017402414489674e-06, "loss": 0.0307, "step": 3631 }, { "epoch": 3.225577264653641, "grad_norm": 0.45335634106980394, "learning_rate": 1.0993144371467874e-06, "loss": 0.0326, "step": 3632 }, { "epoch": 3.2264653641207817, "grad_norm": 0.32254800073807566, "learning_rate": 1.096890976466029e-06, "loss": 0.0335, "step": 3633 }, { "epoch": 3.2273534635879217, "grad_norm": 0.43680237472285177, "learning_rate": 1.094469860862763e-06, "loss": 0.0328, "step": 3634 }, { "epoch": 3.2282415630550623, "grad_norm": 0.34419604781032265, "learning_rate": 1.0920510917916527e-06, "loss": 0.0344, "step": 3635 }, { "epoch": 3.2291296625222023, "grad_norm": 0.31588049720715605, "learning_rate": 1.089634670705948e-06, "loss": 0.0317, "step": 3636 }, { "epoch": 3.230017761989343, "grad_norm": 0.3584527839153107, "learning_rate": 1.0872205990574924e-06, "loss": 0.0273, "step": 3637 }, { "epoch": 3.230905861456483, "grad_norm": 0.35768532221840543, "learning_rate": 1.0848088782967137e-06, "loss": 0.0251, "step": 3638 }, { "epoch": 3.2317939609236235, "grad_norm": 0.37716262645451376, "learning_rate": 1.082399509872631e-06, "loss": 0.0339, "step": 3639 }, { "epoch": 3.232682060390764, "grad_norm": 0.3900464102557058, "learning_rate": 1.0799924952328456e-06, "loss": 0.0358, "step": 3640 }, { "epoch": 3.233570159857904, "grad_norm": 0.30070035854918137, "learning_rate": 1.0775878358235503e-06, "loss": 0.0275, "step": 3641 }, { "epoch": 3.2344582593250446, "grad_norm": 0.3530274441342446, "learning_rate": 1.0751855330895179e-06, "loss": 0.0308, "step": 3642 }, { "epoch": 3.2353463587921847, "grad_norm": 0.3323833981612693, "learning_rate": 1.0727855884741057e-06, "loss": 0.0291, "step": 3643 }, { "epoch": 3.236234458259325, "grad_norm": 0.3005373035476294, "learning_rate": 1.0703880034192582e-06, "loss": 0.0253, "step": 3644 }, { "epoch": 3.2371225577264653, "grad_norm": 0.3682642479597278, "learning_rate": 1.0679927793654983e-06, "loss": 0.0304, "step": 3645 }, { "epoch": 3.238010657193606, "grad_norm": 0.3422044006741604, "learning_rate": 1.0655999177519304e-06, "loss": 0.0335, "step": 3646 }, { "epoch": 3.238898756660746, "grad_norm": 0.3129341268530191, "learning_rate": 1.063209420016243e-06, "loss": 0.0326, "step": 3647 }, { "epoch": 3.2397868561278864, "grad_norm": 0.39978841909230495, "learning_rate": 1.0608212875946994e-06, "loss": 0.0298, "step": 3648 }, { "epoch": 3.2406749555950265, "grad_norm": 0.45583744131392623, "learning_rate": 1.0584355219221475e-06, "loss": 0.0333, "step": 3649 }, { "epoch": 3.241563055062167, "grad_norm": 0.34424345973016157, "learning_rate": 1.0560521244320083e-06, "loss": 0.0236, "step": 3650 }, { "epoch": 3.242451154529307, "grad_norm": 0.36684106852920867, "learning_rate": 1.0536710965562801e-06, "loss": 0.0414, "step": 3651 }, { "epoch": 3.2433392539964476, "grad_norm": 0.4431679278597177, "learning_rate": 1.0512924397255409e-06, "loss": 0.0332, "step": 3652 }, { "epoch": 3.2442273534635877, "grad_norm": 0.3450258871559446, "learning_rate": 1.0489161553689454e-06, "loss": 0.031, "step": 3653 }, { "epoch": 3.2451154529307282, "grad_norm": 0.28470546275432324, "learning_rate": 1.0465422449142137e-06, "loss": 0.0281, "step": 3654 }, { "epoch": 3.2460035523978688, "grad_norm": 0.4127530639860459, "learning_rate": 1.0441707097876486e-06, "loss": 0.0353, "step": 3655 }, { "epoch": 3.246891651865009, "grad_norm": 0.3568083807868309, "learning_rate": 1.0418015514141239e-06, "loss": 0.035, "step": 3656 }, { "epoch": 3.2477797513321494, "grad_norm": 0.36552784917385517, "learning_rate": 1.0394347712170837e-06, "loss": 0.0294, "step": 3657 }, { "epoch": 3.2486678507992894, "grad_norm": 0.3670258157620859, "learning_rate": 1.037070370618542e-06, "loss": 0.0317, "step": 3658 }, { "epoch": 3.24955595026643, "grad_norm": 0.40799529442652116, "learning_rate": 1.0347083510390882e-06, "loss": 0.0326, "step": 3659 }, { "epoch": 3.25044404973357, "grad_norm": 0.3450119468412042, "learning_rate": 1.0323487138978738e-06, "loss": 0.0266, "step": 3660 }, { "epoch": 3.2513321492007106, "grad_norm": 0.4010754328666231, "learning_rate": 1.0299914606126276e-06, "loss": 0.0381, "step": 3661 }, { "epoch": 3.2522202486678506, "grad_norm": 0.33156606244503334, "learning_rate": 1.0276365925996384e-06, "loss": 0.0309, "step": 3662 }, { "epoch": 3.253108348134991, "grad_norm": 0.3950135284211048, "learning_rate": 1.0252841112737649e-06, "loss": 0.029, "step": 3663 }, { "epoch": 3.2539964476021312, "grad_norm": 0.36065627802528166, "learning_rate": 1.022934018048432e-06, "loss": 0.0316, "step": 3664 }, { "epoch": 3.2548845470692718, "grad_norm": 0.423051441368762, "learning_rate": 1.0205863143356338e-06, "loss": 0.0325, "step": 3665 }, { "epoch": 3.2557726465364123, "grad_norm": 0.3434621044795289, "learning_rate": 1.0182410015459183e-06, "loss": 0.026, "step": 3666 }, { "epoch": 3.2566607460035524, "grad_norm": 0.3955697345905485, "learning_rate": 1.0158980810884057e-06, "loss": 0.0353, "step": 3667 }, { "epoch": 3.257548845470693, "grad_norm": 0.400489083511983, "learning_rate": 1.0135575543707782e-06, "loss": 0.0331, "step": 3668 }, { "epoch": 3.258436944937833, "grad_norm": 0.5068290095323994, "learning_rate": 1.0112194227992767e-06, "loss": 0.0363, "step": 3669 }, { "epoch": 3.2593250444049735, "grad_norm": 0.38441100164085706, "learning_rate": 1.0088836877787024e-06, "loss": 0.0363, "step": 3670 }, { "epoch": 3.2602131438721136, "grad_norm": 0.3085506929583767, "learning_rate": 1.006550350712422e-06, "loss": 0.0265, "step": 3671 }, { "epoch": 3.261101243339254, "grad_norm": 0.37596699763301317, "learning_rate": 1.0042194130023546e-06, "loss": 0.038, "step": 3672 }, { "epoch": 3.261989342806394, "grad_norm": 0.46539946467784293, "learning_rate": 1.0018908760489844e-06, "loss": 0.034, "step": 3673 }, { "epoch": 3.2628774422735347, "grad_norm": 0.36104104474291415, "learning_rate": 9.995647412513475e-07, "loss": 0.0362, "step": 3674 }, { "epoch": 3.263765541740675, "grad_norm": 0.3990660872797956, "learning_rate": 9.97241010007039e-07, "loss": 0.0331, "step": 3675 }, { "epoch": 3.2646536412078153, "grad_norm": 0.42276274619842286, "learning_rate": 9.949196837122121e-07, "loss": 0.0388, "step": 3676 }, { "epoch": 3.265541740674956, "grad_norm": 0.35859010345599995, "learning_rate": 9.926007637615715e-07, "loss": 0.0325, "step": 3677 }, { "epoch": 3.266429840142096, "grad_norm": 0.3222964953326768, "learning_rate": 9.902842515483763e-07, "loss": 0.0256, "step": 3678 }, { "epoch": 3.267317939609236, "grad_norm": 0.3574957365393956, "learning_rate": 9.879701484644411e-07, "loss": 0.0345, "step": 3679 }, { "epoch": 3.2682060390763765, "grad_norm": 0.35860403384823814, "learning_rate": 9.856584559001342e-07, "loss": 0.0369, "step": 3680 }, { "epoch": 3.269094138543517, "grad_norm": 0.37048339629496846, "learning_rate": 9.833491752443713e-07, "loss": 0.0284, "step": 3681 }, { "epoch": 3.269982238010657, "grad_norm": 0.34021955727332254, "learning_rate": 9.81042307884621e-07, "loss": 0.031, "step": 3682 }, { "epoch": 3.2708703374777977, "grad_norm": 0.4122784589503335, "learning_rate": 9.787378552069044e-07, "loss": 0.0325, "step": 3683 }, { "epoch": 3.2717584369449377, "grad_norm": 0.4373114567342978, "learning_rate": 9.764358185957867e-07, "loss": 0.0272, "step": 3684 }, { "epoch": 3.2726465364120783, "grad_norm": 0.31816932071306603, "learning_rate": 9.741361994343867e-07, "loss": 0.0295, "step": 3685 }, { "epoch": 3.2735346358792183, "grad_norm": 0.6459369984798691, "learning_rate": 9.71838999104368e-07, "loss": 0.0283, "step": 3686 }, { "epoch": 3.274422735346359, "grad_norm": 0.3766304584171301, "learning_rate": 9.695442189859395e-07, "loss": 0.0401, "step": 3687 }, { "epoch": 3.275310834813499, "grad_norm": 0.33935473355987256, "learning_rate": 9.672518604578595e-07, "loss": 0.031, "step": 3688 }, { "epoch": 3.2761989342806395, "grad_norm": 0.40498961036649267, "learning_rate": 9.649619248974302e-07, "loss": 0.0383, "step": 3689 }, { "epoch": 3.2770870337477795, "grad_norm": 0.3818979587980272, "learning_rate": 9.626744136804944e-07, "loss": 0.0318, "step": 3690 }, { "epoch": 3.27797513321492, "grad_norm": 0.3179592431463596, "learning_rate": 9.603893281814446e-07, "loss": 0.0258, "step": 3691 }, { "epoch": 3.2788632326820606, "grad_norm": 0.412061975287786, "learning_rate": 9.581066697732106e-07, "loss": 0.0295, "step": 3692 }, { "epoch": 3.2797513321492007, "grad_norm": 0.35376706053661183, "learning_rate": 9.558264398272675e-07, "loss": 0.0244, "step": 3693 }, { "epoch": 3.280639431616341, "grad_norm": 0.3007743907618776, "learning_rate": 9.53548639713629e-07, "loss": 0.0244, "step": 3694 }, { "epoch": 3.2815275310834813, "grad_norm": 0.4158163193069238, "learning_rate": 9.512732708008487e-07, "loss": 0.0325, "step": 3695 }, { "epoch": 3.282415630550622, "grad_norm": 0.35365647232122, "learning_rate": 9.490003344560211e-07, "loss": 0.0344, "step": 3696 }, { "epoch": 3.283303730017762, "grad_norm": 0.3791989904442274, "learning_rate": 9.467298320447804e-07, "loss": 0.0353, "step": 3697 }, { "epoch": 3.2841918294849024, "grad_norm": 0.3458884379167756, "learning_rate": 9.444617649312954e-07, "loss": 0.028, "step": 3698 }, { "epoch": 3.2850799289520425, "grad_norm": 0.38332292715688804, "learning_rate": 9.421961344782709e-07, "loss": 0.0278, "step": 3699 }, { "epoch": 3.285968028419183, "grad_norm": 0.420476771200935, "learning_rate": 9.399329420469527e-07, "loss": 0.0361, "step": 3700 }, { "epoch": 3.286856127886323, "grad_norm": 0.6265528642038826, "learning_rate": 9.376721889971158e-07, "loss": 0.0434, "step": 3701 }, { "epoch": 3.2877442273534636, "grad_norm": 0.3224220289092714, "learning_rate": 9.354138766870758e-07, "loss": 0.0296, "step": 3702 }, { "epoch": 3.288632326820604, "grad_norm": 0.3949143641031206, "learning_rate": 9.331580064736768e-07, "loss": 0.0296, "step": 3703 }, { "epoch": 3.289520426287744, "grad_norm": 0.3544687226222743, "learning_rate": 9.309045797122962e-07, "loss": 0.029, "step": 3704 }, { "epoch": 3.2904085257548847, "grad_norm": 0.3556846381666713, "learning_rate": 9.286535977568456e-07, "loss": 0.0322, "step": 3705 }, { "epoch": 3.291296625222025, "grad_norm": 0.36853334223341294, "learning_rate": 9.264050619597697e-07, "loss": 0.0305, "step": 3706 }, { "epoch": 3.2921847246891653, "grad_norm": 0.4737052791958284, "learning_rate": 9.241589736720347e-07, "loss": 0.0336, "step": 3707 }, { "epoch": 3.2930728241563054, "grad_norm": 0.3466761014452016, "learning_rate": 9.219153342431453e-07, "loss": 0.0333, "step": 3708 }, { "epoch": 3.293960923623446, "grad_norm": 0.3956546324980311, "learning_rate": 9.196741450211322e-07, "loss": 0.0334, "step": 3709 }, { "epoch": 3.294849023090586, "grad_norm": 0.30393453654068525, "learning_rate": 9.174354073525521e-07, "loss": 0.0267, "step": 3710 }, { "epoch": 3.2957371225577266, "grad_norm": 0.32512515289364163, "learning_rate": 9.151991225824891e-07, "loss": 0.0275, "step": 3711 }, { "epoch": 3.2966252220248666, "grad_norm": 0.3899953032700139, "learning_rate": 9.129652920545562e-07, "loss": 0.0364, "step": 3712 }, { "epoch": 3.297513321492007, "grad_norm": 0.3984208555582086, "learning_rate": 9.107339171108887e-07, "loss": 0.0348, "step": 3713 }, { "epoch": 3.2984014209591472, "grad_norm": 0.32484139207266705, "learning_rate": 9.085049990921497e-07, "loss": 0.0296, "step": 3714 }, { "epoch": 3.2992895204262878, "grad_norm": 0.4137772419589482, "learning_rate": 9.062785393375228e-07, "loss": 0.0402, "step": 3715 }, { "epoch": 3.300177619893428, "grad_norm": 0.38249895639311854, "learning_rate": 9.040545391847155e-07, "loss": 0.0344, "step": 3716 }, { "epoch": 3.3010657193605684, "grad_norm": 0.3664008503037077, "learning_rate": 9.018329999699599e-07, "loss": 0.0345, "step": 3717 }, { "epoch": 3.301953818827709, "grad_norm": 0.42487867897661014, "learning_rate": 8.996139230280076e-07, "loss": 0.0292, "step": 3718 }, { "epoch": 3.302841918294849, "grad_norm": 0.37442585855265675, "learning_rate": 8.973973096921285e-07, "loss": 0.0302, "step": 3719 }, { "epoch": 3.3037300177619895, "grad_norm": 0.40539585791226573, "learning_rate": 8.95183161294118e-07, "loss": 0.0344, "step": 3720 }, { "epoch": 3.3046181172291296, "grad_norm": 0.3981479478515684, "learning_rate": 8.929714791642846e-07, "loss": 0.0296, "step": 3721 }, { "epoch": 3.30550621669627, "grad_norm": 0.4407351031638227, "learning_rate": 8.907622646314601e-07, "loss": 0.0375, "step": 3722 }, { "epoch": 3.30639431616341, "grad_norm": 0.39268346732364684, "learning_rate": 8.885555190229889e-07, "loss": 0.0327, "step": 3723 }, { "epoch": 3.3072824156305507, "grad_norm": 0.3315670230253401, "learning_rate": 8.86351243664737e-07, "loss": 0.0344, "step": 3724 }, { "epoch": 3.308170515097691, "grad_norm": 0.36826434450412143, "learning_rate": 8.841494398810813e-07, "loss": 0.0287, "step": 3725 }, { "epoch": 3.3090586145648313, "grad_norm": 0.40366740190123895, "learning_rate": 8.819501089949179e-07, "loss": 0.0257, "step": 3726 }, { "epoch": 3.3099467140319714, "grad_norm": 0.3099488079064351, "learning_rate": 8.797532523276542e-07, "loss": 0.0255, "step": 3727 }, { "epoch": 3.310834813499112, "grad_norm": 0.5004868511195822, "learning_rate": 8.775588711992117e-07, "loss": 0.0309, "step": 3728 }, { "epoch": 3.3117229129662524, "grad_norm": 0.4152989274534055, "learning_rate": 8.753669669280263e-07, "loss": 0.0439, "step": 3729 }, { "epoch": 3.3126110124333925, "grad_norm": 0.4499840275721616, "learning_rate": 8.731775408310439e-07, "loss": 0.0331, "step": 3730 }, { "epoch": 3.313499111900533, "grad_norm": 0.48187952598845685, "learning_rate": 8.709905942237206e-07, "loss": 0.0306, "step": 3731 }, { "epoch": 3.314387211367673, "grad_norm": 0.610831295084475, "learning_rate": 8.688061284200266e-07, "loss": 0.0384, "step": 3732 }, { "epoch": 3.3152753108348136, "grad_norm": 0.4034821390647482, "learning_rate": 8.666241447324364e-07, "loss": 0.0311, "step": 3733 }, { "epoch": 3.3161634103019537, "grad_norm": 0.6499533896580538, "learning_rate": 8.644446444719385e-07, "loss": 0.0395, "step": 3734 }, { "epoch": 3.3170515097690942, "grad_norm": 0.4237121525319725, "learning_rate": 8.622676289480248e-07, "loss": 0.04, "step": 3735 }, { "epoch": 3.3179396092362343, "grad_norm": 0.3803000710175403, "learning_rate": 8.600930994686962e-07, "loss": 0.0325, "step": 3736 }, { "epoch": 3.318827708703375, "grad_norm": 0.3969071493516198, "learning_rate": 8.579210573404606e-07, "loss": 0.0308, "step": 3737 }, { "epoch": 3.319715808170515, "grad_norm": 0.39349131446670027, "learning_rate": 8.557515038683328e-07, "loss": 0.0294, "step": 3738 }, { "epoch": 3.3206039076376554, "grad_norm": 0.37583904590243433, "learning_rate": 8.535844403558263e-07, "loss": 0.0337, "step": 3739 }, { "epoch": 3.321492007104796, "grad_norm": 0.3232199398831246, "learning_rate": 8.51419868104964e-07, "loss": 0.0232, "step": 3740 }, { "epoch": 3.322380106571936, "grad_norm": 0.39660634854943194, "learning_rate": 8.492577884162728e-07, "loss": 0.0344, "step": 3741 }, { "epoch": 3.323268206039076, "grad_norm": 0.5936043101926092, "learning_rate": 8.47098202588778e-07, "loss": 0.0327, "step": 3742 }, { "epoch": 3.3241563055062167, "grad_norm": 0.40726891703153134, "learning_rate": 8.449411119200074e-07, "loss": 0.0387, "step": 3743 }, { "epoch": 3.325044404973357, "grad_norm": 0.3523287663473434, "learning_rate": 8.42786517705993e-07, "loss": 0.032, "step": 3744 }, { "epoch": 3.3259325044404973, "grad_norm": 0.3076092239957714, "learning_rate": 8.406344212412615e-07, "loss": 0.0281, "step": 3745 }, { "epoch": 3.326820603907638, "grad_norm": 0.3798972726179211, "learning_rate": 8.384848238188447e-07, "loss": 0.0294, "step": 3746 }, { "epoch": 3.327708703374778, "grad_norm": 0.3587293031362081, "learning_rate": 8.363377267302691e-07, "loss": 0.031, "step": 3747 }, { "epoch": 3.3285968028419184, "grad_norm": 0.3541181682472956, "learning_rate": 8.341931312655582e-07, "loss": 0.029, "step": 3748 }, { "epoch": 3.3294849023090585, "grad_norm": 0.3766437324480442, "learning_rate": 8.320510387132358e-07, "loss": 0.0343, "step": 3749 }, { "epoch": 3.330373001776199, "grad_norm": 0.34932698482645635, "learning_rate": 8.299114503603229e-07, "loss": 0.0293, "step": 3750 }, { "epoch": 3.331261101243339, "grad_norm": 0.42408207272194803, "learning_rate": 8.277743674923272e-07, "loss": 0.0333, "step": 3751 }, { "epoch": 3.3321492007104796, "grad_norm": 0.39452469954487357, "learning_rate": 8.256397913932612e-07, "loss": 0.0331, "step": 3752 }, { "epoch": 3.3330373001776197, "grad_norm": 0.3351727421594338, "learning_rate": 8.235077233456273e-07, "loss": 0.0322, "step": 3753 }, { "epoch": 3.33392539964476, "grad_norm": 0.37069347321101703, "learning_rate": 8.213781646304209e-07, "loss": 0.0308, "step": 3754 }, { "epoch": 3.3348134991119007, "grad_norm": 0.4026695093751538, "learning_rate": 8.192511165271267e-07, "loss": 0.0348, "step": 3755 }, { "epoch": 3.335701598579041, "grad_norm": 0.39525398196727407, "learning_rate": 8.171265803137279e-07, "loss": 0.0313, "step": 3756 }, { "epoch": 3.3365896980461813, "grad_norm": 0.376849915981171, "learning_rate": 8.150045572666921e-07, "loss": 0.0296, "step": 3757 }, { "epoch": 3.3374777975133214, "grad_norm": 0.38621599822308506, "learning_rate": 8.128850486609813e-07, "loss": 0.0375, "step": 3758 }, { "epoch": 3.338365896980462, "grad_norm": 0.44042208618378603, "learning_rate": 8.107680557700442e-07, "loss": 0.0317, "step": 3759 }, { "epoch": 3.339253996447602, "grad_norm": 0.334702391912227, "learning_rate": 8.086535798658168e-07, "loss": 0.0338, "step": 3760 }, { "epoch": 3.3401420959147425, "grad_norm": 0.31134998390749746, "learning_rate": 8.065416222187283e-07, "loss": 0.0281, "step": 3761 }, { "epoch": 3.3410301953818826, "grad_norm": 0.330437831672938, "learning_rate": 8.044321840976876e-07, "loss": 0.0282, "step": 3762 }, { "epoch": 3.341918294849023, "grad_norm": 0.37975124020700324, "learning_rate": 8.02325266770097e-07, "loss": 0.0366, "step": 3763 }, { "epoch": 3.342806394316163, "grad_norm": 0.32857743009123846, "learning_rate": 8.002208715018383e-07, "loss": 0.031, "step": 3764 }, { "epoch": 3.3436944937833037, "grad_norm": 0.3337041915180823, "learning_rate": 7.981189995572825e-07, "loss": 0.0268, "step": 3765 }, { "epoch": 3.3445825932504443, "grad_norm": 0.431321499055408, "learning_rate": 7.960196521992797e-07, "loss": 0.0266, "step": 3766 }, { "epoch": 3.3454706927175843, "grad_norm": 0.3830801589024942, "learning_rate": 7.939228306891695e-07, "loss": 0.0417, "step": 3767 }, { "epoch": 3.346358792184725, "grad_norm": 0.34391562471941695, "learning_rate": 7.918285362867684e-07, "loss": 0.0353, "step": 3768 }, { "epoch": 3.347246891651865, "grad_norm": 0.33981672446127464, "learning_rate": 7.897367702503755e-07, "loss": 0.0301, "step": 3769 }, { "epoch": 3.3481349911190055, "grad_norm": 0.31377349052923625, "learning_rate": 7.876475338367745e-07, "loss": 0.0293, "step": 3770 }, { "epoch": 3.3490230905861456, "grad_norm": 0.33842976003769915, "learning_rate": 7.855608283012251e-07, "loss": 0.025, "step": 3771 }, { "epoch": 3.349911190053286, "grad_norm": 0.3429127010785955, "learning_rate": 7.834766548974665e-07, "loss": 0.0327, "step": 3772 }, { "epoch": 3.350799289520426, "grad_norm": 0.7929300728178934, "learning_rate": 7.813950148777205e-07, "loss": 0.0334, "step": 3773 }, { "epoch": 3.3516873889875667, "grad_norm": 0.3677005393618327, "learning_rate": 7.793159094926822e-07, "loss": 0.0248, "step": 3774 }, { "epoch": 3.3525754884547068, "grad_norm": 0.39449591355166663, "learning_rate": 7.772393399915284e-07, "loss": 0.0328, "step": 3775 }, { "epoch": 3.3534635879218473, "grad_norm": 0.34005889484332136, "learning_rate": 7.751653076219073e-07, "loss": 0.0291, "step": 3776 }, { "epoch": 3.354351687388988, "grad_norm": 0.6237973720484827, "learning_rate": 7.730938136299448e-07, "loss": 0.0352, "step": 3777 }, { "epoch": 3.355239786856128, "grad_norm": 0.39755909316445903, "learning_rate": 7.710248592602438e-07, "loss": 0.0343, "step": 3778 }, { "epoch": 3.356127886323268, "grad_norm": 0.3321730432216352, "learning_rate": 7.689584457558808e-07, "loss": 0.0296, "step": 3779 }, { "epoch": 3.3570159857904085, "grad_norm": 0.33482379380260235, "learning_rate": 7.668945743584005e-07, "loss": 0.0305, "step": 3780 }, { "epoch": 3.357904085257549, "grad_norm": 0.33133186856325647, "learning_rate": 7.648332463078261e-07, "loss": 0.0301, "step": 3781 }, { "epoch": 3.358792184724689, "grad_norm": 0.31778126152603536, "learning_rate": 7.62774462842652e-07, "loss": 0.0275, "step": 3782 }, { "epoch": 3.3596802841918296, "grad_norm": 0.3147898697081914, "learning_rate": 7.607182251998419e-07, "loss": 0.0311, "step": 3783 }, { "epoch": 3.3605683836589697, "grad_norm": 0.3680446869382627, "learning_rate": 7.586645346148292e-07, "loss": 0.033, "step": 3784 }, { "epoch": 3.3614564831261102, "grad_norm": 0.4776113172426281, "learning_rate": 7.566133923215202e-07, "loss": 0.0304, "step": 3785 }, { "epoch": 3.3623445825932503, "grad_norm": 0.3297207566950773, "learning_rate": 7.545647995522865e-07, "loss": 0.0304, "step": 3786 }, { "epoch": 3.363232682060391, "grad_norm": 0.3819800062499834, "learning_rate": 7.525187575379717e-07, "loss": 0.0348, "step": 3787 }, { "epoch": 3.364120781527531, "grad_norm": 0.3428977890729436, "learning_rate": 7.504752675078836e-07, "loss": 0.0286, "step": 3788 }, { "epoch": 3.3650088809946714, "grad_norm": 0.3577025132434273, "learning_rate": 7.484343306897973e-07, "loss": 0.0303, "step": 3789 }, { "epoch": 3.3658969804618115, "grad_norm": 0.7296831038086915, "learning_rate": 7.463959483099547e-07, "loss": 0.0438, "step": 3790 }, { "epoch": 3.366785079928952, "grad_norm": 0.4401383270258742, "learning_rate": 7.443601215930662e-07, "loss": 0.0336, "step": 3791 }, { "epoch": 3.3676731793960926, "grad_norm": 0.36126138694487814, "learning_rate": 7.423268517622972e-07, "loss": 0.0335, "step": 3792 }, { "epoch": 3.3685612788632326, "grad_norm": 0.3572275024120499, "learning_rate": 7.402961400392867e-07, "loss": 0.0314, "step": 3793 }, { "epoch": 3.369449378330373, "grad_norm": 0.4306726882285608, "learning_rate": 7.382679876441329e-07, "loss": 0.0304, "step": 3794 }, { "epoch": 3.3703374777975132, "grad_norm": 0.38675925468022065, "learning_rate": 7.362423957953957e-07, "loss": 0.0351, "step": 3795 }, { "epoch": 3.3712255772646538, "grad_norm": 0.4434276372282579, "learning_rate": 7.34219365710096e-07, "loss": 0.0311, "step": 3796 }, { "epoch": 3.372113676731794, "grad_norm": 0.41839888548301196, "learning_rate": 7.321988986037193e-07, "loss": 0.0294, "step": 3797 }, { "epoch": 3.3730017761989344, "grad_norm": 0.316783129372031, "learning_rate": 7.301809956902051e-07, "loss": 0.0298, "step": 3798 }, { "epoch": 3.3738898756660745, "grad_norm": 0.4591931338383639, "learning_rate": 7.281656581819596e-07, "loss": 0.031, "step": 3799 }, { "epoch": 3.374777975133215, "grad_norm": 0.3422224581805127, "learning_rate": 7.261528872898421e-07, "loss": 0.0218, "step": 3800 }, { "epoch": 3.375666074600355, "grad_norm": 0.3703462348547607, "learning_rate": 7.241426842231697e-07, "loss": 0.0366, "step": 3801 }, { "epoch": 3.3765541740674956, "grad_norm": 0.4809962800480066, "learning_rate": 7.221350501897217e-07, "loss": 0.0329, "step": 3802 }, { "epoch": 3.377442273534636, "grad_norm": 0.3213178428399724, "learning_rate": 7.201299863957295e-07, "loss": 0.0263, "step": 3803 }, { "epoch": 3.378330373001776, "grad_norm": 0.33091908597318836, "learning_rate": 7.181274940458804e-07, "loss": 0.0277, "step": 3804 }, { "epoch": 3.3792184724689167, "grad_norm": 0.3270173800334416, "learning_rate": 7.161275743433182e-07, "loss": 0.0289, "step": 3805 }, { "epoch": 3.380106571936057, "grad_norm": 0.3530297617803138, "learning_rate": 7.141302284896424e-07, "loss": 0.0307, "step": 3806 }, { "epoch": 3.3809946714031973, "grad_norm": 0.4346169163369569, "learning_rate": 7.121354576849027e-07, "loss": 0.0328, "step": 3807 }, { "epoch": 3.3818827708703374, "grad_norm": 0.3463311142800992, "learning_rate": 7.101432631276022e-07, "loss": 0.0314, "step": 3808 }, { "epoch": 3.382770870337478, "grad_norm": 0.3468160599220919, "learning_rate": 7.081536460146999e-07, "loss": 0.0303, "step": 3809 }, { "epoch": 3.383658969804618, "grad_norm": 0.2985138989315133, "learning_rate": 7.061666075416002e-07, "loss": 0.0244, "step": 3810 }, { "epoch": 3.3845470692717585, "grad_norm": 0.35681644177401894, "learning_rate": 7.041821489021639e-07, "loss": 0.0314, "step": 3811 }, { "epoch": 3.3854351687388986, "grad_norm": 0.3627834172517948, "learning_rate": 7.022002712886989e-07, "loss": 0.034, "step": 3812 }, { "epoch": 3.386323268206039, "grad_norm": 0.3246212083977764, "learning_rate": 7.002209758919609e-07, "loss": 0.0334, "step": 3813 }, { "epoch": 3.387211367673179, "grad_norm": 0.37909799404114786, "learning_rate": 6.982442639011589e-07, "loss": 0.04, "step": 3814 }, { "epoch": 3.3880994671403197, "grad_norm": 0.35868018390808487, "learning_rate": 6.962701365039448e-07, "loss": 0.0416, "step": 3815 }, { "epoch": 3.38898756660746, "grad_norm": 0.35761964905048566, "learning_rate": 6.942985948864195e-07, "loss": 0.0353, "step": 3816 }, { "epoch": 3.3898756660746003, "grad_norm": 0.377788647672182, "learning_rate": 6.92329640233132e-07, "loss": 0.0301, "step": 3817 }, { "epoch": 3.390763765541741, "grad_norm": 0.34355742105522385, "learning_rate": 6.903632737270732e-07, "loss": 0.0309, "step": 3818 }, { "epoch": 3.391651865008881, "grad_norm": 0.3606687716210664, "learning_rate": 6.883994965496832e-07, "loss": 0.0349, "step": 3819 }, { "epoch": 3.3925399644760215, "grad_norm": 0.37505313589347944, "learning_rate": 6.864383098808453e-07, "loss": 0.036, "step": 3820 }, { "epoch": 3.3934280639431615, "grad_norm": 0.3730142697141216, "learning_rate": 6.844797148988824e-07, "loss": 0.0348, "step": 3821 }, { "epoch": 3.394316163410302, "grad_norm": 0.37808648585134375, "learning_rate": 6.825237127805645e-07, "loss": 0.0322, "step": 3822 }, { "epoch": 3.395204262877442, "grad_norm": 0.45339569519833145, "learning_rate": 6.805703047011048e-07, "loss": 0.0379, "step": 3823 }, { "epoch": 3.3960923623445827, "grad_norm": 0.32829297562673115, "learning_rate": 6.786194918341532e-07, "loss": 0.0283, "step": 3824 }, { "epoch": 3.3969804618117228, "grad_norm": 0.4386721053287848, "learning_rate": 6.766712753518029e-07, "loss": 0.0406, "step": 3825 }, { "epoch": 3.3978685612788633, "grad_norm": 0.3074810407951773, "learning_rate": 6.747256564245886e-07, "loss": 0.0261, "step": 3826 }, { "epoch": 3.3987566607460034, "grad_norm": 0.3103220152117213, "learning_rate": 6.727826362214806e-07, "loss": 0.0257, "step": 3827 }, { "epoch": 3.399644760213144, "grad_norm": 0.5819057664604828, "learning_rate": 6.708422159098927e-07, "loss": 0.0338, "step": 3828 }, { "epoch": 3.4005328596802844, "grad_norm": 0.3441348225626167, "learning_rate": 6.68904396655673e-07, "loss": 0.0316, "step": 3829 }, { "epoch": 3.4014209591474245, "grad_norm": 0.34261708764837284, "learning_rate": 6.66969179623106e-07, "loss": 0.0342, "step": 3830 }, { "epoch": 3.402309058614565, "grad_norm": 0.3593789300176169, "learning_rate": 6.650365659749158e-07, "loss": 0.0333, "step": 3831 }, { "epoch": 3.403197158081705, "grad_norm": 0.38578061504573385, "learning_rate": 6.631065568722633e-07, "loss": 0.0312, "step": 3832 }, { "epoch": 3.4040852575488456, "grad_norm": 0.36694991342779637, "learning_rate": 6.61179153474738e-07, "loss": 0.0338, "step": 3833 }, { "epoch": 3.4049733570159857, "grad_norm": 0.3054111576522598, "learning_rate": 6.592543569403709e-07, "loss": 0.027, "step": 3834 }, { "epoch": 3.405861456483126, "grad_norm": 0.5506357580003206, "learning_rate": 6.573321684256239e-07, "loss": 0.0374, "step": 3835 }, { "epoch": 3.4067495559502663, "grad_norm": 0.39472711253687154, "learning_rate": 6.554125890853913e-07, "loss": 0.0275, "step": 3836 }, { "epoch": 3.407637655417407, "grad_norm": 0.35724093468200274, "learning_rate": 6.534956200729997e-07, "loss": 0.0248, "step": 3837 }, { "epoch": 3.408525754884547, "grad_norm": 0.31993450972444076, "learning_rate": 6.515812625402096e-07, "loss": 0.0317, "step": 3838 }, { "epoch": 3.4094138543516874, "grad_norm": 0.35658484753581615, "learning_rate": 6.496695176372092e-07, "loss": 0.0335, "step": 3839 }, { "epoch": 3.410301953818828, "grad_norm": 0.38624270368575286, "learning_rate": 6.477603865126214e-07, "loss": 0.031, "step": 3840 }, { "epoch": 3.411190053285968, "grad_norm": 0.38377401466630545, "learning_rate": 6.458538703134937e-07, "loss": 0.0353, "step": 3841 }, { "epoch": 3.412078152753108, "grad_norm": 0.3571595356343655, "learning_rate": 6.439499701853046e-07, "loss": 0.0331, "step": 3842 }, { "epoch": 3.4129662522202486, "grad_norm": 0.3937687453830249, "learning_rate": 6.420486872719634e-07, "loss": 0.0373, "step": 3843 }, { "epoch": 3.413854351687389, "grad_norm": 0.3466251624699343, "learning_rate": 6.401500227158026e-07, "loss": 0.0314, "step": 3844 }, { "epoch": 3.4147424511545292, "grad_norm": 0.33100852598698377, "learning_rate": 6.382539776575825e-07, "loss": 0.0328, "step": 3845 }, { "epoch": 3.4156305506216698, "grad_norm": 0.4058975020108741, "learning_rate": 6.363605532364931e-07, "loss": 0.0331, "step": 3846 }, { "epoch": 3.41651865008881, "grad_norm": 0.37458646195238915, "learning_rate": 6.344697505901448e-07, "loss": 0.0362, "step": 3847 }, { "epoch": 3.4174067495559504, "grad_norm": 0.3249928537684176, "learning_rate": 6.325815708545774e-07, "loss": 0.0286, "step": 3848 }, { "epoch": 3.4182948490230904, "grad_norm": 0.40535681839925725, "learning_rate": 6.30696015164251e-07, "loss": 0.0368, "step": 3849 }, { "epoch": 3.419182948490231, "grad_norm": 0.2808714507262059, "learning_rate": 6.288130846520518e-07, "loss": 0.0181, "step": 3850 }, { "epoch": 3.420071047957371, "grad_norm": 0.36029572937951054, "learning_rate": 6.269327804492869e-07, "loss": 0.0323, "step": 3851 }, { "epoch": 3.4209591474245116, "grad_norm": 0.4272451003033633, "learning_rate": 6.250551036856872e-07, "loss": 0.03, "step": 3852 }, { "epoch": 3.4218472468916517, "grad_norm": 0.42111729049541885, "learning_rate": 6.231800554894029e-07, "loss": 0.0321, "step": 3853 }, { "epoch": 3.422735346358792, "grad_norm": 0.5370252081560716, "learning_rate": 6.213076369870064e-07, "loss": 0.0397, "step": 3854 }, { "epoch": 3.4236234458259327, "grad_norm": 0.5099818288497758, "learning_rate": 6.194378493034902e-07, "loss": 0.0408, "step": 3855 }, { "epoch": 3.424511545293073, "grad_norm": 0.3515689219016062, "learning_rate": 6.175706935622655e-07, "loss": 0.0293, "step": 3856 }, { "epoch": 3.4253996447602133, "grad_norm": 0.333320779183509, "learning_rate": 6.157061708851608e-07, "loss": 0.0309, "step": 3857 }, { "epoch": 3.4262877442273534, "grad_norm": 0.31170693918969006, "learning_rate": 6.138442823924262e-07, "loss": 0.028, "step": 3858 }, { "epoch": 3.427175843694494, "grad_norm": 0.4287412167074321, "learning_rate": 6.119850292027257e-07, "loss": 0.0345, "step": 3859 }, { "epoch": 3.428063943161634, "grad_norm": 0.34100670754132295, "learning_rate": 6.101284124331425e-07, "loss": 0.0327, "step": 3860 }, { "epoch": 3.4289520426287745, "grad_norm": 0.3658376617686606, "learning_rate": 6.082744331991742e-07, "loss": 0.0295, "step": 3861 }, { "epoch": 3.4298401420959146, "grad_norm": 0.3125205886058981, "learning_rate": 6.064230926147324e-07, "loss": 0.0233, "step": 3862 }, { "epoch": 3.430728241563055, "grad_norm": 0.3845938917922705, "learning_rate": 6.045743917921465e-07, "loss": 0.0307, "step": 3863 }, { "epoch": 3.431616341030195, "grad_norm": 0.3723638375926968, "learning_rate": 6.027283318421606e-07, "loss": 0.0425, "step": 3864 }, { "epoch": 3.4325044404973357, "grad_norm": 0.39544546921658086, "learning_rate": 6.008849138739253e-07, "loss": 0.028, "step": 3865 }, { "epoch": 3.4333925399644762, "grad_norm": 0.3377847882668556, "learning_rate": 5.990441389950103e-07, "loss": 0.0216, "step": 3866 }, { "epoch": 3.4342806394316163, "grad_norm": 0.34808826310593893, "learning_rate": 5.972060083113973e-07, "loss": 0.0265, "step": 3867 }, { "epoch": 3.435168738898757, "grad_norm": 0.36342666131894075, "learning_rate": 5.953705229274758e-07, "loss": 0.0272, "step": 3868 }, { "epoch": 3.436056838365897, "grad_norm": 0.3648828072127289, "learning_rate": 5.935376839460466e-07, "loss": 0.033, "step": 3869 }, { "epoch": 3.4369449378330375, "grad_norm": 0.37588337787367926, "learning_rate": 5.917074924683236e-07, "loss": 0.0334, "step": 3870 }, { "epoch": 3.4378330373001775, "grad_norm": 0.41140078655464724, "learning_rate": 5.898799495939256e-07, "loss": 0.036, "step": 3871 }, { "epoch": 3.438721136767318, "grad_norm": 0.3686579848929117, "learning_rate": 5.880550564208848e-07, "loss": 0.0367, "step": 3872 }, { "epoch": 3.439609236234458, "grad_norm": 0.3536856126935369, "learning_rate": 5.862328140456375e-07, "loss": 0.0274, "step": 3873 }, { "epoch": 3.4404973357015987, "grad_norm": 0.36570242055566476, "learning_rate": 5.844132235630273e-07, "loss": 0.0239, "step": 3874 }, { "epoch": 3.4413854351687387, "grad_norm": 0.3286523043915312, "learning_rate": 5.825962860663076e-07, "loss": 0.0283, "step": 3875 }, { "epoch": 3.4422735346358793, "grad_norm": 0.4536081363912329, "learning_rate": 5.807820026471383e-07, "loss": 0.0346, "step": 3876 }, { "epoch": 3.44316163410302, "grad_norm": 0.3770356725855568, "learning_rate": 5.789703743955782e-07, "loss": 0.0336, "step": 3877 }, { "epoch": 3.44404973357016, "grad_norm": 0.38629127404201613, "learning_rate": 5.771614024000966e-07, "loss": 0.0297, "step": 3878 }, { "epoch": 3.4449378330373, "grad_norm": 0.45554001871843214, "learning_rate": 5.753550877475672e-07, "loss": 0.0395, "step": 3879 }, { "epoch": 3.4458259325044405, "grad_norm": 0.3828994357826869, "learning_rate": 5.735514315232643e-07, "loss": 0.0259, "step": 3880 }, { "epoch": 3.446714031971581, "grad_norm": 0.3400942987707126, "learning_rate": 5.717504348108649e-07, "loss": 0.0271, "step": 3881 }, { "epoch": 3.447602131438721, "grad_norm": 0.40395498260738727, "learning_rate": 5.699520986924506e-07, "loss": 0.0285, "step": 3882 }, { "epoch": 3.4484902309058616, "grad_norm": 0.4610290801217755, "learning_rate": 5.681564242485011e-07, "loss": 0.0361, "step": 3883 }, { "epoch": 3.4493783303730017, "grad_norm": 0.3904955359718084, "learning_rate": 5.663634125579015e-07, "loss": 0.0374, "step": 3884 }, { "epoch": 3.450266429840142, "grad_norm": 0.4051125274418542, "learning_rate": 5.64573064697933e-07, "loss": 0.0314, "step": 3885 }, { "epoch": 3.4511545293072823, "grad_norm": 0.41715274332674607, "learning_rate": 5.627853817442764e-07, "loss": 0.0387, "step": 3886 }, { "epoch": 3.452042628774423, "grad_norm": 0.41375043211529533, "learning_rate": 5.610003647710155e-07, "loss": 0.0358, "step": 3887 }, { "epoch": 3.452930728241563, "grad_norm": 0.3849340825322553, "learning_rate": 5.592180148506266e-07, "loss": 0.0357, "step": 3888 }, { "epoch": 3.4538188277087034, "grad_norm": 0.38366477763105916, "learning_rate": 5.574383330539884e-07, "loss": 0.033, "step": 3889 }, { "epoch": 3.4547069271758435, "grad_norm": 0.36351596629126487, "learning_rate": 5.55661320450373e-07, "loss": 0.0276, "step": 3890 }, { "epoch": 3.455595026642984, "grad_norm": 0.32163822954491966, "learning_rate": 5.53886978107453e-07, "loss": 0.0243, "step": 3891 }, { "epoch": 3.4564831261101245, "grad_norm": 0.3333783406879094, "learning_rate": 5.521153070912905e-07, "loss": 0.0278, "step": 3892 }, { "epoch": 3.4573712255772646, "grad_norm": 0.36992938438025735, "learning_rate": 5.50346308466349e-07, "loss": 0.0319, "step": 3893 }, { "epoch": 3.458259325044405, "grad_norm": 0.4150549761478952, "learning_rate": 5.48579983295483e-07, "loss": 0.0375, "step": 3894 }, { "epoch": 3.459147424511545, "grad_norm": 0.3905131262318993, "learning_rate": 5.46816332639939e-07, "loss": 0.0333, "step": 3895 }, { "epoch": 3.4600355239786857, "grad_norm": 0.29509890571878317, "learning_rate": 5.450553575593614e-07, "loss": 0.0268, "step": 3896 }, { "epoch": 3.460923623445826, "grad_norm": 0.33757827339222873, "learning_rate": 5.432970591117842e-07, "loss": 0.0352, "step": 3897 }, { "epoch": 3.4618117229129663, "grad_norm": 0.3982455299082649, "learning_rate": 5.415414383536311e-07, "loss": 0.0371, "step": 3898 }, { "epoch": 3.4626998223801064, "grad_norm": 0.36234715051564076, "learning_rate": 5.397884963397215e-07, "loss": 0.0308, "step": 3899 }, { "epoch": 3.463587921847247, "grad_norm": 0.34629684712057873, "learning_rate": 5.380382341232626e-07, "loss": 0.0306, "step": 3900 }, { "epoch": 3.464476021314387, "grad_norm": 0.3027187589773608, "learning_rate": 5.362906527558525e-07, "loss": 0.0312, "step": 3901 }, { "epoch": 3.4653641207815276, "grad_norm": 0.3772026083756171, "learning_rate": 5.345457532874782e-07, "loss": 0.0347, "step": 3902 }, { "epoch": 3.466252220248668, "grad_norm": 0.34451385929900896, "learning_rate": 5.328035367665141e-07, "loss": 0.03, "step": 3903 }, { "epoch": 3.467140319715808, "grad_norm": 0.33209481947448777, "learning_rate": 5.310640042397242e-07, "loss": 0.0267, "step": 3904 }, { "epoch": 3.4680284191829482, "grad_norm": 0.4281994897154462, "learning_rate": 5.293271567522629e-07, "loss": 0.0327, "step": 3905 }, { "epoch": 3.4689165186500888, "grad_norm": 0.36843999056279964, "learning_rate": 5.275929953476627e-07, "loss": 0.0301, "step": 3906 }, { "epoch": 3.4698046181172293, "grad_norm": 0.332406773470646, "learning_rate": 5.258615210678508e-07, "loss": 0.0285, "step": 3907 }, { "epoch": 3.4706927175843694, "grad_norm": 0.3390248706610025, "learning_rate": 5.241327349531367e-07, "loss": 0.0333, "step": 3908 }, { "epoch": 3.47158081705151, "grad_norm": 0.4752550168976335, "learning_rate": 5.224066380422143e-07, "loss": 0.0388, "step": 3909 }, { "epoch": 3.47246891651865, "grad_norm": 0.3984019141090526, "learning_rate": 5.206832313721611e-07, "loss": 0.0317, "step": 3910 }, { "epoch": 3.4733570159857905, "grad_norm": 0.3447408401617846, "learning_rate": 5.189625159784411e-07, "loss": 0.0233, "step": 3911 }, { "epoch": 3.4742451154529306, "grad_norm": 0.30433019089185304, "learning_rate": 5.172444928948983e-07, "loss": 0.0313, "step": 3912 }, { "epoch": 3.475133214920071, "grad_norm": 0.398223469497285, "learning_rate": 5.155291631537618e-07, "loss": 0.0288, "step": 3913 }, { "epoch": 3.476021314387211, "grad_norm": 0.3324608022558975, "learning_rate": 5.1381652778564e-07, "loss": 0.0338, "step": 3914 }, { "epoch": 3.4769094138543517, "grad_norm": 0.3742206764308991, "learning_rate": 5.121065878195237e-07, "loss": 0.0253, "step": 3915 }, { "epoch": 3.477797513321492, "grad_norm": 0.36840244472830547, "learning_rate": 5.103993442827832e-07, "loss": 0.037, "step": 3916 }, { "epoch": 3.4786856127886323, "grad_norm": 0.2886020552610924, "learning_rate": 5.08694798201173e-07, "loss": 0.0257, "step": 3917 }, { "epoch": 3.479573712255773, "grad_norm": 0.4458380464085219, "learning_rate": 5.069929505988192e-07, "loss": 0.0367, "step": 3918 }, { "epoch": 3.480461811722913, "grad_norm": 0.4297763864054579, "learning_rate": 5.052938024982328e-07, "loss": 0.0366, "step": 3919 }, { "epoch": 3.4813499111900534, "grad_norm": 0.2868224853190072, "learning_rate": 5.035973549203021e-07, "loss": 0.0231, "step": 3920 }, { "epoch": 3.4822380106571935, "grad_norm": 0.4033760364862946, "learning_rate": 5.019036088842905e-07, "loss": 0.033, "step": 3921 }, { "epoch": 3.483126110124334, "grad_norm": 0.3599477556106266, "learning_rate": 5.002125654078388e-07, "loss": 0.0318, "step": 3922 }, { "epoch": 3.484014209591474, "grad_norm": 0.36309612247904965, "learning_rate": 4.985242255069661e-07, "loss": 0.0293, "step": 3923 }, { "epoch": 3.4849023090586146, "grad_norm": 0.3850741748039999, "learning_rate": 4.968385901960648e-07, "loss": 0.0341, "step": 3924 }, { "epoch": 3.4857904085257547, "grad_norm": 0.3885361747137568, "learning_rate": 4.951556604879049e-07, "loss": 0.0308, "step": 3925 }, { "epoch": 3.4866785079928952, "grad_norm": 0.3632966053399173, "learning_rate": 4.934754373936274e-07, "loss": 0.0304, "step": 3926 }, { "epoch": 3.4875666074600353, "grad_norm": 0.3930929590615878, "learning_rate": 4.917979219227487e-07, "loss": 0.0384, "step": 3927 }, { "epoch": 3.488454706927176, "grad_norm": 0.3896699517381978, "learning_rate": 4.901231150831609e-07, "loss": 0.035, "step": 3928 }, { "epoch": 3.4893428063943164, "grad_norm": 0.47188392203720475, "learning_rate": 4.884510178811242e-07, "loss": 0.04, "step": 3929 }, { "epoch": 3.4902309058614565, "grad_norm": 0.44499118386396463, "learning_rate": 4.867816313212731e-07, "loss": 0.0353, "step": 3930 }, { "epoch": 3.491119005328597, "grad_norm": 0.2983175269243176, "learning_rate": 4.851149564066143e-07, "loss": 0.025, "step": 3931 }, { "epoch": 3.492007104795737, "grad_norm": 0.44784121725297044, "learning_rate": 4.834509941385246e-07, "loss": 0.0348, "step": 3932 }, { "epoch": 3.4928952042628776, "grad_norm": 0.38652222379864165, "learning_rate": 4.817897455167503e-07, "loss": 0.0337, "step": 3933 }, { "epoch": 3.4937833037300177, "grad_norm": 0.37253923439096487, "learning_rate": 4.801312115394064e-07, "loss": 0.0321, "step": 3934 }, { "epoch": 3.494671403197158, "grad_norm": 0.30324285721890126, "learning_rate": 4.784753932029806e-07, "loss": 0.0306, "step": 3935 }, { "epoch": 3.4955595026642983, "grad_norm": 0.3492390703031562, "learning_rate": 4.7682229150232405e-07, "loss": 0.0314, "step": 3936 }, { "epoch": 3.496447602131439, "grad_norm": 0.33696592266827297, "learning_rate": 4.751719074306604e-07, "loss": 0.0289, "step": 3937 }, { "epoch": 3.497335701598579, "grad_norm": 0.3262949590640255, "learning_rate": 4.7352424197957767e-07, "loss": 0.0282, "step": 3938 }, { "epoch": 3.4982238010657194, "grad_norm": 0.33102629789968224, "learning_rate": 4.718792961390295e-07, "loss": 0.0269, "step": 3939 }, { "epoch": 3.49911190053286, "grad_norm": 0.39103829467973583, "learning_rate": 4.7023707089733915e-07, "loss": 0.0291, "step": 3940 }, { "epoch": 3.5, "grad_norm": 0.3244603190489018, "learning_rate": 4.6859756724119297e-07, "loss": 0.0289, "step": 3941 }, { "epoch": 3.50088809946714, "grad_norm": 0.4009753789908271, "learning_rate": 4.669607861556402e-07, "loss": 0.0418, "step": 3942 }, { "epoch": 3.5017761989342806, "grad_norm": 0.44225750663258634, "learning_rate": 4.653267286240998e-07, "loss": 0.0285, "step": 3943 }, { "epoch": 3.502664298401421, "grad_norm": 0.36099165123017973, "learning_rate": 4.6369539562834797e-07, "loss": 0.0278, "step": 3944 }, { "epoch": 3.503552397868561, "grad_norm": 0.37669963389260863, "learning_rate": 4.6206678814852855e-07, "loss": 0.0311, "step": 3945 }, { "epoch": 3.5044404973357017, "grad_norm": 0.3653911199451643, "learning_rate": 4.6044090716314825e-07, "loss": 0.0338, "step": 3946 }, { "epoch": 3.505328596802842, "grad_norm": 0.39058917353430855, "learning_rate": 4.5881775364906957e-07, "loss": 0.0344, "step": 3947 }, { "epoch": 3.5062166962699823, "grad_norm": 0.3922029872761941, "learning_rate": 4.571973285815223e-07, "loss": 0.0278, "step": 3948 }, { "epoch": 3.5071047957371224, "grad_norm": 0.4153649844627601, "learning_rate": 4.555796329340967e-07, "loss": 0.0323, "step": 3949 }, { "epoch": 3.507992895204263, "grad_norm": 0.3922422286850339, "learning_rate": 4.539646676787396e-07, "loss": 0.0331, "step": 3950 }, { "epoch": 3.5088809946714035, "grad_norm": 0.3714168950930685, "learning_rate": 4.523524337857582e-07, "loss": 0.0314, "step": 3951 }, { "epoch": 3.5097690941385435, "grad_norm": 0.4241931810501754, "learning_rate": 4.507429322238221e-07, "loss": 0.0325, "step": 3952 }, { "epoch": 3.5106571936056836, "grad_norm": 0.3622181214141427, "learning_rate": 4.4913616395995505e-07, "loss": 0.0301, "step": 3953 }, { "epoch": 3.511545293072824, "grad_norm": 0.4654443865234001, "learning_rate": 4.4753212995954145e-07, "loss": 0.039, "step": 3954 }, { "epoch": 3.5124333925399647, "grad_norm": 0.3334121843075633, "learning_rate": 4.4593083118632143e-07, "loss": 0.0341, "step": 3955 }, { "epoch": 3.5133214920071048, "grad_norm": 0.405683466103701, "learning_rate": 4.4433226860239043e-07, "loss": 0.0327, "step": 3956 }, { "epoch": 3.5142095914742453, "grad_norm": 0.43076733535358774, "learning_rate": 4.42736443168203e-07, "loss": 0.0382, "step": 3957 }, { "epoch": 3.5150976909413854, "grad_norm": 0.35411383060158536, "learning_rate": 4.4114335584256986e-07, "loss": 0.0298, "step": 3958 }, { "epoch": 3.515985790408526, "grad_norm": 0.35235900431486816, "learning_rate": 4.3955300758264987e-07, "loss": 0.0308, "step": 3959 }, { "epoch": 3.516873889875666, "grad_norm": 0.3366112372387163, "learning_rate": 4.3796539934396307e-07, "loss": 0.028, "step": 3960 }, { "epoch": 3.5177619893428065, "grad_norm": 0.31567997834720296, "learning_rate": 4.363805320803821e-07, "loss": 0.0318, "step": 3961 }, { "epoch": 3.5186500888099466, "grad_norm": 0.4634528280904752, "learning_rate": 4.3479840674413077e-07, "loss": 0.03, "step": 3962 }, { "epoch": 3.519538188277087, "grad_norm": 0.4008635228535786, "learning_rate": 4.3321902428578475e-07, "loss": 0.031, "step": 3963 }, { "epoch": 3.520426287744227, "grad_norm": 0.42058392430256764, "learning_rate": 4.316423856542751e-07, "loss": 0.0337, "step": 3964 }, { "epoch": 3.5213143872113677, "grad_norm": 0.3750614354675925, "learning_rate": 4.3006849179688115e-07, "loss": 0.0333, "step": 3965 }, { "epoch": 3.522202486678508, "grad_norm": 0.4078111616286517, "learning_rate": 4.28497343659236e-07, "loss": 0.0316, "step": 3966 }, { "epoch": 3.5230905861456483, "grad_norm": 0.323934139235122, "learning_rate": 4.269289421853212e-07, "loss": 0.0288, "step": 3967 }, { "epoch": 3.5239786856127884, "grad_norm": 0.36104854254096586, "learning_rate": 4.253632883174663e-07, "loss": 0.0312, "step": 3968 }, { "epoch": 3.524866785079929, "grad_norm": 0.3784468723502184, "learning_rate": 4.2380038299635494e-07, "loss": 0.0312, "step": 3969 }, { "epoch": 3.5257548845470694, "grad_norm": 0.3332069586993085, "learning_rate": 4.2224022716101544e-07, "loss": 0.0273, "step": 3970 }, { "epoch": 3.5266429840142095, "grad_norm": 0.3065169726343551, "learning_rate": 4.2068282174882344e-07, "loss": 0.0283, "step": 3971 }, { "epoch": 3.52753108348135, "grad_norm": 0.37434731360945855, "learning_rate": 4.191281676955061e-07, "loss": 0.0333, "step": 3972 }, { "epoch": 3.52841918294849, "grad_norm": 0.39887429182412376, "learning_rate": 4.1757626593513514e-07, "loss": 0.0295, "step": 3973 }, { "epoch": 3.5293072824156306, "grad_norm": 0.35927540568607647, "learning_rate": 4.160271174001285e-07, "loss": 0.0294, "step": 3974 }, { "epoch": 3.5301953818827707, "grad_norm": 0.3264131244118038, "learning_rate": 4.144807230212483e-07, "loss": 0.0285, "step": 3975 }, { "epoch": 3.5310834813499112, "grad_norm": 0.33956545997361354, "learning_rate": 4.12937083727607e-07, "loss": 0.0266, "step": 3976 }, { "epoch": 3.5319715808170518, "grad_norm": 0.334835037466751, "learning_rate": 4.113962004466554e-07, "loss": 0.0301, "step": 3977 }, { "epoch": 3.532859680284192, "grad_norm": 0.3407365741387528, "learning_rate": 4.0985807410419463e-07, "loss": 0.0324, "step": 3978 }, { "epoch": 3.533747779751332, "grad_norm": 0.377144850050285, "learning_rate": 4.0832270562436436e-07, "loss": 0.0282, "step": 3979 }, { "epoch": 3.5346358792184724, "grad_norm": 0.33712492550672146, "learning_rate": 4.0679009592964834e-07, "loss": 0.0279, "step": 3980 }, { "epoch": 3.535523978685613, "grad_norm": 0.9349739434990134, "learning_rate": 4.052602459408761e-07, "loss": 0.0375, "step": 3981 }, { "epoch": 3.536412078152753, "grad_norm": 0.30334810360924563, "learning_rate": 4.037331565772157e-07, "loss": 0.0247, "step": 3982 }, { "epoch": 3.5373001776198936, "grad_norm": 0.30281540391674105, "learning_rate": 4.0220882875617594e-07, "loss": 0.0308, "step": 3983 }, { "epoch": 3.5381882770870337, "grad_norm": 0.31139441547970165, "learning_rate": 4.0068726339360985e-07, "loss": 0.027, "step": 3984 }, { "epoch": 3.539076376554174, "grad_norm": 0.3528748309527303, "learning_rate": 3.991684614037078e-07, "loss": 0.0273, "step": 3985 }, { "epoch": 3.5399644760213143, "grad_norm": 0.37054804647531303, "learning_rate": 3.9765242369900205e-07, "loss": 0.0278, "step": 3986 }, { "epoch": 3.540852575488455, "grad_norm": 0.3897783120052963, "learning_rate": 3.9613915119036175e-07, "loss": 0.042, "step": 3987 }, { "epoch": 3.5417406749555953, "grad_norm": 0.4120397041206129, "learning_rate": 3.946286447869957e-07, "loss": 0.0428, "step": 3988 }, { "epoch": 3.5426287744227354, "grad_norm": 0.3341949376027205, "learning_rate": 3.9312090539645077e-07, "loss": 0.0241, "step": 3989 }, { "epoch": 3.5435168738898755, "grad_norm": 0.421389210832489, "learning_rate": 3.9161593392461394e-07, "loss": 0.0293, "step": 3990 }, { "epoch": 3.544404973357016, "grad_norm": 0.3121689018226074, "learning_rate": 3.901137312757025e-07, "loss": 0.0305, "step": 3991 }, { "epoch": 3.5452930728241565, "grad_norm": 0.34591273554346214, "learning_rate": 3.886142983522767e-07, "loss": 0.0275, "step": 3992 }, { "epoch": 3.5461811722912966, "grad_norm": 0.35138389068617915, "learning_rate": 3.8711763605523035e-07, "loss": 0.0344, "step": 3993 }, { "epoch": 3.5470692717584367, "grad_norm": 0.33989097463707235, "learning_rate": 3.856237452837919e-07, "loss": 0.0296, "step": 3994 }, { "epoch": 3.547957371225577, "grad_norm": 0.3381630441049522, "learning_rate": 3.841326269355244e-07, "loss": 0.0226, "step": 3995 }, { "epoch": 3.5488454706927177, "grad_norm": 0.34907906949366113, "learning_rate": 3.8264428190632807e-07, "loss": 0.0297, "step": 3996 }, { "epoch": 3.549733570159858, "grad_norm": 0.3866613987831623, "learning_rate": 3.81158711090433e-07, "loss": 0.029, "step": 3997 }, { "epoch": 3.5506216696269983, "grad_norm": 0.3960749087467944, "learning_rate": 3.796759153804053e-07, "loss": 0.0349, "step": 3998 }, { "epoch": 3.5515097690941384, "grad_norm": 0.37080346096911626, "learning_rate": 3.7819589566714223e-07, "loss": 0.027, "step": 3999 }, { "epoch": 3.552397868561279, "grad_norm": 0.35462359752587536, "learning_rate": 3.7671865283987254e-07, "loss": 0.0282, "step": 4000 }, { "epoch": 3.553285968028419, "grad_norm": 0.34920229960648946, "learning_rate": 3.7524418778615903e-07, "loss": 0.0306, "step": 4001 }, { "epoch": 3.5541740674955595, "grad_norm": 0.381088153269292, "learning_rate": 3.7377250139189526e-07, "loss": 0.0335, "step": 4002 }, { "epoch": 3.5550621669627, "grad_norm": 0.6854256037254168, "learning_rate": 3.7230359454130115e-07, "loss": 0.031, "step": 4003 }, { "epoch": 3.55595026642984, "grad_norm": 0.33517642609484855, "learning_rate": 3.7083746811693134e-07, "loss": 0.0298, "step": 4004 }, { "epoch": 3.55683836589698, "grad_norm": 0.31742503207285194, "learning_rate": 3.693741229996689e-07, "loss": 0.0285, "step": 4005 }, { "epoch": 3.5577264653641207, "grad_norm": 0.3882259802457257, "learning_rate": 3.679135600687239e-07, "loss": 0.0342, "step": 4006 }, { "epoch": 3.5586145648312613, "grad_norm": 0.4027570793881262, "learning_rate": 3.664557802016366e-07, "loss": 0.0336, "step": 4007 }, { "epoch": 3.5595026642984013, "grad_norm": 0.3155717693854574, "learning_rate": 3.6500078427427534e-07, "loss": 0.0313, "step": 4008 }, { "epoch": 3.560390763765542, "grad_norm": 0.3891831380287714, "learning_rate": 3.6354857316083293e-07, "loss": 0.0283, "step": 4009 }, { "epoch": 3.561278863232682, "grad_norm": 0.32920527491640894, "learning_rate": 3.620991477338337e-07, "loss": 0.0256, "step": 4010 }, { "epoch": 3.5621669626998225, "grad_norm": 0.42504686946340525, "learning_rate": 3.606525088641244e-07, "loss": 0.0351, "step": 4011 }, { "epoch": 3.5630550621669625, "grad_norm": 0.361425081542175, "learning_rate": 3.5920865742087807e-07, "loss": 0.0323, "step": 4012 }, { "epoch": 3.563943161634103, "grad_norm": 0.3690993683842551, "learning_rate": 3.577675942715958e-07, "loss": 0.0354, "step": 4013 }, { "epoch": 3.5648312611012436, "grad_norm": 0.4222093409356401, "learning_rate": 3.5632932028209944e-07, "loss": 0.0309, "step": 4014 }, { "epoch": 3.5657193605683837, "grad_norm": 0.363286725553558, "learning_rate": 3.548938363165388e-07, "loss": 0.035, "step": 4015 }, { "epoch": 3.5666074600355238, "grad_norm": 0.3116321285071317, "learning_rate": 3.534611432373836e-07, "loss": 0.0255, "step": 4016 }, { "epoch": 3.5674955595026643, "grad_norm": 0.32923152944008094, "learning_rate": 3.5203124190543117e-07, "loss": 0.0236, "step": 4017 }, { "epoch": 3.568383658969805, "grad_norm": 0.38168496867714735, "learning_rate": 3.506041331797966e-07, "loss": 0.0367, "step": 4018 }, { "epoch": 3.569271758436945, "grad_norm": 0.3777866499507286, "learning_rate": 3.491798179179212e-07, "loss": 0.0268, "step": 4019 }, { "epoch": 3.5701598579040854, "grad_norm": 0.493230161127167, "learning_rate": 3.4775829697556607e-07, "loss": 0.0359, "step": 4020 }, { "epoch": 3.5710479573712255, "grad_norm": 0.5076369573429343, "learning_rate": 3.4633957120681294e-07, "loss": 0.0263, "step": 4021 }, { "epoch": 3.571936056838366, "grad_norm": 0.34645969509892166, "learning_rate": 3.4492364146406555e-07, "loss": 0.0271, "step": 4022 }, { "epoch": 3.572824156305506, "grad_norm": 0.36133779292090595, "learning_rate": 3.435105085980467e-07, "loss": 0.0319, "step": 4023 }, { "epoch": 3.5737122557726466, "grad_norm": 0.4159037386682091, "learning_rate": 3.4210017345779777e-07, "loss": 0.0384, "step": 4024 }, { "epoch": 3.5746003552397867, "grad_norm": 0.34398044780432635, "learning_rate": 3.406926368906832e-07, "loss": 0.0265, "step": 4025 }, { "epoch": 3.575488454706927, "grad_norm": 0.3926123342266739, "learning_rate": 3.392878997423804e-07, "loss": 0.0401, "step": 4026 }, { "epoch": 3.5763765541740673, "grad_norm": 0.46764869355924493, "learning_rate": 3.378859628568903e-07, "loss": 0.0465, "step": 4027 }, { "epoch": 3.577264653641208, "grad_norm": 0.3734611748174983, "learning_rate": 3.3648682707652757e-07, "loss": 0.0274, "step": 4028 }, { "epoch": 3.5781527531083483, "grad_norm": 0.3317830032125016, "learning_rate": 3.350904932419241e-07, "loss": 0.0265, "step": 4029 }, { "epoch": 3.5790408525754884, "grad_norm": 0.3140576416703216, "learning_rate": 3.3369696219202996e-07, "loss": 0.026, "step": 4030 }, { "epoch": 3.5799289520426285, "grad_norm": 0.42006519972387857, "learning_rate": 3.3230623476411317e-07, "loss": 0.0357, "step": 4031 }, { "epoch": 3.580817051509769, "grad_norm": 0.48292394902355373, "learning_rate": 3.3091831179375036e-07, "loss": 0.0325, "step": 4032 }, { "epoch": 3.5817051509769096, "grad_norm": 0.2983300115029716, "learning_rate": 3.295331941148394e-07, "loss": 0.0302, "step": 4033 }, { "epoch": 3.5825932504440496, "grad_norm": 0.3894965984084569, "learning_rate": 3.2815088255959195e-07, "loss": 0.0385, "step": 4034 }, { "epoch": 3.58348134991119, "grad_norm": 0.2699639757693006, "learning_rate": 3.267713779585319e-07, "loss": 0.0205, "step": 4035 }, { "epoch": 3.5843694493783302, "grad_norm": 0.3772247650517208, "learning_rate": 3.253946811404957e-07, "loss": 0.0286, "step": 4036 }, { "epoch": 3.5852575488454708, "grad_norm": 0.38702656678018194, "learning_rate": 3.2402079293263666e-07, "loss": 0.0302, "step": 4037 }, { "epoch": 3.586145648312611, "grad_norm": 0.3441910102358272, "learning_rate": 3.2264971416041625e-07, "loss": 0.0286, "step": 4038 }, { "epoch": 3.5870337477797514, "grad_norm": 0.4852995878459521, "learning_rate": 3.212814456476121e-07, "loss": 0.0381, "step": 4039 }, { "epoch": 3.587921847246892, "grad_norm": 0.44584480610267, "learning_rate": 3.1991598821631e-07, "loss": 0.0356, "step": 4040 }, { "epoch": 3.588809946714032, "grad_norm": 0.29655385436452336, "learning_rate": 3.185533426869081e-07, "loss": 0.0291, "step": 4041 }, { "epoch": 3.589698046181172, "grad_norm": 0.26659825956610705, "learning_rate": 3.1719350987811537e-07, "loss": 0.02, "step": 4042 }, { "epoch": 3.5905861456483126, "grad_norm": 0.3981329067663555, "learning_rate": 3.1583649060695223e-07, "loss": 0.0307, "step": 4043 }, { "epoch": 3.591474245115453, "grad_norm": 0.3341932030061367, "learning_rate": 3.1448228568874417e-07, "loss": 0.0297, "step": 4044 }, { "epoch": 3.592362344582593, "grad_norm": 0.28358982706492647, "learning_rate": 3.131308959371293e-07, "loss": 0.0213, "step": 4045 }, { "epoch": 3.5932504440497337, "grad_norm": 0.31834932434390856, "learning_rate": 3.1178232216405536e-07, "loss": 0.0272, "step": 4046 }, { "epoch": 3.594138543516874, "grad_norm": 0.35842766921854496, "learning_rate": 3.104365651797753e-07, "loss": 0.0309, "step": 4047 }, { "epoch": 3.5950266429840143, "grad_norm": 0.2983067942130664, "learning_rate": 3.090936257928501e-07, "loss": 0.0227, "step": 4048 }, { "epoch": 3.5959147424511544, "grad_norm": 0.3557084676876161, "learning_rate": 3.077535048101493e-07, "loss": 0.0282, "step": 4049 }, { "epoch": 3.596802841918295, "grad_norm": 0.33551728305788187, "learning_rate": 3.0641620303684837e-07, "loss": 0.0279, "step": 4050 }, { "epoch": 3.5976909413854354, "grad_norm": 0.32555454976180176, "learning_rate": 3.0508172127642896e-07, "loss": 0.0251, "step": 4051 }, { "epoch": 3.5985790408525755, "grad_norm": 0.3926909676347029, "learning_rate": 3.0375006033067865e-07, "loss": 0.0336, "step": 4052 }, { "epoch": 3.5994671403197156, "grad_norm": 0.34034680626207947, "learning_rate": 3.024212209996885e-07, "loss": 0.0339, "step": 4053 }, { "epoch": 3.600355239786856, "grad_norm": 0.3748121404576559, "learning_rate": 3.010952040818571e-07, "loss": 0.0372, "step": 4054 }, { "epoch": 3.6012433392539966, "grad_norm": 0.3177708420804335, "learning_rate": 2.9977201037388604e-07, "loss": 0.0295, "step": 4055 }, { "epoch": 3.6021314387211367, "grad_norm": 0.41559526622614357, "learning_rate": 2.9845164067077883e-07, "loss": 0.0316, "step": 4056 }, { "epoch": 3.6030195381882772, "grad_norm": 0.36826358350633054, "learning_rate": 2.971340957658447e-07, "loss": 0.0355, "step": 4057 }, { "epoch": 3.6039076376554173, "grad_norm": 0.37983089868627984, "learning_rate": 2.9581937645069614e-07, "loss": 0.0362, "step": 4058 }, { "epoch": 3.604795737122558, "grad_norm": 0.34783175136997246, "learning_rate": 2.9450748351524504e-07, "loss": 0.0316, "step": 4059 }, { "epoch": 3.605683836589698, "grad_norm": 0.39126772271877386, "learning_rate": 2.931984177477071e-07, "loss": 0.0354, "step": 4060 }, { "epoch": 3.6065719360568385, "grad_norm": 0.36861111492060794, "learning_rate": 2.918921799345997e-07, "loss": 0.0275, "step": 4061 }, { "epoch": 3.6074600355239785, "grad_norm": 0.3572386465416735, "learning_rate": 2.905887708607397e-07, "loss": 0.0309, "step": 4062 }, { "epoch": 3.608348134991119, "grad_norm": 0.39986346163611447, "learning_rate": 2.8928819130924656e-07, "loss": 0.0407, "step": 4063 }, { "epoch": 3.609236234458259, "grad_norm": 0.33235581644169715, "learning_rate": 2.8799044206153704e-07, "loss": 0.0276, "step": 4064 }, { "epoch": 3.6101243339253997, "grad_norm": 0.37094624908904245, "learning_rate": 2.8669552389732845e-07, "loss": 0.0334, "step": 4065 }, { "epoch": 3.61101243339254, "grad_norm": 0.3842345252591025, "learning_rate": 2.854034375946385e-07, "loss": 0.0329, "step": 4066 }, { "epoch": 3.6119005328596803, "grad_norm": 0.33697913363697846, "learning_rate": 2.841141839297823e-07, "loss": 0.0302, "step": 4067 }, { "epoch": 3.6127886323268203, "grad_norm": 0.37985667672995926, "learning_rate": 2.828277636773713e-07, "loss": 0.0337, "step": 4068 }, { "epoch": 3.613676731793961, "grad_norm": 0.40078877511862643, "learning_rate": 2.815441776103184e-07, "loss": 0.0453, "step": 4069 }, { "epoch": 3.6145648312611014, "grad_norm": 0.3060248641723875, "learning_rate": 2.802634264998294e-07, "loss": 0.0323, "step": 4070 }, { "epoch": 3.6154529307282415, "grad_norm": 0.3375294217896488, "learning_rate": 2.7898551111541105e-07, "loss": 0.0372, "step": 4071 }, { "epoch": 3.616341030195382, "grad_norm": 0.3845114968023121, "learning_rate": 2.7771043222486416e-07, "loss": 0.0273, "step": 4072 }, { "epoch": 3.617229129662522, "grad_norm": 0.341062787058291, "learning_rate": 2.7643819059428367e-07, "loss": 0.0271, "step": 4073 }, { "epoch": 3.6181172291296626, "grad_norm": 0.3356574146060281, "learning_rate": 2.751687869880626e-07, "loss": 0.0306, "step": 4074 }, { "epoch": 3.6190053285968027, "grad_norm": 0.3162372354593641, "learning_rate": 2.7390222216888804e-07, "loss": 0.0281, "step": 4075 }, { "epoch": 3.619893428063943, "grad_norm": 0.45588194472857524, "learning_rate": 2.7263849689774135e-07, "loss": 0.0412, "step": 4076 }, { "epoch": 3.6207815275310837, "grad_norm": 0.3597778905910394, "learning_rate": 2.7137761193389623e-07, "loss": 0.0315, "step": 4077 }, { "epoch": 3.621669626998224, "grad_norm": 0.3617714366812742, "learning_rate": 2.701195680349228e-07, "loss": 0.0318, "step": 4078 }, { "epoch": 3.622557726465364, "grad_norm": 0.41357031731153654, "learning_rate": 2.68864365956682e-07, "loss": 0.0363, "step": 4079 }, { "epoch": 3.6234458259325044, "grad_norm": 0.37322028907983257, "learning_rate": 2.676120064533289e-07, "loss": 0.0303, "step": 4080 }, { "epoch": 3.624333925399645, "grad_norm": 0.32393294890052976, "learning_rate": 2.663624902773088e-07, "loss": 0.0245, "step": 4081 }, { "epoch": 3.625222024866785, "grad_norm": 0.30567162060640496, "learning_rate": 2.651158181793595e-07, "loss": 0.0234, "step": 4082 }, { "epoch": 3.6261101243339255, "grad_norm": 0.33495454755969983, "learning_rate": 2.6387199090851135e-07, "loss": 0.0319, "step": 4083 }, { "epoch": 3.6269982238010656, "grad_norm": 0.3942255313247027, "learning_rate": 2.6263100921208484e-07, "loss": 0.0364, "step": 4084 }, { "epoch": 3.627886323268206, "grad_norm": 0.5529320872827523, "learning_rate": 2.6139287383568745e-07, "loss": 0.0333, "step": 4085 }, { "epoch": 3.6287744227353462, "grad_norm": 0.3566634202518894, "learning_rate": 2.6015758552322135e-07, "loss": 0.0297, "step": 4086 }, { "epoch": 3.6296625222024868, "grad_norm": 0.42591971243674315, "learning_rate": 2.5892514501687673e-07, "loss": 0.0424, "step": 4087 }, { "epoch": 3.630550621669627, "grad_norm": 0.41957409707316257, "learning_rate": 2.576955530571312e-07, "loss": 0.0363, "step": 4088 }, { "epoch": 3.6314387211367674, "grad_norm": 0.37826897762300055, "learning_rate": 2.5646881038275115e-07, "loss": 0.0295, "step": 4089 }, { "epoch": 3.6323268206039074, "grad_norm": 0.33846011705029777, "learning_rate": 2.552449177307931e-07, "loss": 0.0289, "step": 4090 }, { "epoch": 3.633214920071048, "grad_norm": 0.3434400150376539, "learning_rate": 2.540238758365987e-07, "loss": 0.0275, "step": 4091 }, { "epoch": 3.6341030195381885, "grad_norm": 0.7356552257127704, "learning_rate": 2.52805685433799e-07, "loss": 0.0429, "step": 4092 }, { "epoch": 3.6349911190053286, "grad_norm": 0.3460474954037151, "learning_rate": 2.515903472543102e-07, "loss": 0.0322, "step": 4093 }, { "epoch": 3.6358792184724686, "grad_norm": 0.31851118641004755, "learning_rate": 2.503778620283348e-07, "loss": 0.0267, "step": 4094 }, { "epoch": 3.636767317939609, "grad_norm": 0.6093920804945067, "learning_rate": 2.491682304843629e-07, "loss": 0.0318, "step": 4095 }, { "epoch": 3.6376554174067497, "grad_norm": 0.40552284870398103, "learning_rate": 2.4796145334916867e-07, "loss": 0.0285, "step": 4096 }, { "epoch": 3.6385435168738898, "grad_norm": 0.3387147964505112, "learning_rate": 2.4675753134781043e-07, "loss": 0.028, "step": 4097 }, { "epoch": 3.6394316163410303, "grad_norm": 0.32833452932555013, "learning_rate": 2.455564652036324e-07, "loss": 0.0286, "step": 4098 }, { "epoch": 3.6403197158081704, "grad_norm": 0.33956735996608145, "learning_rate": 2.4435825563826455e-07, "loss": 0.0277, "step": 4099 }, { "epoch": 3.641207815275311, "grad_norm": 0.42900278824587085, "learning_rate": 2.431629033716171e-07, "loss": 0.0463, "step": 4100 }, { "epoch": 3.642095914742451, "grad_norm": 0.2931273940040887, "learning_rate": 2.4197040912188496e-07, "loss": 0.0235, "step": 4101 }, { "epoch": 3.6429840142095915, "grad_norm": 0.5045463380588222, "learning_rate": 2.4078077360554673e-07, "loss": 0.0287, "step": 4102 }, { "epoch": 3.643872113676732, "grad_norm": 0.32863453823771394, "learning_rate": 2.395939975373618e-07, "loss": 0.023, "step": 4103 }, { "epoch": 3.644760213143872, "grad_norm": 0.3280952166052903, "learning_rate": 2.3841008163037415e-07, "loss": 0.0313, "step": 4104 }, { "epoch": 3.645648312611012, "grad_norm": 0.33935062342771694, "learning_rate": 2.3722902659590653e-07, "loss": 0.0256, "step": 4105 }, { "epoch": 3.6465364120781527, "grad_norm": 0.34394834952041614, "learning_rate": 2.3605083314356349e-07, "loss": 0.0276, "step": 4106 }, { "epoch": 3.6474245115452932, "grad_norm": 0.35006997907059395, "learning_rate": 2.3487550198123153e-07, "loss": 0.0295, "step": 4107 }, { "epoch": 3.6483126110124333, "grad_norm": 0.35845802214579003, "learning_rate": 2.3370303381507643e-07, "loss": 0.0293, "step": 4108 }, { "epoch": 3.649200710479574, "grad_norm": 0.2961638070375035, "learning_rate": 2.3253342934954347e-07, "loss": 0.0228, "step": 4109 }, { "epoch": 3.650088809946714, "grad_norm": 0.3581970416631591, "learning_rate": 2.3136668928735838e-07, "loss": 0.0326, "step": 4110 }, { "epoch": 3.6509769094138544, "grad_norm": 0.37317477081009887, "learning_rate": 2.3020281432952485e-07, "loss": 0.0305, "step": 4111 }, { "epoch": 3.6518650088809945, "grad_norm": 0.3321849730320183, "learning_rate": 2.2904180517532682e-07, "loss": 0.0338, "step": 4112 }, { "epoch": 3.652753108348135, "grad_norm": 0.34095544835376845, "learning_rate": 2.2788366252232408e-07, "loss": 0.0309, "step": 4113 }, { "epoch": 3.6536412078152756, "grad_norm": 0.3135931275379927, "learning_rate": 2.2672838706635557e-07, "loss": 0.0345, "step": 4114 }, { "epoch": 3.6545293072824157, "grad_norm": 0.3398201526476228, "learning_rate": 2.255759795015372e-07, "loss": 0.0277, "step": 4115 }, { "epoch": 3.6554174067495557, "grad_norm": 0.4258577290727356, "learning_rate": 2.2442644052026286e-07, "loss": 0.0358, "step": 4116 }, { "epoch": 3.6563055062166963, "grad_norm": 0.344712297338292, "learning_rate": 2.2327977081320064e-07, "loss": 0.0364, "step": 4117 }, { "epoch": 3.657193605683837, "grad_norm": 0.2844802997510866, "learning_rate": 2.2213597106929608e-07, "loss": 0.0254, "step": 4118 }, { "epoch": 3.658081705150977, "grad_norm": 0.3482327656423746, "learning_rate": 2.209950419757717e-07, "loss": 0.0385, "step": 4119 }, { "epoch": 3.6589698046181174, "grad_norm": 0.409568753384068, "learning_rate": 2.1985698421812308e-07, "loss": 0.0299, "step": 4120 }, { "epoch": 3.6598579040852575, "grad_norm": 0.30398466916264566, "learning_rate": 2.1872179848012099e-07, "loss": 0.0297, "step": 4121 }, { "epoch": 3.660746003552398, "grad_norm": 0.3286674712491026, "learning_rate": 2.1758948544381153e-07, "loss": 0.0277, "step": 4122 }, { "epoch": 3.661634103019538, "grad_norm": 0.38599884179691046, "learning_rate": 2.164600457895144e-07, "loss": 0.022, "step": 4123 }, { "epoch": 3.6625222024866786, "grad_norm": 0.3298170939527743, "learning_rate": 2.153334801958229e-07, "loss": 0.0281, "step": 4124 }, { "epoch": 3.6634103019538187, "grad_norm": 0.35507918769242575, "learning_rate": 2.1420978933960334e-07, "loss": 0.0298, "step": 4125 }, { "epoch": 3.664298401420959, "grad_norm": 0.657045311502623, "learning_rate": 2.130889738959946e-07, "loss": 0.0396, "step": 4126 }, { "epoch": 3.6651865008880993, "grad_norm": 0.28407464356676004, "learning_rate": 2.119710345384085e-07, "loss": 0.0237, "step": 4127 }, { "epoch": 3.66607460035524, "grad_norm": 0.4714399982634715, "learning_rate": 2.1085597193853002e-07, "loss": 0.0345, "step": 4128 }, { "epoch": 3.6669626998223803, "grad_norm": 0.3325805043726439, "learning_rate": 2.0974378676631269e-07, "loss": 0.0323, "step": 4129 }, { "epoch": 3.6678507992895204, "grad_norm": 0.3715649450962102, "learning_rate": 2.086344796899825e-07, "loss": 0.0348, "step": 4130 }, { "epoch": 3.6687388987566605, "grad_norm": 0.34322855603072555, "learning_rate": 2.0752805137603914e-07, "loss": 0.0308, "step": 4131 }, { "epoch": 3.669626998223801, "grad_norm": 0.3079586875780006, "learning_rate": 2.064245024892475e-07, "loss": 0.0286, "step": 4132 }, { "epoch": 3.6705150976909415, "grad_norm": 0.4725254838554427, "learning_rate": 2.0532383369264663e-07, "loss": 0.0354, "step": 4133 }, { "epoch": 3.6714031971580816, "grad_norm": 0.3993837533428711, "learning_rate": 2.042260456475431e-07, "loss": 0.0315, "step": 4134 }, { "epoch": 3.672291296625222, "grad_norm": 0.37875590996423947, "learning_rate": 2.0313113901351266e-07, "loss": 0.0346, "step": 4135 }, { "epoch": 3.673179396092362, "grad_norm": 0.33922148147615033, "learning_rate": 2.0203911444840184e-07, "loss": 0.0278, "step": 4136 }, { "epoch": 3.6740674955595027, "grad_norm": 0.28865242326757395, "learning_rate": 2.009499726083225e-07, "loss": 0.0249, "step": 4137 }, { "epoch": 3.674955595026643, "grad_norm": 0.4428997092567071, "learning_rate": 1.9986371414765615e-07, "loss": 0.0318, "step": 4138 }, { "epoch": 3.6758436944937833, "grad_norm": 0.39516553402180626, "learning_rate": 1.987803397190524e-07, "loss": 0.04, "step": 4139 }, { "epoch": 3.676731793960924, "grad_norm": 0.33091490578280347, "learning_rate": 1.9769984997342838e-07, "loss": 0.028, "step": 4140 }, { "epoch": 3.677619893428064, "grad_norm": 0.3335222004990329, "learning_rate": 1.966222455599659e-07, "loss": 0.0295, "step": 4141 }, { "epoch": 3.678507992895204, "grad_norm": 0.3245358540412729, "learning_rate": 1.9554752712611368e-07, "loss": 0.0257, "step": 4142 }, { "epoch": 3.6793960923623446, "grad_norm": 0.2964035236786363, "learning_rate": 1.9447569531758913e-07, "loss": 0.0224, "step": 4143 }, { "epoch": 3.680284191829485, "grad_norm": 0.3386943141170172, "learning_rate": 1.9340675077837156e-07, "loss": 0.03, "step": 4144 }, { "epoch": 3.681172291296625, "grad_norm": 0.34385542148242515, "learning_rate": 1.9234069415070944e-07, "loss": 0.0276, "step": 4145 }, { "epoch": 3.6820603907637657, "grad_norm": 0.40638885409888764, "learning_rate": 1.9127752607511263e-07, "loss": 0.0329, "step": 4146 }, { "epoch": 3.6829484902309058, "grad_norm": 0.2950797220950981, "learning_rate": 1.902172471903563e-07, "loss": 0.0278, "step": 4147 }, { "epoch": 3.6838365896980463, "grad_norm": 0.4421989622826594, "learning_rate": 1.891598581334825e-07, "loss": 0.0424, "step": 4148 }, { "epoch": 3.6847246891651864, "grad_norm": 0.3369769711437819, "learning_rate": 1.8810535953979304e-07, "loss": 0.0386, "step": 4149 }, { "epoch": 3.685612788632327, "grad_norm": 0.312409083332414, "learning_rate": 1.8705375204285503e-07, "loss": 0.029, "step": 4150 }, { "epoch": 3.6865008880994674, "grad_norm": 0.5383907853717661, "learning_rate": 1.8600503627449972e-07, "loss": 0.0335, "step": 4151 }, { "epoch": 3.6873889875666075, "grad_norm": 0.5185575405835587, "learning_rate": 1.8495921286481755e-07, "loss": 0.0369, "step": 4152 }, { "epoch": 3.6882770870337476, "grad_norm": 0.688257047676778, "learning_rate": 1.8391628244216531e-07, "loss": 0.045, "step": 4153 }, { "epoch": 3.689165186500888, "grad_norm": 0.3663439018419157, "learning_rate": 1.8287624563315842e-07, "loss": 0.0284, "step": 4154 }, { "epoch": 3.6900532859680286, "grad_norm": 0.34515058573917856, "learning_rate": 1.8183910306267427e-07, "loss": 0.0269, "step": 4155 }, { "epoch": 3.6909413854351687, "grad_norm": 0.38381647555908577, "learning_rate": 1.8080485535385327e-07, "loss": 0.0349, "step": 4156 }, { "epoch": 3.691829484902309, "grad_norm": 0.3845552524534645, "learning_rate": 1.797735031280956e-07, "loss": 0.0371, "step": 4157 }, { "epoch": 3.6927175843694493, "grad_norm": 0.3009108341562991, "learning_rate": 1.787450470050589e-07, "loss": 0.023, "step": 4158 }, { "epoch": 3.69360568383659, "grad_norm": 0.34032029486921384, "learning_rate": 1.77719487602665e-07, "loss": 0.0337, "step": 4159 }, { "epoch": 3.69449378330373, "grad_norm": 0.3783272835463549, "learning_rate": 1.7669682553709323e-07, "loss": 0.0413, "step": 4160 }, { "epoch": 3.6953818827708704, "grad_norm": 0.36760174865925677, "learning_rate": 1.7567706142278318e-07, "loss": 0.0332, "step": 4161 }, { "epoch": 3.6962699822380105, "grad_norm": 0.30744482215918945, "learning_rate": 1.7466019587243088e-07, "loss": 0.0255, "step": 4162 }, { "epoch": 3.697158081705151, "grad_norm": 0.4482029084810956, "learning_rate": 1.7364622949699373e-07, "loss": 0.0333, "step": 4163 }, { "epoch": 3.698046181172291, "grad_norm": 0.3382286333629386, "learning_rate": 1.726351629056855e-07, "loss": 0.0282, "step": 4164 }, { "epoch": 3.6989342806394316, "grad_norm": 0.3602323470891706, "learning_rate": 1.716269967059786e-07, "loss": 0.0337, "step": 4165 }, { "epoch": 3.699822380106572, "grad_norm": 0.3716082610197423, "learning_rate": 1.706217315036024e-07, "loss": 0.0247, "step": 4166 }, { "epoch": 3.7007104795737122, "grad_norm": 0.40300697352721165, "learning_rate": 1.6961936790254207e-07, "loss": 0.0315, "step": 4167 }, { "epoch": 3.7015985790408523, "grad_norm": 0.27629597358248276, "learning_rate": 1.6861990650504256e-07, "loss": 0.0214, "step": 4168 }, { "epoch": 3.702486678507993, "grad_norm": 0.3685144138535106, "learning_rate": 1.6762334791160296e-07, "loss": 0.0378, "step": 4169 }, { "epoch": 3.7033747779751334, "grad_norm": 0.473328561248784, "learning_rate": 1.6662969272097652e-07, "loss": 0.0307, "step": 4170 }, { "epoch": 3.7042628774422734, "grad_norm": 0.38700449724979513, "learning_rate": 1.6563894153017567e-07, "loss": 0.0335, "step": 4171 }, { "epoch": 3.705150976909414, "grad_norm": 0.36352673886304254, "learning_rate": 1.6465109493446708e-07, "loss": 0.031, "step": 4172 }, { "epoch": 3.706039076376554, "grad_norm": 0.3335281985604826, "learning_rate": 1.6366615352737092e-07, "loss": 0.0296, "step": 4173 }, { "epoch": 3.7069271758436946, "grad_norm": 0.3298592306508931, "learning_rate": 1.6268411790066218e-07, "loss": 0.0307, "step": 4174 }, { "epoch": 3.7078152753108347, "grad_norm": 0.34974447407247544, "learning_rate": 1.6170498864437112e-07, "loss": 0.0322, "step": 4175 }, { "epoch": 3.708703374777975, "grad_norm": 0.37732363449954476, "learning_rate": 1.6072876634677993e-07, "loss": 0.0378, "step": 4176 }, { "epoch": 3.7095914742451157, "grad_norm": 0.3972473100874786, "learning_rate": 1.5975545159442717e-07, "loss": 0.0266, "step": 4177 }, { "epoch": 3.710479573712256, "grad_norm": 0.31543674418786594, "learning_rate": 1.5878504497210123e-07, "loss": 0.0283, "step": 4178 }, { "epoch": 3.711367673179396, "grad_norm": 0.2643553116005103, "learning_rate": 1.5781754706284512e-07, "loss": 0.0284, "step": 4179 }, { "epoch": 3.7122557726465364, "grad_norm": 0.44185218697847933, "learning_rate": 1.568529584479539e-07, "loss": 0.0399, "step": 4180 }, { "epoch": 3.713143872113677, "grad_norm": 0.3586063831653754, "learning_rate": 1.558912797069745e-07, "loss": 0.0265, "step": 4181 }, { "epoch": 3.714031971580817, "grad_norm": 0.3405430171087667, "learning_rate": 1.5493251141770527e-07, "loss": 0.0333, "step": 4182 }, { "epoch": 3.7149200710479575, "grad_norm": 0.423317334564177, "learning_rate": 1.5397665415619655e-07, "loss": 0.038, "step": 4183 }, { "epoch": 3.7158081705150976, "grad_norm": 0.32880566440929193, "learning_rate": 1.5302370849674952e-07, "loss": 0.0309, "step": 4184 }, { "epoch": 3.716696269982238, "grad_norm": 0.34239075545054704, "learning_rate": 1.5207367501191617e-07, "loss": 0.0255, "step": 4185 }, { "epoch": 3.717584369449378, "grad_norm": 0.3189902999675348, "learning_rate": 1.5112655427249656e-07, "loss": 0.0261, "step": 4186 }, { "epoch": 3.7184724689165187, "grad_norm": 0.3956973296104195, "learning_rate": 1.501823468475444e-07, "loss": 0.031, "step": 4187 }, { "epoch": 3.719360568383659, "grad_norm": 0.41930525520246403, "learning_rate": 1.4924105330436034e-07, "loss": 0.0365, "step": 4188 }, { "epoch": 3.7202486678507993, "grad_norm": 0.3660565318001229, "learning_rate": 1.4830267420849587e-07, "loss": 0.027, "step": 4189 }, { "epoch": 3.7211367673179394, "grad_norm": 0.3434992461021154, "learning_rate": 1.4736721012375055e-07, "loss": 0.0339, "step": 4190 }, { "epoch": 3.72202486678508, "grad_norm": 0.3560428097284687, "learning_rate": 1.4643466161217145e-07, "loss": 0.032, "step": 4191 }, { "epoch": 3.7229129662522205, "grad_norm": 0.35961153697449205, "learning_rate": 1.4550502923405652e-07, "loss": 0.026, "step": 4192 }, { "epoch": 3.7238010657193605, "grad_norm": 0.34050757138628135, "learning_rate": 1.445783135479495e-07, "loss": 0.031, "step": 4193 }, { "epoch": 3.7246891651865006, "grad_norm": 0.3436294314699083, "learning_rate": 1.4365451511064276e-07, "loss": 0.0285, "step": 4194 }, { "epoch": 3.725577264653641, "grad_norm": 0.38277294080370794, "learning_rate": 1.427336344771757e-07, "loss": 0.0334, "step": 4195 }, { "epoch": 3.7264653641207817, "grad_norm": 0.31483558549985985, "learning_rate": 1.4181567220083403e-07, "loss": 0.0278, "step": 4196 }, { "epoch": 3.7273534635879217, "grad_norm": 0.36868641216833004, "learning_rate": 1.4090062883315103e-07, "loss": 0.0289, "step": 4197 }, { "epoch": 3.7282415630550623, "grad_norm": 0.3247370386901695, "learning_rate": 1.3998850492390637e-07, "loss": 0.0325, "step": 4198 }, { "epoch": 3.7291296625222023, "grad_norm": 0.34494670182583104, "learning_rate": 1.3907930102112334e-07, "loss": 0.0241, "step": 4199 }, { "epoch": 3.730017761989343, "grad_norm": 0.33992028442056216, "learning_rate": 1.3817301767107327e-07, "loss": 0.0275, "step": 4200 }, { "epoch": 3.730905861456483, "grad_norm": 0.3610905282697854, "learning_rate": 1.3726965541827287e-07, "loss": 0.0263, "step": 4201 }, { "epoch": 3.7317939609236235, "grad_norm": 0.32422628854444807, "learning_rate": 1.3636921480548239e-07, "loss": 0.0283, "step": 4202 }, { "epoch": 3.732682060390764, "grad_norm": 0.35359932676372424, "learning_rate": 1.3547169637370627e-07, "loss": 0.0318, "step": 4203 }, { "epoch": 3.733570159857904, "grad_norm": 0.4050953095439511, "learning_rate": 1.3457710066219542e-07, "loss": 0.031, "step": 4204 }, { "epoch": 3.734458259325044, "grad_norm": 0.419891533641244, "learning_rate": 1.336854282084432e-07, "loss": 0.0427, "step": 4205 }, { "epoch": 3.7353463587921847, "grad_norm": 0.3003440978143265, "learning_rate": 1.3279667954818664e-07, "loss": 0.0236, "step": 4206 }, { "epoch": 3.736234458259325, "grad_norm": 0.37555342930424257, "learning_rate": 1.3191085521540635e-07, "loss": 0.0277, "step": 4207 }, { "epoch": 3.7371225577264653, "grad_norm": 0.361372911496851, "learning_rate": 1.3102795574232607e-07, "loss": 0.0285, "step": 4208 }, { "epoch": 3.738010657193606, "grad_norm": 0.34710037215510453, "learning_rate": 1.3014798165941144e-07, "loss": 0.0349, "step": 4209 }, { "epoch": 3.738898756660746, "grad_norm": 0.37898887202273684, "learning_rate": 1.292709334953729e-07, "loss": 0.0305, "step": 4210 }, { "epoch": 3.7397868561278864, "grad_norm": 0.413910408702082, "learning_rate": 1.283968117771589e-07, "loss": 0.0324, "step": 4211 }, { "epoch": 3.7406749555950265, "grad_norm": 0.29718868611173865, "learning_rate": 1.275256170299627e-07, "loss": 0.0268, "step": 4212 }, { "epoch": 3.741563055062167, "grad_norm": 0.32730887641162354, "learning_rate": 1.266573497772189e-07, "loss": 0.0354, "step": 4213 }, { "epoch": 3.7424511545293075, "grad_norm": 0.35097057765889, "learning_rate": 1.257920105406013e-07, "loss": 0.0268, "step": 4214 }, { "epoch": 3.7433392539964476, "grad_norm": 0.38122809128474444, "learning_rate": 1.2492959984002573e-07, "loss": 0.0335, "step": 4215 }, { "epoch": 3.7442273534635877, "grad_norm": 0.4330875326163401, "learning_rate": 1.2407011819364878e-07, "loss": 0.0287, "step": 4216 }, { "epoch": 3.7451154529307282, "grad_norm": 0.3957769309250494, "learning_rate": 1.2321356611786627e-07, "loss": 0.0339, "step": 4217 }, { "epoch": 3.7460035523978688, "grad_norm": 0.4296555474055646, "learning_rate": 1.2235994412731543e-07, "loss": 0.038, "step": 4218 }, { "epoch": 3.746891651865009, "grad_norm": 0.3301880668188309, "learning_rate": 1.2150925273487045e-07, "loss": 0.0271, "step": 4219 }, { "epoch": 3.7477797513321494, "grad_norm": 0.4316727360217353, "learning_rate": 1.2066149245164692e-07, "loss": 0.0334, "step": 4220 }, { "epoch": 3.7486678507992894, "grad_norm": 0.3544806840083174, "learning_rate": 1.1981666378699908e-07, "loss": 0.0338, "step": 4221 }, { "epoch": 3.74955595026643, "grad_norm": 0.3617935118362104, "learning_rate": 1.189747672485192e-07, "loss": 0.0355, "step": 4222 }, { "epoch": 3.75044404973357, "grad_norm": 0.39600797799712956, "learning_rate": 1.1813580334203711e-07, "loss": 0.0313, "step": 4223 }, { "epoch": 3.7513321492007106, "grad_norm": 0.37376421002392135, "learning_rate": 1.172997725716224e-07, "loss": 0.0353, "step": 4224 }, { "epoch": 3.7522202486678506, "grad_norm": 0.39413298598820423, "learning_rate": 1.1646667543958157e-07, "loss": 0.03, "step": 4225 }, { "epoch": 3.753108348134991, "grad_norm": 0.3137860846716177, "learning_rate": 1.1563651244645869e-07, "loss": 0.0255, "step": 4226 }, { "epoch": 3.7539964476021312, "grad_norm": 0.37435698529593536, "learning_rate": 1.148092840910342e-07, "loss": 0.0398, "step": 4227 }, { "epoch": 3.7548845470692718, "grad_norm": 0.3541195168548642, "learning_rate": 1.1398499087032666e-07, "loss": 0.0319, "step": 4228 }, { "epoch": 3.7557726465364123, "grad_norm": 0.3604044892364321, "learning_rate": 1.1316363327958879e-07, "loss": 0.0341, "step": 4229 }, { "epoch": 3.7566607460035524, "grad_norm": 0.4187936879223956, "learning_rate": 1.1234521181231306e-07, "loss": 0.0406, "step": 4230 }, { "epoch": 3.7575488454706925, "grad_norm": 0.32822177029173305, "learning_rate": 1.1152972696022447e-07, "loss": 0.0314, "step": 4231 }, { "epoch": 3.758436944937833, "grad_norm": 0.35407604113573227, "learning_rate": 1.107171792132855e-07, "loss": 0.0307, "step": 4232 }, { "epoch": 3.7593250444049735, "grad_norm": 0.3546460353575967, "learning_rate": 1.0990756905969402e-07, "loss": 0.0291, "step": 4233 }, { "epoch": 3.7602131438721136, "grad_norm": 0.4577208780159834, "learning_rate": 1.0910089698588145e-07, "loss": 0.0328, "step": 4234 }, { "epoch": 3.761101243339254, "grad_norm": 0.35986670260273723, "learning_rate": 1.0829716347651509e-07, "loss": 0.0274, "step": 4235 }, { "epoch": 3.761989342806394, "grad_norm": 0.29072196982265625, "learning_rate": 1.0749636901449701e-07, "loss": 0.0261, "step": 4236 }, { "epoch": 3.7628774422735347, "grad_norm": 0.37516091498542603, "learning_rate": 1.0669851408096233e-07, "loss": 0.0335, "step": 4237 }, { "epoch": 3.763765541740675, "grad_norm": 0.3523270057886884, "learning_rate": 1.0590359915528092e-07, "loss": 0.0331, "step": 4238 }, { "epoch": 3.7646536412078153, "grad_norm": 0.36768387999095853, "learning_rate": 1.0511162471505631e-07, "loss": 0.032, "step": 4239 }, { "epoch": 3.765541740674956, "grad_norm": 0.428925812305749, "learning_rate": 1.0432259123612399e-07, "loss": 0.0443, "step": 4240 }, { "epoch": 3.766429840142096, "grad_norm": 0.31788241076246565, "learning_rate": 1.035364991925536e-07, "loss": 0.0297, "step": 4241 }, { "epoch": 3.767317939609236, "grad_norm": 0.3583843501974432, "learning_rate": 1.0275334905664847e-07, "loss": 0.0253, "step": 4242 }, { "epoch": 3.7682060390763765, "grad_norm": 0.30819284282278403, "learning_rate": 1.0197314129894109e-07, "loss": 0.0256, "step": 4243 }, { "epoch": 3.769094138543517, "grad_norm": 0.35591547028474246, "learning_rate": 1.0119587638819983e-07, "loss": 0.0304, "step": 4244 }, { "epoch": 3.769982238010657, "grad_norm": 0.3476573908870843, "learning_rate": 1.0042155479142335e-07, "loss": 0.0316, "step": 4245 }, { "epoch": 3.7708703374777977, "grad_norm": 0.3897359438223438, "learning_rate": 9.965017697384116e-08, "loss": 0.0279, "step": 4246 }, { "epoch": 3.7717584369449377, "grad_norm": 0.3523787578693443, "learning_rate": 9.888174339891421e-08, "loss": 0.0312, "step": 4247 }, { "epoch": 3.7726465364120783, "grad_norm": 0.5531277406393773, "learning_rate": 9.811625452833651e-08, "loss": 0.0409, "step": 4248 }, { "epoch": 3.7735346358792183, "grad_norm": 0.381447451834388, "learning_rate": 9.73537108220296e-08, "loss": 0.0369, "step": 4249 }, { "epoch": 3.774422735346359, "grad_norm": 0.43838551605035114, "learning_rate": 9.659411273814922e-08, "loss": 0.0241, "step": 4250 }, { "epoch": 3.7753108348134994, "grad_norm": 0.3433131363492874, "learning_rate": 9.583746073307809e-08, "loss": 0.029, "step": 4251 }, { "epoch": 3.7761989342806395, "grad_norm": 0.4028789491262155, "learning_rate": 9.508375526142976e-08, "loss": 0.0254, "step": 4252 }, { "epoch": 3.7770870337477795, "grad_norm": 0.4384693436592215, "learning_rate": 9.433299677604868e-08, "loss": 0.029, "step": 4253 }, { "epoch": 3.77797513321492, "grad_norm": 0.35262305060561455, "learning_rate": 9.35851857280079e-08, "loss": 0.0368, "step": 4254 }, { "epoch": 3.7788632326820606, "grad_norm": 0.39423915341441945, "learning_rate": 9.284032256660857e-08, "loss": 0.0327, "step": 4255 }, { "epoch": 3.7797513321492007, "grad_norm": 0.3333976254416016, "learning_rate": 9.209840773938161e-08, "loss": 0.0304, "step": 4256 }, { "epoch": 3.780639431616341, "grad_norm": 0.3813482214944881, "learning_rate": 9.135944169208766e-08, "loss": 0.0363, "step": 4257 }, { "epoch": 3.7815275310834813, "grad_norm": 0.33686074845626796, "learning_rate": 9.062342486871267e-08, "loss": 0.0275, "step": 4258 }, { "epoch": 3.782415630550622, "grad_norm": 0.32966389403848834, "learning_rate": 8.989035771147458e-08, "loss": 0.0271, "step": 4259 }, { "epoch": 3.783303730017762, "grad_norm": 0.37048752323192036, "learning_rate": 8.916024066081552e-08, "loss": 0.0385, "step": 4260 }, { "epoch": 3.7841918294849024, "grad_norm": 0.3761651368578672, "learning_rate": 8.84330741554068e-08, "loss": 0.0312, "step": 4261 }, { "epoch": 3.7850799289520425, "grad_norm": 0.2769064763216777, "learning_rate": 8.770885863214729e-08, "loss": 0.0258, "step": 4262 }, { "epoch": 3.785968028419183, "grad_norm": 0.3477019498066257, "learning_rate": 8.698759452616112e-08, "loss": 0.0313, "step": 4263 }, { "epoch": 3.786856127886323, "grad_norm": 0.35835074590053034, "learning_rate": 8.626928227080167e-08, "loss": 0.0311, "step": 4264 }, { "epoch": 3.7877442273534636, "grad_norm": 0.37515477929002006, "learning_rate": 8.555392229764592e-08, "loss": 0.0376, "step": 4265 }, { "epoch": 3.788632326820604, "grad_norm": 0.36780112941468734, "learning_rate": 8.484151503650062e-08, "loss": 0.0275, "step": 4266 }, { "epoch": 3.789520426287744, "grad_norm": 0.30232429359883534, "learning_rate": 8.413206091539505e-08, "loss": 0.0236, "step": 4267 }, { "epoch": 3.7904085257548843, "grad_norm": 0.3621536801089417, "learning_rate": 8.342556036058492e-08, "loss": 0.0364, "step": 4268 }, { "epoch": 3.791296625222025, "grad_norm": 0.33754912437109746, "learning_rate": 8.272201379655398e-08, "loss": 0.0334, "step": 4269 }, { "epoch": 3.7921847246891653, "grad_norm": 0.378479374416979, "learning_rate": 8.20214216460069e-08, "loss": 0.0301, "step": 4270 }, { "epoch": 3.7930728241563054, "grad_norm": 0.39100241252671175, "learning_rate": 8.132378432987753e-08, "loss": 0.0337, "step": 4271 }, { "epoch": 3.793960923623446, "grad_norm": 0.39194682078387527, "learning_rate": 8.062910226732112e-08, "loss": 0.0383, "step": 4272 }, { "epoch": 3.794849023090586, "grad_norm": 0.34143733677569477, "learning_rate": 7.993737587571825e-08, "loss": 0.0303, "step": 4273 }, { "epoch": 3.7957371225577266, "grad_norm": 0.36064013407949275, "learning_rate": 7.924860557067593e-08, "loss": 0.0313, "step": 4274 }, { "epoch": 3.7966252220248666, "grad_norm": 0.4843753019752317, "learning_rate": 7.856279176602144e-08, "loss": 0.0365, "step": 4275 }, { "epoch": 3.797513321492007, "grad_norm": 0.3560252848603473, "learning_rate": 7.78799348738074e-08, "loss": 0.0295, "step": 4276 }, { "epoch": 3.7984014209591477, "grad_norm": 0.38564615796744883, "learning_rate": 7.72000353043112e-08, "loss": 0.0326, "step": 4277 }, { "epoch": 3.7992895204262878, "grad_norm": 0.303037033637724, "learning_rate": 7.652309346603048e-08, "loss": 0.0232, "step": 4278 }, { "epoch": 3.800177619893428, "grad_norm": 0.332761121905067, "learning_rate": 7.584910976568937e-08, "loss": 0.0221, "step": 4279 }, { "epoch": 3.8010657193605684, "grad_norm": 0.34664824674177536, "learning_rate": 7.517808460823117e-08, "loss": 0.0267, "step": 4280 }, { "epoch": 3.801953818827709, "grad_norm": 0.3414872510743276, "learning_rate": 7.451001839682336e-08, "loss": 0.0293, "step": 4281 }, { "epoch": 3.802841918294849, "grad_norm": 0.31486452720736763, "learning_rate": 7.3844911532856e-08, "loss": 0.0267, "step": 4282 }, { "epoch": 3.8037300177619895, "grad_norm": 0.35659179633742877, "learning_rate": 7.318276441594108e-08, "loss": 0.0271, "step": 4283 }, { "epoch": 3.8046181172291296, "grad_norm": 0.34424266531677655, "learning_rate": 7.252357744391036e-08, "loss": 0.0245, "step": 4284 }, { "epoch": 3.80550621669627, "grad_norm": 0.42009285441865224, "learning_rate": 7.186735101281928e-08, "loss": 0.0275, "step": 4285 }, { "epoch": 3.80639431616341, "grad_norm": 0.3910495428886012, "learning_rate": 7.121408551694408e-08, "loss": 0.0334, "step": 4286 }, { "epoch": 3.8072824156305507, "grad_norm": 0.3887130690681448, "learning_rate": 7.056378134878139e-08, "loss": 0.0263, "step": 4287 }, { "epoch": 3.808170515097691, "grad_norm": 0.41585764515781903, "learning_rate": 6.991643889904865e-08, "loss": 0.0351, "step": 4288 }, { "epoch": 3.8090586145648313, "grad_norm": 0.36689636763640526, "learning_rate": 6.927205855668417e-08, "loss": 0.0355, "step": 4289 }, { "epoch": 3.8099467140319714, "grad_norm": 0.34791282762484455, "learning_rate": 6.863064070884607e-08, "loss": 0.0373, "step": 4290 }, { "epoch": 3.810834813499112, "grad_norm": 0.3631896445310664, "learning_rate": 6.799218574091326e-08, "loss": 0.0307, "step": 4291 }, { "epoch": 3.8117229129662524, "grad_norm": 0.46817744999701366, "learning_rate": 6.735669403648503e-08, "loss": 0.0439, "step": 4292 }, { "epoch": 3.8126110124333925, "grad_norm": 0.38259950260942327, "learning_rate": 6.672416597737763e-08, "loss": 0.0283, "step": 4293 }, { "epoch": 3.8134991119005326, "grad_norm": 0.40337343124177544, "learning_rate": 6.609460194362927e-08, "loss": 0.0336, "step": 4294 }, { "epoch": 3.814387211367673, "grad_norm": 0.38495762258543054, "learning_rate": 6.546800231349793e-08, "loss": 0.033, "step": 4295 }, { "epoch": 3.8152753108348136, "grad_norm": 0.3290737448978435, "learning_rate": 6.484436746345634e-08, "loss": 0.0282, "step": 4296 }, { "epoch": 3.8161634103019537, "grad_norm": 0.3858915709196421, "learning_rate": 6.422369776820036e-08, "loss": 0.0346, "step": 4297 }, { "epoch": 3.8170515097690942, "grad_norm": 0.3420728360249852, "learning_rate": 6.360599360064224e-08, "loss": 0.0318, "step": 4298 }, { "epoch": 3.8179396092362343, "grad_norm": 0.41664185777198154, "learning_rate": 6.299125533191286e-08, "loss": 0.0281, "step": 4299 }, { "epoch": 3.818827708703375, "grad_norm": 0.3962704915837752, "learning_rate": 6.237948333136068e-08, "loss": 0.0282, "step": 4300 }, { "epoch": 3.819715808170515, "grad_norm": 0.4038464560650923, "learning_rate": 6.177067796655334e-08, "loss": 0.0335, "step": 4301 }, { "epoch": 3.8206039076376554, "grad_norm": 0.3730830701692366, "learning_rate": 6.11648396032738e-08, "loss": 0.0315, "step": 4302 }, { "epoch": 3.821492007104796, "grad_norm": 0.42568534133699615, "learning_rate": 6.056196860552421e-08, "loss": 0.0407, "step": 4303 }, { "epoch": 3.822380106571936, "grad_norm": 0.3748187361883993, "learning_rate": 5.996206533552373e-08, "loss": 0.0289, "step": 4304 }, { "epoch": 3.823268206039076, "grad_norm": 0.3366591717911857, "learning_rate": 5.9365130153707374e-08, "loss": 0.0316, "step": 4305 }, { "epoch": 3.8241563055062167, "grad_norm": 0.3237067178908839, "learning_rate": 5.8771163418727686e-08, "loss": 0.0262, "step": 4306 }, { "epoch": 3.825044404973357, "grad_norm": 0.3226604068159033, "learning_rate": 5.8180165487454754e-08, "loss": 0.0264, "step": 4307 }, { "epoch": 3.8259325044404973, "grad_norm": 0.47668452083716517, "learning_rate": 5.7592136714971214e-08, "loss": 0.0316, "step": 4308 }, { "epoch": 3.826820603907638, "grad_norm": 0.35405850227990326, "learning_rate": 5.700707745458001e-08, "loss": 0.0361, "step": 4309 }, { "epoch": 3.827708703374778, "grad_norm": 0.3483911527846554, "learning_rate": 5.6424988057797723e-08, "loss": 0.0282, "step": 4310 }, { "epoch": 3.8285968028419184, "grad_norm": 0.4185263789996629, "learning_rate": 5.584586887435739e-08, "loss": 0.0383, "step": 4311 }, { "epoch": 3.8294849023090585, "grad_norm": 0.319413368147179, "learning_rate": 5.5269720252206225e-08, "loss": 0.0258, "step": 4312 }, { "epoch": 3.830373001776199, "grad_norm": 0.3460427540781141, "learning_rate": 5.4696542537509e-08, "loss": 0.0284, "step": 4313 }, { "epoch": 3.8312611012433395, "grad_norm": 0.28556441758858203, "learning_rate": 5.4126336074641905e-08, "loss": 0.0247, "step": 4314 }, { "epoch": 3.8321492007104796, "grad_norm": 0.3348374123329273, "learning_rate": 5.3559101206200337e-08, "loss": 0.0297, "step": 4315 }, { "epoch": 3.8330373001776197, "grad_norm": 0.3766724819392865, "learning_rate": 5.2994838272990567e-08, "loss": 0.0339, "step": 4316 }, { "epoch": 3.83392539964476, "grad_norm": 0.3614763475685039, "learning_rate": 5.2433547614035296e-08, "loss": 0.0292, "step": 4317 }, { "epoch": 3.8348134991119007, "grad_norm": 0.3476340792098871, "learning_rate": 5.1875229566570316e-08, "loss": 0.0285, "step": 4318 }, { "epoch": 3.835701598579041, "grad_norm": 0.35206208433206104, "learning_rate": 5.131988446604674e-08, "loss": 0.0288, "step": 4319 }, { "epoch": 3.8365896980461813, "grad_norm": 0.35235509559553985, "learning_rate": 5.076751264612767e-08, "loss": 0.0336, "step": 4320 }, { "epoch": 3.8374777975133214, "grad_norm": 0.3387059741626194, "learning_rate": 5.0218114438692065e-08, "loss": 0.0272, "step": 4321 }, { "epoch": 3.838365896980462, "grad_norm": 0.2940058924932474, "learning_rate": 4.967169017382978e-08, "loss": 0.0233, "step": 4322 }, { "epoch": 3.839253996447602, "grad_norm": 0.5090182431358794, "learning_rate": 4.912824017984541e-08, "loss": 0.0327, "step": 4323 }, { "epoch": 3.8401420959147425, "grad_norm": 0.4579340325310792, "learning_rate": 4.858776478325666e-08, "loss": 0.0382, "step": 4324 }, { "epoch": 3.8410301953818826, "grad_norm": 0.38268228889926165, "learning_rate": 4.805026430879267e-08, "loss": 0.0291, "step": 4325 }, { "epoch": 3.841918294849023, "grad_norm": 0.343892329770722, "learning_rate": 4.751573907939677e-08, "loss": 0.0311, "step": 4326 }, { "epoch": 3.842806394316163, "grad_norm": 0.40706138301457534, "learning_rate": 4.6984189416223735e-08, "loss": 0.0314, "step": 4327 }, { "epoch": 3.8436944937833037, "grad_norm": 0.3111134558120298, "learning_rate": 4.6455615638640875e-08, "loss": 0.023, "step": 4328 }, { "epoch": 3.8445825932504443, "grad_norm": 0.3477045472139336, "learning_rate": 4.593001806422748e-08, "loss": 0.0244, "step": 4329 }, { "epoch": 3.8454706927175843, "grad_norm": 0.33859044332690635, "learning_rate": 4.540739700877483e-08, "loss": 0.0328, "step": 4330 }, { "epoch": 3.8463587921847244, "grad_norm": 0.3691228667767026, "learning_rate": 4.48877527862851e-08, "loss": 0.0325, "step": 4331 }, { "epoch": 3.847246891651865, "grad_norm": 0.3588167745313349, "learning_rate": 4.4371085708972974e-08, "loss": 0.0267, "step": 4332 }, { "epoch": 3.8481349911190055, "grad_norm": 0.32927581237235615, "learning_rate": 4.385739608726347e-08, "loss": 0.0319, "step": 4333 }, { "epoch": 3.8490230905861456, "grad_norm": 0.29159483128634467, "learning_rate": 4.334668422979305e-08, "loss": 0.0299, "step": 4334 }, { "epoch": 3.849911190053286, "grad_norm": 0.3312135156060222, "learning_rate": 4.283895044340958e-08, "loss": 0.0245, "step": 4335 }, { "epoch": 3.850799289520426, "grad_norm": 0.2868713577117152, "learning_rate": 4.233419503317182e-08, "loss": 0.0286, "step": 4336 }, { "epoch": 3.8516873889875667, "grad_norm": 0.3475136235984561, "learning_rate": 4.183241830234663e-08, "loss": 0.0322, "step": 4337 }, { "epoch": 3.8525754884547068, "grad_norm": 0.3937627490986881, "learning_rate": 4.1333620552413967e-08, "loss": 0.0271, "step": 4338 }, { "epoch": 3.8534635879218473, "grad_norm": 0.40467753769294995, "learning_rate": 4.0837802083062984e-08, "loss": 0.0334, "step": 4339 }, { "epoch": 3.854351687388988, "grad_norm": 0.34675837935471, "learning_rate": 4.0344963192193165e-08, "loss": 0.0286, "step": 4340 }, { "epoch": 3.855239786856128, "grad_norm": 0.32529428338468314, "learning_rate": 3.985510417591265e-08, "loss": 0.0321, "step": 4341 }, { "epoch": 3.856127886323268, "grad_norm": 0.3724586734964784, "learning_rate": 3.936822532854046e-08, "loss": 0.0317, "step": 4342 }, { "epoch": 3.8570159857904085, "grad_norm": 0.34911932378498656, "learning_rate": 3.888432694260425e-08, "loss": 0.0312, "step": 4343 }, { "epoch": 3.857904085257549, "grad_norm": 0.36508793241837756, "learning_rate": 3.840340930884145e-08, "loss": 0.0372, "step": 4344 }, { "epoch": 3.858792184724689, "grad_norm": 0.42477103778186287, "learning_rate": 3.792547271619929e-08, "loss": 0.0405, "step": 4345 }, { "epoch": 3.8596802841918296, "grad_norm": 0.3502306374820717, "learning_rate": 3.745051745183137e-08, "loss": 0.0293, "step": 4346 }, { "epoch": 3.8605683836589697, "grad_norm": 0.3576034002722447, "learning_rate": 3.697854380110277e-08, "loss": 0.0308, "step": 4347 }, { "epoch": 3.8614564831261102, "grad_norm": 0.3663533569527988, "learning_rate": 3.65095520475861e-08, "loss": 0.0292, "step": 4348 }, { "epoch": 3.8623445825932503, "grad_norm": 0.37016066414991033, "learning_rate": 3.604354247306152e-08, "loss": 0.0368, "step": 4349 }, { "epoch": 3.863232682060391, "grad_norm": 0.3462057549727546, "learning_rate": 3.558051535751894e-08, "loss": 0.0294, "step": 4350 }, { "epoch": 3.8641207815275314, "grad_norm": 0.3872169706759806, "learning_rate": 3.5120470979156386e-08, "loss": 0.0418, "step": 4351 }, { "epoch": 3.8650088809946714, "grad_norm": 0.33307226944359014, "learning_rate": 3.466340961437775e-08, "loss": 0.0282, "step": 4352 }, { "epoch": 3.8658969804618115, "grad_norm": 0.3531028159774626, "learning_rate": 3.420933153779671e-08, "loss": 0.0304, "step": 4353 }, { "epoch": 3.866785079928952, "grad_norm": 0.3712953348328917, "learning_rate": 3.375823702223391e-08, "loss": 0.0341, "step": 4354 }, { "epoch": 3.8676731793960926, "grad_norm": 0.5930361349122664, "learning_rate": 3.3310126338716444e-08, "loss": 0.0303, "step": 4355 }, { "epoch": 3.8685612788632326, "grad_norm": 0.3366657561722301, "learning_rate": 3.286499975648061e-08, "loss": 0.0314, "step": 4356 }, { "epoch": 3.869449378330373, "grad_norm": 0.30761576587876494, "learning_rate": 3.242285754296859e-08, "loss": 0.0259, "step": 4357 }, { "epoch": 3.8703374777975132, "grad_norm": 0.4057322839382098, "learning_rate": 3.198369996382844e-08, "loss": 0.035, "step": 4358 }, { "epoch": 3.8712255772646538, "grad_norm": 0.38516550750614265, "learning_rate": 3.1547527282916876e-08, "loss": 0.0285, "step": 4359 }, { "epoch": 3.872113676731794, "grad_norm": 0.3726666679963141, "learning_rate": 3.1114339762296495e-08, "loss": 0.0318, "step": 4360 }, { "epoch": 3.8730017761989344, "grad_norm": 0.45217090450614816, "learning_rate": 3.0684137662236324e-08, "loss": 0.0375, "step": 4361 }, { "epoch": 3.8738898756660745, "grad_norm": 0.3690454521221677, "learning_rate": 3.0256921241211824e-08, "loss": 0.0316, "step": 4362 }, { "epoch": 3.874777975133215, "grad_norm": 0.30883468045000373, "learning_rate": 2.983269075590323e-08, "loss": 0.0251, "step": 4363 }, { "epoch": 3.875666074600355, "grad_norm": 0.3133306119098753, "learning_rate": 2.9411446461199978e-08, "loss": 0.0281, "step": 4364 }, { "epoch": 3.8765541740674956, "grad_norm": 0.6609166358769407, "learning_rate": 2.8993188610193513e-08, "loss": 0.0376, "step": 4365 }, { "epoch": 3.877442273534636, "grad_norm": 0.3364655775257454, "learning_rate": 2.8577917454183367e-08, "loss": 0.0375, "step": 4366 }, { "epoch": 3.878330373001776, "grad_norm": 0.32332908013329004, "learning_rate": 2.8165633242674408e-08, "loss": 0.0255, "step": 4367 }, { "epoch": 3.8792184724689163, "grad_norm": 0.3374522564042292, "learning_rate": 2.7756336223375702e-08, "loss": 0.0297, "step": 4368 }, { "epoch": 3.880106571936057, "grad_norm": 0.3223234211010064, "learning_rate": 2.735002664220332e-08, "loss": 0.0303, "step": 4369 }, { "epoch": 3.8809946714031973, "grad_norm": 0.32260140969535633, "learning_rate": 2.6946704743276432e-08, "loss": 0.0266, "step": 4370 }, { "epoch": 3.8818827708703374, "grad_norm": 0.3489766999359135, "learning_rate": 2.6546370768920638e-08, "loss": 0.0262, "step": 4371 }, { "epoch": 3.882770870337478, "grad_norm": 0.4192868948264376, "learning_rate": 2.6149024959665204e-08, "loss": 0.031, "step": 4372 }, { "epoch": 3.883658969804618, "grad_norm": 0.3657141947965113, "learning_rate": 2.5754667554244717e-08, "loss": 0.0348, "step": 4373 }, { "epoch": 3.8845470692717585, "grad_norm": 0.4307137443150828, "learning_rate": 2.536329878959909e-08, "loss": 0.0369, "step": 4374 }, { "epoch": 3.8854351687388986, "grad_norm": 0.35289049259198735, "learning_rate": 2.4974918900870782e-08, "loss": 0.0292, "step": 4375 }, { "epoch": 3.886323268206039, "grad_norm": 0.32992230436377096, "learning_rate": 2.458952812140758e-08, "loss": 0.0237, "step": 4376 }, { "epoch": 3.8872113676731797, "grad_norm": 0.30266200036111696, "learning_rate": 2.420712668276093e-08, "loss": 0.0286, "step": 4377 }, { "epoch": 3.8880994671403197, "grad_norm": 0.3339250029881702, "learning_rate": 2.3827714814686488e-08, "loss": 0.028, "step": 4378 }, { "epoch": 3.88898756660746, "grad_norm": 0.4569844580828058, "learning_rate": 2.3451292745143572e-08, "loss": 0.0292, "step": 4379 }, { "epoch": 3.8898756660746003, "grad_norm": 0.36223606525550006, "learning_rate": 2.3077860700295717e-08, "loss": 0.0354, "step": 4380 }, { "epoch": 3.890763765541741, "grad_norm": 0.33860378609473823, "learning_rate": 2.2707418904509004e-08, "loss": 0.035, "step": 4381 }, { "epoch": 3.891651865008881, "grad_norm": 0.34459530016921563, "learning_rate": 2.2339967580353173e-08, "loss": 0.0312, "step": 4382 }, { "epoch": 3.8925399644760215, "grad_norm": 0.3718421787480263, "learning_rate": 2.197550694860162e-08, "loss": 0.031, "step": 4383 }, { "epoch": 3.8934280639431615, "grad_norm": 0.3983433406484029, "learning_rate": 2.1614037228230857e-08, "loss": 0.0428, "step": 4384 }, { "epoch": 3.894316163410302, "grad_norm": 0.35969146510568784, "learning_rate": 2.1255558636419925e-08, "loss": 0.0317, "step": 4385 }, { "epoch": 3.895204262877442, "grad_norm": 0.32795656581532545, "learning_rate": 2.090007138855099e-08, "loss": 0.0318, "step": 4386 }, { "epoch": 3.8960923623445827, "grad_norm": 0.42925324451322316, "learning_rate": 2.054757569820931e-08, "loss": 0.0264, "step": 4387 }, { "epoch": 3.8969804618117228, "grad_norm": 0.4052490784414431, "learning_rate": 2.0198071777182136e-08, "loss": 0.0468, "step": 4388 }, { "epoch": 3.8978685612788633, "grad_norm": 0.3603324555012287, "learning_rate": 1.9851559835459277e-08, "loss": 0.0317, "step": 4389 }, { "epoch": 3.8987566607460034, "grad_norm": 0.3210421385639071, "learning_rate": 1.950804008123308e-08, "loss": 0.0249, "step": 4390 }, { "epoch": 3.899644760213144, "grad_norm": 0.31113861637791496, "learning_rate": 1.9167512720897896e-08, "loss": 0.0272, "step": 4391 }, { "epoch": 3.9005328596802844, "grad_norm": 0.3121746661598503, "learning_rate": 1.8829977959051728e-08, "loss": 0.025, "step": 4392 }, { "epoch": 3.9014209591474245, "grad_norm": 0.3631011849664124, "learning_rate": 1.8495435998491796e-08, "loss": 0.0283, "step": 4393 }, { "epoch": 3.9023090586145646, "grad_norm": 0.37966032224972057, "learning_rate": 1.816388704021843e-08, "loss": 0.0361, "step": 4394 }, { "epoch": 3.903197158081705, "grad_norm": 0.2910539720829746, "learning_rate": 1.7835331283435065e-08, "loss": 0.0216, "step": 4395 }, { "epoch": 3.9040852575488456, "grad_norm": 0.41715398574395457, "learning_rate": 1.7509768925544902e-08, "loss": 0.0363, "step": 4396 }, { "epoch": 3.9049733570159857, "grad_norm": 0.4393386937039567, "learning_rate": 1.718720016215314e-08, "loss": 0.0388, "step": 4397 }, { "epoch": 3.905861456483126, "grad_norm": 0.3908960542013512, "learning_rate": 1.6867625187066418e-08, "loss": 0.0306, "step": 4398 }, { "epoch": 3.9067495559502663, "grad_norm": 0.30938149095981177, "learning_rate": 1.655104419229281e-08, "loss": 0.0263, "step": 4399 }, { "epoch": 3.907637655417407, "grad_norm": 0.36052732524194336, "learning_rate": 1.6237457368041833e-08, "loss": 0.033, "step": 4400 }, { "epoch": 3.908525754884547, "grad_norm": 0.31285339372140775, "learning_rate": 1.5926864902723882e-08, "loss": 0.0251, "step": 4401 }, { "epoch": 3.9094138543516874, "grad_norm": 0.30362241125417555, "learning_rate": 1.5619266982948576e-08, "loss": 0.0246, "step": 4402 }, { "epoch": 3.910301953818828, "grad_norm": 0.40522883397041964, "learning_rate": 1.5314663793529195e-08, "loss": 0.0399, "step": 4403 }, { "epoch": 3.911190053285968, "grad_norm": 0.33940970253321623, "learning_rate": 1.5013055517478227e-08, "loss": 0.0241, "step": 4404 }, { "epoch": 3.912078152753108, "grad_norm": 0.6869631004896491, "learning_rate": 1.4714442336007939e-08, "loss": 0.0265, "step": 4405 }, { "epoch": 3.9129662522202486, "grad_norm": 0.3479602430347928, "learning_rate": 1.4418824428533152e-08, "loss": 0.0279, "step": 4406 }, { "epoch": 3.913854351687389, "grad_norm": 0.35189230000307964, "learning_rate": 1.4126201972666786e-08, "loss": 0.032, "step": 4407 }, { "epoch": 3.9147424511545292, "grad_norm": 0.3374428462147622, "learning_rate": 1.3836575144223763e-08, "loss": 0.0331, "step": 4408 }, { "epoch": 3.9156305506216698, "grad_norm": 0.3932222565628268, "learning_rate": 1.3549944117218771e-08, "loss": 0.0303, "step": 4409 }, { "epoch": 3.91651865008881, "grad_norm": 0.3905548643151072, "learning_rate": 1.3266309063865724e-08, "loss": 0.0373, "step": 4410 }, { "epoch": 3.9174067495559504, "grad_norm": 0.3581931365729894, "learning_rate": 1.2985670154578856e-08, "loss": 0.0368, "step": 4411 }, { "epoch": 3.9182948490230904, "grad_norm": 0.3455849993181419, "learning_rate": 1.270802755797329e-08, "loss": 0.0298, "step": 4412 }, { "epoch": 3.919182948490231, "grad_norm": 0.34729808591163475, "learning_rate": 1.2433381440862814e-08, "loss": 0.0336, "step": 4413 }, { "epoch": 3.9200710479573715, "grad_norm": 0.34014520671272247, "learning_rate": 1.2161731968260981e-08, "loss": 0.0238, "step": 4414 }, { "epoch": 3.9209591474245116, "grad_norm": 0.36396393616068506, "learning_rate": 1.1893079303381127e-08, "loss": 0.0283, "step": 4415 }, { "epoch": 3.9218472468916517, "grad_norm": 0.4075439359104293, "learning_rate": 1.1627423607635246e-08, "loss": 0.037, "step": 4416 }, { "epoch": 3.922735346358792, "grad_norm": 0.38375575558697966, "learning_rate": 1.1364765040636772e-08, "loss": 0.035, "step": 4417 }, { "epoch": 3.9236234458259327, "grad_norm": 0.3555601567631089, "learning_rate": 1.1105103760195578e-08, "loss": 0.0342, "step": 4418 }, { "epoch": 3.924511545293073, "grad_norm": 0.3779977661952045, "learning_rate": 1.0848439922322984e-08, "loss": 0.0386, "step": 4419 }, { "epoch": 3.9253996447602133, "grad_norm": 0.3746265004100518, "learning_rate": 1.059477368122841e-08, "loss": 0.031, "step": 4420 }, { "epoch": 3.9262877442273534, "grad_norm": 0.4093351345110092, "learning_rate": 1.0344105189320496e-08, "loss": 0.0327, "step": 4421 }, { "epoch": 3.927175843694494, "grad_norm": 0.36839518559894463, "learning_rate": 1.0096434597205995e-08, "loss": 0.0349, "step": 4422 }, { "epoch": 3.928063943161634, "grad_norm": 0.357834203515738, "learning_rate": 9.851762053691426e-09, "loss": 0.0251, "step": 4423 }, { "epoch": 3.9289520426287745, "grad_norm": 0.42674085134064105, "learning_rate": 9.610087705781979e-09, "loss": 0.0305, "step": 4424 }, { "epoch": 3.9298401420959146, "grad_norm": 0.4328029975863711, "learning_rate": 9.371411698680388e-09, "loss": 0.0339, "step": 4425 }, { "epoch": 3.930728241563055, "grad_norm": 0.37474273469732067, "learning_rate": 9.135734175789723e-09, "loss": 0.031, "step": 4426 }, { "epoch": 3.931616341030195, "grad_norm": 0.3799716585454093, "learning_rate": 8.903055278709494e-09, "loss": 0.0409, "step": 4427 }, { "epoch": 3.9325044404973357, "grad_norm": 0.4941566703547101, "learning_rate": 8.673375147238983e-09, "loss": 0.0348, "step": 4428 }, { "epoch": 3.9333925399644762, "grad_norm": 0.3051086264307757, "learning_rate": 8.446693919375026e-09, "loss": 0.0284, "step": 4429 }, { "epoch": 3.9342806394316163, "grad_norm": 0.35148664081966585, "learning_rate": 8.223011731313679e-09, "loss": 0.0234, "step": 4430 }, { "epoch": 3.9351687388987564, "grad_norm": 0.46463968792863114, "learning_rate": 8.002328717447438e-09, "loss": 0.0331, "step": 4431 }, { "epoch": 3.936056838365897, "grad_norm": 0.3388242083345385, "learning_rate": 7.78464501036802e-09, "loss": 0.0285, "step": 4432 }, { "epoch": 3.9369449378330375, "grad_norm": 0.4574281105885283, "learning_rate": 7.569960740864691e-09, "loss": 0.0437, "step": 4433 }, { "epoch": 3.9378330373001775, "grad_norm": 0.38361101761730315, "learning_rate": 7.358276037924827e-09, "loss": 0.0357, "step": 4434 }, { "epoch": 3.938721136767318, "grad_norm": 0.46578909974309685, "learning_rate": 7.14959102873336e-09, "loss": 0.0284, "step": 4435 }, { "epoch": 3.939609236234458, "grad_norm": 0.3574257582077392, "learning_rate": 6.943905838673881e-09, "loss": 0.0351, "step": 4436 }, { "epoch": 3.9404973357015987, "grad_norm": 0.363374991490502, "learning_rate": 6.7412205913253145e-09, "loss": 0.0259, "step": 4437 }, { "epoch": 3.9413854351687387, "grad_norm": 0.31613225253795635, "learning_rate": 6.54153540846636e-09, "loss": 0.0305, "step": 4438 }, { "epoch": 3.9422735346358793, "grad_norm": 0.34656321322412253, "learning_rate": 6.344850410072157e-09, "loss": 0.0238, "step": 4439 }, { "epoch": 3.94316163410302, "grad_norm": 0.3123946843459642, "learning_rate": 6.151165714316509e-09, "loss": 0.0255, "step": 4440 }, { "epoch": 3.94404973357016, "grad_norm": 0.36304887085285237, "learning_rate": 5.9604814375685546e-09, "loss": 0.0261, "step": 4441 }, { "epoch": 3.9449378330373, "grad_norm": 0.334897253182668, "learning_rate": 5.772797694396093e-09, "loss": 0.0275, "step": 4442 }, { "epoch": 3.9458259325044405, "grad_norm": 0.3136357507978124, "learning_rate": 5.58811459756392e-09, "loss": 0.0304, "step": 4443 }, { "epoch": 3.946714031971581, "grad_norm": 0.4593259847309228, "learning_rate": 5.406432258033834e-09, "loss": 0.0321, "step": 4444 }, { "epoch": 3.947602131438721, "grad_norm": 0.40217183419243724, "learning_rate": 5.227750784964625e-09, "loss": 0.0366, "step": 4445 }, { "epoch": 3.9484902309058616, "grad_norm": 0.34898375985369834, "learning_rate": 5.05207028571264e-09, "loss": 0.0341, "step": 4446 }, { "epoch": 3.9493783303730017, "grad_norm": 0.39904753677337107, "learning_rate": 4.8793908658306685e-09, "loss": 0.0341, "step": 4447 }, { "epoch": 3.950266429840142, "grad_norm": 0.3553762719573451, "learning_rate": 4.709712629067942e-09, "loss": 0.0286, "step": 4448 }, { "epoch": 3.9511545293072823, "grad_norm": 0.35527412309700473, "learning_rate": 4.543035677371244e-09, "loss": 0.0301, "step": 4449 }, { "epoch": 3.952042628774423, "grad_norm": 0.39552273161631557, "learning_rate": 4.3793601108843565e-09, "loss": 0.0368, "step": 4450 }, { "epoch": 3.9529307282415633, "grad_norm": 0.37130737779018136, "learning_rate": 4.21868602794695e-09, "loss": 0.031, "step": 4451 }, { "epoch": 3.9538188277087034, "grad_norm": 0.3440514548923429, "learning_rate": 4.061013525095692e-09, "loss": 0.0303, "step": 4452 }, { "epoch": 3.9547069271758435, "grad_norm": 0.35033498305532507, "learning_rate": 3.906342697063692e-09, "loss": 0.0337, "step": 4453 }, { "epoch": 3.955595026642984, "grad_norm": 0.30750918842297564, "learning_rate": 3.754673636781614e-09, "loss": 0.0276, "step": 4454 }, { "epoch": 3.9564831261101245, "grad_norm": 0.33644798243597185, "learning_rate": 3.606006435374343e-09, "loss": 0.0304, "step": 4455 }, { "epoch": 3.9573712255772646, "grad_norm": 0.39809286248444264, "learning_rate": 3.4603411821654277e-09, "loss": 0.0326, "step": 4456 }, { "epoch": 3.9582593250444047, "grad_norm": 0.38058939879062353, "learning_rate": 3.317677964674304e-09, "loss": 0.029, "step": 4457 }, { "epoch": 3.959147424511545, "grad_norm": 0.4372617290747728, "learning_rate": 3.1780168686151856e-09, "loss": 0.0336, "step": 4458 }, { "epoch": 3.9600355239786857, "grad_norm": 0.3848830082973257, "learning_rate": 3.0413579779003944e-09, "loss": 0.0306, "step": 4459 }, { "epoch": 3.960923623445826, "grad_norm": 0.41395429123456157, "learning_rate": 2.9077013746370284e-09, "loss": 0.0458, "step": 4460 }, { "epoch": 3.9618117229129663, "grad_norm": 0.3645564211377606, "learning_rate": 2.7770471391302954e-09, "loss": 0.0298, "step": 4461 }, { "epoch": 3.9626998223801064, "grad_norm": 0.3398971948514898, "learning_rate": 2.6493953498790692e-09, "loss": 0.0246, "step": 4462 }, { "epoch": 3.963587921847247, "grad_norm": 0.38068185224854645, "learning_rate": 2.5247460835803317e-09, "loss": 0.0276, "step": 4463 }, { "epoch": 3.964476021314387, "grad_norm": 0.44423873753371224, "learning_rate": 2.4030994151252873e-09, "loss": 0.0275, "step": 4464 }, { "epoch": 3.9653641207815276, "grad_norm": 0.3828132649398428, "learning_rate": 2.2844554176026933e-09, "loss": 0.0325, "step": 4465 }, { "epoch": 3.966252220248668, "grad_norm": 0.38784021300605276, "learning_rate": 2.1688141622966397e-09, "loss": 0.037, "step": 4466 }, { "epoch": 3.967140319715808, "grad_norm": 0.30232065076110426, "learning_rate": 2.0561757186871033e-09, "loss": 0.0248, "step": 4467 }, { "epoch": 3.9680284191829482, "grad_norm": 0.3393350555374966, "learning_rate": 1.9465401544493944e-09, "loss": 0.0252, "step": 4468 }, { "epoch": 3.9689165186500888, "grad_norm": 0.37312630808588526, "learning_rate": 1.8399075354552653e-09, "loss": 0.0325, "step": 4469 }, { "epoch": 3.9698046181172293, "grad_norm": 0.3460099167477063, "learning_rate": 1.736277925771801e-09, "loss": 0.0305, "step": 4470 }, { "epoch": 3.9706927175843694, "grad_norm": 0.3152575554068713, "learning_rate": 1.635651387662529e-09, "loss": 0.038, "step": 4471 }, { "epoch": 3.97158081705151, "grad_norm": 0.3338993687005216, "learning_rate": 1.5380279815863097e-09, "loss": 0.0255, "step": 4472 }, { "epoch": 3.97246891651865, "grad_norm": 0.40719111799698354, "learning_rate": 1.4434077661967804e-09, "loss": 0.0402, "step": 4473 }, { "epoch": 3.9733570159857905, "grad_norm": 0.296777642640809, "learning_rate": 1.3517907983445767e-09, "loss": 0.029, "step": 4474 }, { "epoch": 3.9742451154529306, "grad_norm": 0.5392442936919057, "learning_rate": 1.2631771330751107e-09, "loss": 0.0397, "step": 4475 }, { "epoch": 3.975133214920071, "grad_norm": 0.4056363313670081, "learning_rate": 1.1775668236291282e-09, "loss": 0.0312, "step": 4476 }, { "epoch": 3.9760213143872116, "grad_norm": 0.40720194454595754, "learning_rate": 1.0949599214438166e-09, "loss": 0.0366, "step": 4477 }, { "epoch": 3.9769094138543517, "grad_norm": 0.364250813844353, "learning_rate": 1.0153564761511414e-09, "loss": 0.0332, "step": 4478 }, { "epoch": 3.977797513321492, "grad_norm": 0.3556858387174976, "learning_rate": 9.387565355784e-10, "loss": 0.0246, "step": 4479 }, { "epoch": 3.9786856127886323, "grad_norm": 0.45165219318354977, "learning_rate": 8.651601457493331e-10, "loss": 0.0289, "step": 4480 }, { "epoch": 3.979573712255773, "grad_norm": 0.35139031002380433, "learning_rate": 7.945673508813479e-10, "loss": 0.0317, "step": 4481 }, { "epoch": 3.980461811722913, "grad_norm": 0.3442321499166264, "learning_rate": 7.269781933888498e-10, "loss": 0.0263, "step": 4482 }, { "epoch": 3.9813499111900534, "grad_norm": 0.3938383521002017, "learning_rate": 6.623927138804665e-10, "loss": 0.0277, "step": 4483 }, { "epoch": 3.9822380106571935, "grad_norm": 0.3487659679775154, "learning_rate": 6.008109511612681e-10, "loss": 0.0293, "step": 4484 }, { "epoch": 3.983126110124334, "grad_norm": 0.38471884813260165, "learning_rate": 5.422329422305472e-10, "loss": 0.0284, "step": 4485 }, { "epoch": 3.984014209591474, "grad_norm": 0.3897173591989827, "learning_rate": 4.86658722283484e-10, "loss": 0.0273, "step": 4486 }, { "epoch": 3.9849023090586146, "grad_norm": 0.30948987005245854, "learning_rate": 4.3408832471059094e-10, "loss": 0.0284, "step": 4487 }, { "epoch": 3.9857904085257547, "grad_norm": 0.3627150868494569, "learning_rate": 3.8452178109660285e-10, "loss": 0.0308, "step": 4488 }, { "epoch": 3.9866785079928952, "grad_norm": 0.4022849724346789, "learning_rate": 3.379591212226974e-10, "loss": 0.0351, "step": 4489 }, { "epoch": 3.9875666074600353, "grad_norm": 0.3631211766442342, "learning_rate": 2.944003730653844e-10, "loss": 0.0279, "step": 4490 }, { "epoch": 3.988454706927176, "grad_norm": 0.324546551530327, "learning_rate": 2.5384556279484107e-10, "loss": 0.0289, "step": 4491 }, { "epoch": 3.9893428063943164, "grad_norm": 0.3355581682470875, "learning_rate": 2.1629471477768727e-10, "loss": 0.0264, "step": 4492 }, { "epoch": 3.9902309058614565, "grad_norm": 0.32219069993034016, "learning_rate": 1.8174785157532017e-10, "loss": 0.0305, "step": 4493 }, { "epoch": 3.9911190053285965, "grad_norm": 0.34967548702004914, "learning_rate": 1.502049939444694e-10, "loss": 0.0251, "step": 4494 }, { "epoch": 3.992007104795737, "grad_norm": 0.36096755448800333, "learning_rate": 1.2166616083608695e-10, "loss": 0.0325, "step": 4495 }, { "epoch": 3.9928952042628776, "grad_norm": 0.5522564036143544, "learning_rate": 9.613136939812251e-11, "loss": 0.0366, "step": 4496 }, { "epoch": 3.9937833037300177, "grad_norm": 0.37215104990308784, "learning_rate": 7.360063497163783e-11, "loss": 0.0354, "step": 4497 }, { "epoch": 3.994671403197158, "grad_norm": 0.28028856136098346, "learning_rate": 5.4073971093582257e-11, "loss": 0.0266, "step": 4498 }, { "epoch": 3.9955595026642983, "grad_norm": 0.3228560504657294, "learning_rate": 3.755138949679271e-11, "loss": 0.0258, "step": 4499 }, { "epoch": 3.996447602131439, "grad_norm": 0.35030932548468985, "learning_rate": 2.403290010777326e-11, "loss": 0.0336, "step": 4500 }, { "epoch": 3.997335701598579, "grad_norm": 0.39703349791825004, "learning_rate": 1.3518511048360438e-11, "loss": 0.0327, "step": 4501 }, { "epoch": 3.9982238010657194, "grad_norm": 0.29928946564050385, "learning_rate": 6.00822863683348e-12, "loss": 0.0264, "step": 4502 }, { "epoch": 3.99911190053286, "grad_norm": 0.40616736854240193, "learning_rate": 1.502057384583644e-12, "loss": 0.0343, "step": 4503 }, { "epoch": 4.0, "grad_norm": 0.3392781532186694, "learning_rate": 0.0, "loss": 0.0238, "step": 4504 }, { "epoch": 4.0, "eval_loss": 0.04085389897227287, "eval_runtime": 77.9654, "eval_samples_per_second": 194.522, "eval_steps_per_second": 3.04, "step": 4504 }, { "epoch": 4.0, "step": 4504, "total_flos": 1010769106173952.0, "train_loss": 0.055861650320550765, "train_runtime": 18499.8374, "train_samples_per_second": 62.299, "train_steps_per_second": 0.243 } ], "logging_steps": 1, "max_steps": 4504, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1010769106173952.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }