{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9991117033089054, "eval_steps": 500, "global_step": 9004, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00044414834554741284, "grad_norm": 14.241565838210393, "learning_rate": 1.1098779134295229e-08, "loss": 0.533, "step": 1 }, { "epoch": 0.0008882966910948257, "grad_norm": 16.278912065775813, "learning_rate": 2.2197558268590458e-08, "loss": 0.5359, "step": 2 }, { "epoch": 0.0013324450366422385, "grad_norm": 15.38532645021989, "learning_rate": 3.329633740288568e-08, "loss": 0.6437, "step": 3 }, { "epoch": 0.0017765933821896514, "grad_norm": 13.00040105537638, "learning_rate": 4.4395116537180915e-08, "loss": 0.5256, "step": 4 }, { "epoch": 0.0022207417277370642, "grad_norm": 13.99913175512498, "learning_rate": 5.549389567147614e-08, "loss": 0.559, "step": 5 }, { "epoch": 0.002664890073284477, "grad_norm": 14.827666239037775, "learning_rate": 6.659267480577137e-08, "loss": 0.5342, "step": 6 }, { "epoch": 0.00310903841883189, "grad_norm": 14.56618533431443, "learning_rate": 7.76914539400666e-08, "loss": 0.5818, "step": 7 }, { "epoch": 0.0035531867643793028, "grad_norm": 14.021029607148948, "learning_rate": 8.879023307436183e-08, "loss": 0.5116, "step": 8 }, { "epoch": 0.003997335109926716, "grad_norm": 13.230591443503124, "learning_rate": 9.988901220865707e-08, "loss": 0.5254, "step": 9 }, { "epoch": 0.0044414834554741284, "grad_norm": 14.105717100587825, "learning_rate": 1.1098779134295228e-07, "loss": 0.5469, "step": 10 }, { "epoch": 0.004885631801021541, "grad_norm": 13.393742170063549, "learning_rate": 1.220865704772475e-07, "loss": 0.5273, "step": 11 }, { "epoch": 0.005329780146568954, "grad_norm": 13.614380052118197, "learning_rate": 1.3318534961154273e-07, "loss": 0.5127, "step": 12 }, { "epoch": 0.005773928492116367, "grad_norm": 12.784423466221073, "learning_rate": 1.4428412874583796e-07, "loss": 0.4306, "step": 13 }, { "epoch": 0.00621807683766378, "grad_norm": 13.80866223671461, "learning_rate": 1.553829078801332e-07, "loss": 0.5439, "step": 14 }, { "epoch": 0.006662225183211193, "grad_norm": 14.041859557487046, "learning_rate": 1.6648168701442844e-07, "loss": 0.5188, "step": 15 }, { "epoch": 0.0071063735287586055, "grad_norm": 14.936881953932252, "learning_rate": 1.7758046614872366e-07, "loss": 0.5683, "step": 16 }, { "epoch": 0.007550521874306018, "grad_norm": 15.650434961515957, "learning_rate": 1.886792452830189e-07, "loss": 0.5353, "step": 17 }, { "epoch": 0.007994670219853431, "grad_norm": 14.344720416498966, "learning_rate": 1.9977802441731414e-07, "loss": 0.4958, "step": 18 }, { "epoch": 0.008438818565400843, "grad_norm": 15.791401491309836, "learning_rate": 2.1087680355160934e-07, "loss": 0.5652, "step": 19 }, { "epoch": 0.008882966910948257, "grad_norm": 13.010603153221817, "learning_rate": 2.2197558268590456e-07, "loss": 0.4343, "step": 20 }, { "epoch": 0.009327115256495669, "grad_norm": 15.52141534233184, "learning_rate": 2.330743618201998e-07, "loss": 0.5389, "step": 21 }, { "epoch": 0.009771263602043083, "grad_norm": 15.756642888500245, "learning_rate": 2.44173140954495e-07, "loss": 0.5326, "step": 22 }, { "epoch": 0.010215411947590495, "grad_norm": 15.268952002568577, "learning_rate": 2.5527192008879024e-07, "loss": 0.4824, "step": 23 }, { "epoch": 0.010659560293137908, "grad_norm": 15.932949745262599, "learning_rate": 2.6637069922308547e-07, "loss": 0.5079, "step": 24 }, { "epoch": 0.01110370863868532, "grad_norm": 15.58611205142026, "learning_rate": 2.7746947835738074e-07, "loss": 0.476, "step": 25 }, { "epoch": 0.011547856984232734, "grad_norm": 15.375627217870338, "learning_rate": 2.885682574916759e-07, "loss": 0.4744, "step": 26 }, { "epoch": 0.011992005329780146, "grad_norm": 16.173866872463734, "learning_rate": 2.9966703662597114e-07, "loss": 0.5543, "step": 27 }, { "epoch": 0.01243615367532756, "grad_norm": 11.889556738869189, "learning_rate": 3.107658157602664e-07, "loss": 0.459, "step": 28 }, { "epoch": 0.012880302020874972, "grad_norm": 9.805467442732954, "learning_rate": 3.218645948945616e-07, "loss": 0.4039, "step": 29 }, { "epoch": 0.013324450366422385, "grad_norm": 9.109396128992588, "learning_rate": 3.3296337402885687e-07, "loss": 0.374, "step": 30 }, { "epoch": 0.013768598711969797, "grad_norm": 10.263226704180568, "learning_rate": 3.440621531631521e-07, "loss": 0.5212, "step": 31 }, { "epoch": 0.014212747057517211, "grad_norm": 8.67404802135948, "learning_rate": 3.551609322974473e-07, "loss": 0.449, "step": 32 }, { "epoch": 0.014656895403064623, "grad_norm": 9.21127577620048, "learning_rate": 3.6625971143174255e-07, "loss": 0.4436, "step": 33 }, { "epoch": 0.015101043748612037, "grad_norm": 8.76521126068558, "learning_rate": 3.773584905660378e-07, "loss": 0.4802, "step": 34 }, { "epoch": 0.015545192094159449, "grad_norm": 8.042704299848994, "learning_rate": 3.8845726970033295e-07, "loss": 0.4144, "step": 35 }, { "epoch": 0.015989340439706862, "grad_norm": 7.470228322799545, "learning_rate": 3.995560488346283e-07, "loss": 0.3999, "step": 36 }, { "epoch": 0.016433488785254274, "grad_norm": 8.608115328544926, "learning_rate": 4.1065482796892345e-07, "loss": 0.4636, "step": 37 }, { "epoch": 0.016877637130801686, "grad_norm": 7.697028697048977, "learning_rate": 4.217536071032187e-07, "loss": 0.3997, "step": 38 }, { "epoch": 0.017321785476349102, "grad_norm": 5.663561386344377, "learning_rate": 4.328523862375139e-07, "loss": 0.3691, "step": 39 }, { "epoch": 0.017765933821896514, "grad_norm": 5.938577871272599, "learning_rate": 4.4395116537180913e-07, "loss": 0.3926, "step": 40 }, { "epoch": 0.018210082167443926, "grad_norm": 6.0845722806591365, "learning_rate": 4.5504994450610435e-07, "loss": 0.4168, "step": 41 }, { "epoch": 0.018654230512991338, "grad_norm": 4.988124447126365, "learning_rate": 4.661487236403996e-07, "loss": 0.3995, "step": 42 }, { "epoch": 0.019098378858538753, "grad_norm": 5.157391164042898, "learning_rate": 4.772475027746949e-07, "loss": 0.3846, "step": 43 }, { "epoch": 0.019542527204086165, "grad_norm": 4.524592140728303, "learning_rate": 4.8834628190899e-07, "loss": 0.3393, "step": 44 }, { "epoch": 0.019986675549633577, "grad_norm": 4.748116191926127, "learning_rate": 4.994450610432853e-07, "loss": 0.3278, "step": 45 }, { "epoch": 0.02043082389518099, "grad_norm": 5.1431403204013755, "learning_rate": 5.105438401775805e-07, "loss": 0.3638, "step": 46 }, { "epoch": 0.020874972240728405, "grad_norm": 5.298031590299299, "learning_rate": 5.216426193118758e-07, "loss": 0.3682, "step": 47 }, { "epoch": 0.021319120586275817, "grad_norm": 4.678508163207935, "learning_rate": 5.327413984461709e-07, "loss": 0.3772, "step": 48 }, { "epoch": 0.02176326893182323, "grad_norm": 4.39773491079716, "learning_rate": 5.438401775804662e-07, "loss": 0.3682, "step": 49 }, { "epoch": 0.02220741727737064, "grad_norm": 4.313057042541789, "learning_rate": 5.549389567147615e-07, "loss": 0.3237, "step": 50 }, { "epoch": 0.022651565622918056, "grad_norm": 4.449086461913763, "learning_rate": 5.660377358490567e-07, "loss": 0.3623, "step": 51 }, { "epoch": 0.023095713968465468, "grad_norm": 2.9654000926481783, "learning_rate": 5.771365149833518e-07, "loss": 0.2696, "step": 52 }, { "epoch": 0.02353986231401288, "grad_norm": 2.9606096375757986, "learning_rate": 5.882352941176471e-07, "loss": 0.2518, "step": 53 }, { "epoch": 0.023984010659560292, "grad_norm": 3.4611785952391463, "learning_rate": 5.993340732519423e-07, "loss": 0.2782, "step": 54 }, { "epoch": 0.024428159005107707, "grad_norm": 3.1814895723143284, "learning_rate": 6.104328523862376e-07, "loss": 0.2906, "step": 55 }, { "epoch": 0.02487230735065512, "grad_norm": 3.1808671717848496, "learning_rate": 6.215316315205328e-07, "loss": 0.2983, "step": 56 }, { "epoch": 0.02531645569620253, "grad_norm": 2.118090185635995, "learning_rate": 6.32630410654828e-07, "loss": 0.2158, "step": 57 }, { "epoch": 0.025760604041749943, "grad_norm": 2.697428855954616, "learning_rate": 6.437291897891232e-07, "loss": 0.2815, "step": 58 }, { "epoch": 0.02620475238729736, "grad_norm": 2.8924667859746926, "learning_rate": 6.548279689234186e-07, "loss": 0.2487, "step": 59 }, { "epoch": 0.02664890073284477, "grad_norm": 2.346290297240456, "learning_rate": 6.659267480577137e-07, "loss": 0.2496, "step": 60 }, { "epoch": 0.027093049078392183, "grad_norm": 2.498409492992046, "learning_rate": 6.77025527192009e-07, "loss": 0.2728, "step": 61 }, { "epoch": 0.027537197423939595, "grad_norm": 3.674582885160151, "learning_rate": 6.881243063263042e-07, "loss": 0.2383, "step": 62 }, { "epoch": 0.02798134576948701, "grad_norm": 2.799425394175707, "learning_rate": 6.992230854605994e-07, "loss": 0.3199, "step": 63 }, { "epoch": 0.028425494115034422, "grad_norm": 2.4559947409849805, "learning_rate": 7.103218645948946e-07, "loss": 0.2693, "step": 64 }, { "epoch": 0.028869642460581834, "grad_norm": 2.293048008449068, "learning_rate": 7.214206437291898e-07, "loss": 0.2519, "step": 65 }, { "epoch": 0.029313790806129246, "grad_norm": 2.442411284742022, "learning_rate": 7.325194228634851e-07, "loss": 0.2574, "step": 66 }, { "epoch": 0.02975793915167666, "grad_norm": 1.9858437264830013, "learning_rate": 7.436182019977803e-07, "loss": 0.2059, "step": 67 }, { "epoch": 0.030202087497224073, "grad_norm": 2.561701423318645, "learning_rate": 7.547169811320755e-07, "loss": 0.2526, "step": 68 }, { "epoch": 0.030646235842771485, "grad_norm": 1.7160142067734883, "learning_rate": 7.658157602663707e-07, "loss": 0.2103, "step": 69 }, { "epoch": 0.031090384188318897, "grad_norm": 3.0262045436370473, "learning_rate": 7.769145394006659e-07, "loss": 0.278, "step": 70 }, { "epoch": 0.03153453253386631, "grad_norm": 2.1467589387452786, "learning_rate": 7.880133185349612e-07, "loss": 0.2033, "step": 71 }, { "epoch": 0.031978680879413725, "grad_norm": 2.161876762654834, "learning_rate": 7.991120976692566e-07, "loss": 0.2319, "step": 72 }, { "epoch": 0.03242282922496114, "grad_norm": 2.376008135141836, "learning_rate": 8.102108768035517e-07, "loss": 0.2775, "step": 73 }, { "epoch": 0.03286697757050855, "grad_norm": 2.12961751744709, "learning_rate": 8.213096559378469e-07, "loss": 0.2066, "step": 74 }, { "epoch": 0.033311125916055964, "grad_norm": 2.087503322703807, "learning_rate": 8.324084350721422e-07, "loss": 0.2432, "step": 75 }, { "epoch": 0.03375527426160337, "grad_norm": 2.5677509345382354, "learning_rate": 8.435072142064374e-07, "loss": 0.2027, "step": 76 }, { "epoch": 0.03419942260715079, "grad_norm": 1.7422813109091475, "learning_rate": 8.546059933407326e-07, "loss": 0.1981, "step": 77 }, { "epoch": 0.034643570952698204, "grad_norm": 1.9135429042700127, "learning_rate": 8.657047724750278e-07, "loss": 0.1914, "step": 78 }, { "epoch": 0.03508771929824561, "grad_norm": 2.0631119197996357, "learning_rate": 8.768035516093231e-07, "loss": 0.2247, "step": 79 }, { "epoch": 0.03553186764379303, "grad_norm": 2.3180619400309452, "learning_rate": 8.879023307436183e-07, "loss": 0.2491, "step": 80 }, { "epoch": 0.03597601598934044, "grad_norm": 1.9425485940179281, "learning_rate": 8.990011098779134e-07, "loss": 0.2187, "step": 81 }, { "epoch": 0.03642016433488785, "grad_norm": 1.8295253723979448, "learning_rate": 9.100998890122087e-07, "loss": 0.2012, "step": 82 }, { "epoch": 0.03686431268043527, "grad_norm": 1.9729843745153348, "learning_rate": 9.211986681465039e-07, "loss": 0.209, "step": 83 }, { "epoch": 0.037308461025982675, "grad_norm": 2.037422711387131, "learning_rate": 9.322974472807992e-07, "loss": 0.2094, "step": 84 }, { "epoch": 0.03775260937153009, "grad_norm": 1.577701916798112, "learning_rate": 9.433962264150944e-07, "loss": 0.1858, "step": 85 }, { "epoch": 0.038196757717077506, "grad_norm": 2.139670658650865, "learning_rate": 9.544950055493897e-07, "loss": 0.2254, "step": 86 }, { "epoch": 0.038640906062624915, "grad_norm": 2.8502450993225956, "learning_rate": 9.65593784683685e-07, "loss": 0.2145, "step": 87 }, { "epoch": 0.03908505440817233, "grad_norm": 1.7436844528376316, "learning_rate": 9.7669256381798e-07, "loss": 0.1927, "step": 88 }, { "epoch": 0.039529202753719746, "grad_norm": 2.2129521928377796, "learning_rate": 9.877913429522753e-07, "loss": 0.2449, "step": 89 }, { "epoch": 0.039973351099267154, "grad_norm": 1.7063140977463, "learning_rate": 9.988901220865706e-07, "loss": 0.2002, "step": 90 }, { "epoch": 0.04041749944481457, "grad_norm": 1.7274557393115937, "learning_rate": 1.009988901220866e-06, "loss": 0.2143, "step": 91 }, { "epoch": 0.04086164779036198, "grad_norm": 1.6036310126204065, "learning_rate": 1.021087680355161e-06, "loss": 0.209, "step": 92 }, { "epoch": 0.041305796135909394, "grad_norm": 1.6067518053520686, "learning_rate": 1.0321864594894562e-06, "loss": 0.2051, "step": 93 }, { "epoch": 0.04174994448145681, "grad_norm": 2.193990373798874, "learning_rate": 1.0432852386237515e-06, "loss": 0.2124, "step": 94 }, { "epoch": 0.04219409282700422, "grad_norm": 2.0886683470736536, "learning_rate": 1.0543840177580466e-06, "loss": 0.2283, "step": 95 }, { "epoch": 0.04263824117255163, "grad_norm": 2.111784187748928, "learning_rate": 1.0654827968923419e-06, "loss": 0.2035, "step": 96 }, { "epoch": 0.04308238951809905, "grad_norm": 2.082762904011713, "learning_rate": 1.0765815760266371e-06, "loss": 0.1979, "step": 97 }, { "epoch": 0.04352653786364646, "grad_norm": 1.9566752129472822, "learning_rate": 1.0876803551609324e-06, "loss": 0.2146, "step": 98 }, { "epoch": 0.04397068620919387, "grad_norm": 1.553353783765945, "learning_rate": 1.0987791342952277e-06, "loss": 0.1682, "step": 99 }, { "epoch": 0.04441483455474128, "grad_norm": 1.9387075500236326, "learning_rate": 1.109877913429523e-06, "loss": 0.1792, "step": 100 }, { "epoch": 0.044858982900288696, "grad_norm": 1.980937561976891, "learning_rate": 1.120976692563818e-06, "loss": 0.1896, "step": 101 }, { "epoch": 0.04530313124583611, "grad_norm": 2.4054051662002824, "learning_rate": 1.1320754716981133e-06, "loss": 0.2097, "step": 102 }, { "epoch": 0.04574727959138352, "grad_norm": 1.8504294534254195, "learning_rate": 1.1431742508324086e-06, "loss": 0.1795, "step": 103 }, { "epoch": 0.046191427936930936, "grad_norm": 1.8842730303029716, "learning_rate": 1.1542730299667037e-06, "loss": 0.1883, "step": 104 }, { "epoch": 0.046635576282478344, "grad_norm": 1.5610944241824418, "learning_rate": 1.165371809100999e-06, "loss": 0.1655, "step": 105 }, { "epoch": 0.04707972462802576, "grad_norm": 1.5819842411294622, "learning_rate": 1.1764705882352942e-06, "loss": 0.1811, "step": 106 }, { "epoch": 0.047523872973573175, "grad_norm": 1.9979272057896553, "learning_rate": 1.1875693673695895e-06, "loss": 0.1823, "step": 107 }, { "epoch": 0.047968021319120584, "grad_norm": 1.9458215900555198, "learning_rate": 1.1986681465038846e-06, "loss": 0.2136, "step": 108 }, { "epoch": 0.048412169664668, "grad_norm": 1.8191486796264364, "learning_rate": 1.2097669256381799e-06, "loss": 0.2082, "step": 109 }, { "epoch": 0.048856318010215415, "grad_norm": 1.9290831624594054, "learning_rate": 1.2208657047724751e-06, "loss": 0.1731, "step": 110 }, { "epoch": 0.04930046635576282, "grad_norm": 2.1157513592152233, "learning_rate": 1.2319644839067704e-06, "loss": 0.1883, "step": 111 }, { "epoch": 0.04974461470131024, "grad_norm": 2.0021290506932026, "learning_rate": 1.2430632630410657e-06, "loss": 0.1906, "step": 112 }, { "epoch": 0.05018876304685765, "grad_norm": 1.9159228327922788, "learning_rate": 1.254162042175361e-06, "loss": 0.2176, "step": 113 }, { "epoch": 0.05063291139240506, "grad_norm": 1.7650876352312228, "learning_rate": 1.265260821309656e-06, "loss": 0.1718, "step": 114 }, { "epoch": 0.05107705973795248, "grad_norm": 2.03898072034729, "learning_rate": 1.2763596004439513e-06, "loss": 0.1898, "step": 115 }, { "epoch": 0.051521208083499886, "grad_norm": 1.6693420710972704, "learning_rate": 1.2874583795782464e-06, "loss": 0.1759, "step": 116 }, { "epoch": 0.0519653564290473, "grad_norm": 2.335357803271554, "learning_rate": 1.2985571587125417e-06, "loss": 0.1837, "step": 117 }, { "epoch": 0.05240950477459472, "grad_norm": 1.682419177219224, "learning_rate": 1.3096559378468371e-06, "loss": 0.1808, "step": 118 }, { "epoch": 0.052853653120142126, "grad_norm": 2.0519984686691126, "learning_rate": 1.3207547169811322e-06, "loss": 0.1982, "step": 119 }, { "epoch": 0.05329780146568954, "grad_norm": 1.6368796114014708, "learning_rate": 1.3318534961154275e-06, "loss": 0.1758, "step": 120 }, { "epoch": 0.05374194981123695, "grad_norm": 1.6286801352711888, "learning_rate": 1.3429522752497226e-06, "loss": 0.1936, "step": 121 }, { "epoch": 0.054186098156784365, "grad_norm": 1.8633084178414114, "learning_rate": 1.354051054384018e-06, "loss": 0.202, "step": 122 }, { "epoch": 0.05463024650233178, "grad_norm": 1.9496618888186672, "learning_rate": 1.3651498335183131e-06, "loss": 0.2025, "step": 123 }, { "epoch": 0.05507439484787919, "grad_norm": 2.0859989784182353, "learning_rate": 1.3762486126526084e-06, "loss": 0.1903, "step": 124 }, { "epoch": 0.055518543193426605, "grad_norm": 1.993272387486733, "learning_rate": 1.3873473917869035e-06, "loss": 0.2123, "step": 125 }, { "epoch": 0.05596269153897402, "grad_norm": 2.2645503131575735, "learning_rate": 1.3984461709211987e-06, "loss": 0.2069, "step": 126 }, { "epoch": 0.05640683988452143, "grad_norm": 1.581545113462678, "learning_rate": 1.409544950055494e-06, "loss": 0.1804, "step": 127 }, { "epoch": 0.056850988230068844, "grad_norm": 1.9654703741649047, "learning_rate": 1.4206437291897893e-06, "loss": 0.1625, "step": 128 }, { "epoch": 0.05729513657561625, "grad_norm": 2.452801004200468, "learning_rate": 1.4317425083240844e-06, "loss": 0.1779, "step": 129 }, { "epoch": 0.05773928492116367, "grad_norm": 1.847246271423822, "learning_rate": 1.4428412874583796e-06, "loss": 0.1749, "step": 130 }, { "epoch": 0.05818343326671108, "grad_norm": 1.9357616103684274, "learning_rate": 1.4539400665926751e-06, "loss": 0.2213, "step": 131 }, { "epoch": 0.05862758161225849, "grad_norm": 1.495082490211886, "learning_rate": 1.4650388457269702e-06, "loss": 0.1579, "step": 132 }, { "epoch": 0.05907172995780591, "grad_norm": 1.5859431774202346, "learning_rate": 1.4761376248612655e-06, "loss": 0.1606, "step": 133 }, { "epoch": 0.05951587830335332, "grad_norm": 2.8376409272544687, "learning_rate": 1.4872364039955605e-06, "loss": 0.2481, "step": 134 }, { "epoch": 0.05996002664890073, "grad_norm": 1.7612190558454792, "learning_rate": 1.498335183129856e-06, "loss": 0.1637, "step": 135 }, { "epoch": 0.06040417499444815, "grad_norm": 2.070041455462641, "learning_rate": 1.509433962264151e-06, "loss": 0.2122, "step": 136 }, { "epoch": 0.060848323339995555, "grad_norm": 2.3780145580102627, "learning_rate": 1.5205327413984464e-06, "loss": 0.186, "step": 137 }, { "epoch": 0.06129247168554297, "grad_norm": 1.8658905992142587, "learning_rate": 1.5316315205327414e-06, "loss": 0.1673, "step": 138 }, { "epoch": 0.061736620031090386, "grad_norm": 2.8172930803107077, "learning_rate": 1.5427302996670367e-06, "loss": 0.244, "step": 139 }, { "epoch": 0.062180768376637795, "grad_norm": 1.6725142692022914, "learning_rate": 1.5538290788013318e-06, "loss": 0.157, "step": 140 }, { "epoch": 0.0626249167221852, "grad_norm": 1.392484954569343, "learning_rate": 1.5649278579356273e-06, "loss": 0.1558, "step": 141 }, { "epoch": 0.06306906506773262, "grad_norm": 1.4158061250880813, "learning_rate": 1.5760266370699223e-06, "loss": 0.148, "step": 142 }, { "epoch": 0.06351321341328003, "grad_norm": 1.9695160587869651, "learning_rate": 1.5871254162042176e-06, "loss": 0.1468, "step": 143 }, { "epoch": 0.06395736175882745, "grad_norm": 1.6387389663448713, "learning_rate": 1.5982241953385131e-06, "loss": 0.1363, "step": 144 }, { "epoch": 0.06440151010437487, "grad_norm": 1.977141381047165, "learning_rate": 1.6093229744728082e-06, "loss": 0.1775, "step": 145 }, { "epoch": 0.06484565844992228, "grad_norm": 1.866914454872281, "learning_rate": 1.6204217536071035e-06, "loss": 0.1676, "step": 146 }, { "epoch": 0.06528980679546968, "grad_norm": 1.8479866982339428, "learning_rate": 1.6315205327413985e-06, "loss": 0.1795, "step": 147 }, { "epoch": 0.0657339551410171, "grad_norm": 2.447758458580566, "learning_rate": 1.6426193118756938e-06, "loss": 0.1921, "step": 148 }, { "epoch": 0.06617810348656451, "grad_norm": 1.928627100482751, "learning_rate": 1.653718091009989e-06, "loss": 0.1236, "step": 149 }, { "epoch": 0.06662225183211193, "grad_norm": 2.496031778783847, "learning_rate": 1.6648168701442844e-06, "loss": 0.1263, "step": 150 }, { "epoch": 0.06706640017765934, "grad_norm": 2.543270527791031, "learning_rate": 1.6759156492785794e-06, "loss": 0.1798, "step": 151 }, { "epoch": 0.06751054852320675, "grad_norm": 2.6478561509220637, "learning_rate": 1.6870144284128747e-06, "loss": 0.1976, "step": 152 }, { "epoch": 0.06795469686875416, "grad_norm": 2.046113685099785, "learning_rate": 1.6981132075471698e-06, "loss": 0.1412, "step": 153 }, { "epoch": 0.06839884521430158, "grad_norm": 4.398993310024598, "learning_rate": 1.7092119866814653e-06, "loss": 0.2004, "step": 154 }, { "epoch": 0.06884299355984899, "grad_norm": 1.7977539055103122, "learning_rate": 1.7203107658157603e-06, "loss": 0.1803, "step": 155 }, { "epoch": 0.06928714190539641, "grad_norm": 1.9000289734288398, "learning_rate": 1.7314095449500556e-06, "loss": 0.1725, "step": 156 }, { "epoch": 0.06973129025094381, "grad_norm": 2.8300001805659956, "learning_rate": 1.742508324084351e-06, "loss": 0.1396, "step": 157 }, { "epoch": 0.07017543859649122, "grad_norm": 1.6221647641516457, "learning_rate": 1.7536071032186462e-06, "loss": 0.1377, "step": 158 }, { "epoch": 0.07061958694203864, "grad_norm": 1.8070988364351561, "learning_rate": 1.7647058823529414e-06, "loss": 0.1466, "step": 159 }, { "epoch": 0.07106373528758606, "grad_norm": 1.522016454575872, "learning_rate": 1.7758046614872365e-06, "loss": 0.1508, "step": 160 }, { "epoch": 0.07150788363313347, "grad_norm": 2.5966599384816393, "learning_rate": 1.7869034406215318e-06, "loss": 0.1946, "step": 161 }, { "epoch": 0.07195203197868089, "grad_norm": 2.055913601931369, "learning_rate": 1.7980022197558269e-06, "loss": 0.1485, "step": 162 }, { "epoch": 0.07239618032422829, "grad_norm": 1.716589810835508, "learning_rate": 1.8091009988901223e-06, "loss": 0.1552, "step": 163 }, { "epoch": 0.0728403286697757, "grad_norm": 1.324188540188503, "learning_rate": 1.8201997780244174e-06, "loss": 0.1217, "step": 164 }, { "epoch": 0.07328447701532312, "grad_norm": 1.7875490213167293, "learning_rate": 1.8312985571587127e-06, "loss": 0.1814, "step": 165 }, { "epoch": 0.07372862536087053, "grad_norm": 2.111076485427071, "learning_rate": 1.8423973362930078e-06, "loss": 0.1858, "step": 166 }, { "epoch": 0.07417277370641795, "grad_norm": 1.8246301721111953, "learning_rate": 1.8534961154273032e-06, "loss": 0.1328, "step": 167 }, { "epoch": 0.07461692205196535, "grad_norm": 7.017994935548065, "learning_rate": 1.8645948945615983e-06, "loss": 0.2175, "step": 168 }, { "epoch": 0.07506107039751277, "grad_norm": 1.8833148111605775, "learning_rate": 1.8756936736958936e-06, "loss": 0.169, "step": 169 }, { "epoch": 0.07550521874306018, "grad_norm": 2.1063106621517833, "learning_rate": 1.8867924528301889e-06, "loss": 0.1723, "step": 170 }, { "epoch": 0.0759493670886076, "grad_norm": 2.1080508911518603, "learning_rate": 1.8978912319644842e-06, "loss": 0.1313, "step": 171 }, { "epoch": 0.07639351543415501, "grad_norm": 2.042694432552121, "learning_rate": 1.9089900110987794e-06, "loss": 0.1561, "step": 172 }, { "epoch": 0.07683766377970241, "grad_norm": 2.275654963885769, "learning_rate": 1.9200887902330745e-06, "loss": 0.2056, "step": 173 }, { "epoch": 0.07728181212524983, "grad_norm": 1.4521350560986936, "learning_rate": 1.93118756936737e-06, "loss": 0.1539, "step": 174 }, { "epoch": 0.07772596047079725, "grad_norm": 1.4721774252433903, "learning_rate": 1.942286348501665e-06, "loss": 0.1392, "step": 175 }, { "epoch": 0.07817010881634466, "grad_norm": 2.236429259701826, "learning_rate": 1.95338512763596e-06, "loss": 0.1851, "step": 176 }, { "epoch": 0.07861425716189208, "grad_norm": 2.0509681089981973, "learning_rate": 1.964483906770255e-06, "loss": 0.1613, "step": 177 }, { "epoch": 0.07905840550743949, "grad_norm": 1.921297270021361, "learning_rate": 1.9755826859045507e-06, "loss": 0.1411, "step": 178 }, { "epoch": 0.07950255385298689, "grad_norm": 2.3334407767674534, "learning_rate": 1.9866814650388457e-06, "loss": 0.1673, "step": 179 }, { "epoch": 0.07994670219853431, "grad_norm": 2.4173674731585035, "learning_rate": 1.9977802441731412e-06, "loss": 0.1769, "step": 180 }, { "epoch": 0.08039085054408172, "grad_norm": 2.672814520854723, "learning_rate": 2.0088790233074363e-06, "loss": 0.1882, "step": 181 }, { "epoch": 0.08083499888962914, "grad_norm": 2.082354232015489, "learning_rate": 2.019977802441732e-06, "loss": 0.1594, "step": 182 }, { "epoch": 0.08127914723517655, "grad_norm": 2.0548698535266965, "learning_rate": 2.031076581576027e-06, "loss": 0.162, "step": 183 }, { "epoch": 0.08172329558072396, "grad_norm": 2.352430010700108, "learning_rate": 2.042175360710322e-06, "loss": 0.1429, "step": 184 }, { "epoch": 0.08216744392627137, "grad_norm": 1.8421875981209739, "learning_rate": 2.0532741398446174e-06, "loss": 0.1575, "step": 185 }, { "epoch": 0.08261159227181879, "grad_norm": 1.8037755953700159, "learning_rate": 2.0643729189789125e-06, "loss": 0.1474, "step": 186 }, { "epoch": 0.0830557406173662, "grad_norm": 2.2301663182454887, "learning_rate": 2.075471698113208e-06, "loss": 0.1381, "step": 187 }, { "epoch": 0.08349988896291362, "grad_norm": 1.8820242106298288, "learning_rate": 2.086570477247503e-06, "loss": 0.136, "step": 188 }, { "epoch": 0.08394403730846102, "grad_norm": 2.0480577593839935, "learning_rate": 2.097669256381798e-06, "loss": 0.146, "step": 189 }, { "epoch": 0.08438818565400844, "grad_norm": 1.844554763649842, "learning_rate": 2.108768035516093e-06, "loss": 0.1663, "step": 190 }, { "epoch": 0.08483233399955585, "grad_norm": 1.5207370248204364, "learning_rate": 2.1198668146503887e-06, "loss": 0.1453, "step": 191 }, { "epoch": 0.08527648234510327, "grad_norm": 1.7288600299730912, "learning_rate": 2.1309655937846837e-06, "loss": 0.1492, "step": 192 }, { "epoch": 0.08572063069065068, "grad_norm": 1.6054586875361534, "learning_rate": 2.1420643729189792e-06, "loss": 0.1702, "step": 193 }, { "epoch": 0.0861647790361981, "grad_norm": 2.082434204878279, "learning_rate": 2.1531631520532743e-06, "loss": 0.1417, "step": 194 }, { "epoch": 0.0866089273817455, "grad_norm": 1.893129069527839, "learning_rate": 2.1642619311875694e-06, "loss": 0.1746, "step": 195 }, { "epoch": 0.08705307572729291, "grad_norm": 1.5433606197712828, "learning_rate": 2.175360710321865e-06, "loss": 0.1434, "step": 196 }, { "epoch": 0.08749722407284033, "grad_norm": 1.8343180586393633, "learning_rate": 2.18645948945616e-06, "loss": 0.1635, "step": 197 }, { "epoch": 0.08794137241838774, "grad_norm": 1.9644492187331304, "learning_rate": 2.1975582685904554e-06, "loss": 0.1613, "step": 198 }, { "epoch": 0.08838552076393516, "grad_norm": 2.3183987673393784, "learning_rate": 2.2086570477247505e-06, "loss": 0.1451, "step": 199 }, { "epoch": 0.08882966910948256, "grad_norm": 1.8970811571077912, "learning_rate": 2.219755826859046e-06, "loss": 0.1693, "step": 200 }, { "epoch": 0.08927381745502998, "grad_norm": 1.3438831493644725, "learning_rate": 2.230854605993341e-06, "loss": 0.1442, "step": 201 }, { "epoch": 0.08971796580057739, "grad_norm": 2.0647130607040687, "learning_rate": 2.241953385127636e-06, "loss": 0.1483, "step": 202 }, { "epoch": 0.09016211414612481, "grad_norm": 1.8137712195107252, "learning_rate": 2.253052164261931e-06, "loss": 0.1898, "step": 203 }, { "epoch": 0.09060626249167222, "grad_norm": 1.5677896114201355, "learning_rate": 2.2641509433962266e-06, "loss": 0.1105, "step": 204 }, { "epoch": 0.09105041083721963, "grad_norm": 1.5774232800494377, "learning_rate": 2.2752497225305217e-06, "loss": 0.1439, "step": 205 }, { "epoch": 0.09149455918276704, "grad_norm": 2.191208018434963, "learning_rate": 2.286348501664817e-06, "loss": 0.1739, "step": 206 }, { "epoch": 0.09193870752831446, "grad_norm": 1.8759698513356193, "learning_rate": 2.2974472807991123e-06, "loss": 0.1238, "step": 207 }, { "epoch": 0.09238285587386187, "grad_norm": 2.193936812351504, "learning_rate": 2.3085460599334073e-06, "loss": 0.149, "step": 208 }, { "epoch": 0.09282700421940929, "grad_norm": 1.7624384709819947, "learning_rate": 2.319644839067703e-06, "loss": 0.1395, "step": 209 }, { "epoch": 0.09327115256495669, "grad_norm": 1.940248444895634, "learning_rate": 2.330743618201998e-06, "loss": 0.1321, "step": 210 }, { "epoch": 0.0937153009105041, "grad_norm": 1.623293710115476, "learning_rate": 2.3418423973362934e-06, "loss": 0.1411, "step": 211 }, { "epoch": 0.09415944925605152, "grad_norm": 2.311237234944747, "learning_rate": 2.3529411764705885e-06, "loss": 0.1451, "step": 212 }, { "epoch": 0.09460359760159893, "grad_norm": 2.0964379988774366, "learning_rate": 2.364039955604884e-06, "loss": 0.1628, "step": 213 }, { "epoch": 0.09504774594714635, "grad_norm": 1.57604767113886, "learning_rate": 2.375138734739179e-06, "loss": 0.1282, "step": 214 }, { "epoch": 0.09549189429269377, "grad_norm": 1.446630991649051, "learning_rate": 2.386237513873474e-06, "loss": 0.1392, "step": 215 }, { "epoch": 0.09593604263824117, "grad_norm": 2.274345473449785, "learning_rate": 2.397336293007769e-06, "loss": 0.1962, "step": 216 }, { "epoch": 0.09638019098378858, "grad_norm": 1.5946628028114809, "learning_rate": 2.4084350721420646e-06, "loss": 0.138, "step": 217 }, { "epoch": 0.096824339329336, "grad_norm": 1.882389417953473, "learning_rate": 2.4195338512763597e-06, "loss": 0.1903, "step": 218 }, { "epoch": 0.09726848767488341, "grad_norm": 1.904432577295931, "learning_rate": 2.430632630410655e-06, "loss": 0.1481, "step": 219 }, { "epoch": 0.09771263602043083, "grad_norm": 2.080561584835578, "learning_rate": 2.4417314095449503e-06, "loss": 0.1514, "step": 220 }, { "epoch": 0.09815678436597823, "grad_norm": 1.6677943098084358, "learning_rate": 2.4528301886792453e-06, "loss": 0.1347, "step": 221 }, { "epoch": 0.09860093271152565, "grad_norm": 1.7294352905463155, "learning_rate": 2.463928967813541e-06, "loss": 0.1386, "step": 222 }, { "epoch": 0.09904508105707306, "grad_norm": 2.0928510956520934, "learning_rate": 2.475027746947836e-06, "loss": 0.1611, "step": 223 }, { "epoch": 0.09948922940262048, "grad_norm": 1.6018044625885859, "learning_rate": 2.4861265260821314e-06, "loss": 0.1338, "step": 224 }, { "epoch": 0.09993337774816789, "grad_norm": 1.9068721122681065, "learning_rate": 2.4972253052164264e-06, "loss": 0.1557, "step": 225 }, { "epoch": 0.1003775260937153, "grad_norm": 1.5854054800345856, "learning_rate": 2.508324084350722e-06, "loss": 0.1582, "step": 226 }, { "epoch": 0.10082167443926271, "grad_norm": 1.6327772348060883, "learning_rate": 2.519422863485017e-06, "loss": 0.153, "step": 227 }, { "epoch": 0.10126582278481013, "grad_norm": 1.7221707573000986, "learning_rate": 2.530521642619312e-06, "loss": 0.1521, "step": 228 }, { "epoch": 0.10170997113035754, "grad_norm": 3.8860493727438605, "learning_rate": 2.541620421753607e-06, "loss": 0.185, "step": 229 }, { "epoch": 0.10215411947590496, "grad_norm": 2.2982967806121057, "learning_rate": 2.5527192008879026e-06, "loss": 0.1268, "step": 230 }, { "epoch": 0.10259826782145237, "grad_norm": 2.633523229110552, "learning_rate": 2.563817980022198e-06, "loss": 0.1312, "step": 231 }, { "epoch": 0.10304241616699977, "grad_norm": 1.8405348072939953, "learning_rate": 2.5749167591564928e-06, "loss": 0.1352, "step": 232 }, { "epoch": 0.10348656451254719, "grad_norm": 1.5009853192812423, "learning_rate": 2.5860155382907882e-06, "loss": 0.1191, "step": 233 }, { "epoch": 0.1039307128580946, "grad_norm": 1.3280238597160159, "learning_rate": 2.5971143174250833e-06, "loss": 0.1207, "step": 234 }, { "epoch": 0.10437486120364202, "grad_norm": 1.412771085327836, "learning_rate": 2.608213096559379e-06, "loss": 0.1343, "step": 235 }, { "epoch": 0.10481900954918943, "grad_norm": 1.4216505684340854, "learning_rate": 2.6193118756936743e-06, "loss": 0.1312, "step": 236 }, { "epoch": 0.10526315789473684, "grad_norm": 1.5128985222362534, "learning_rate": 2.630410654827969e-06, "loss": 0.1351, "step": 237 }, { "epoch": 0.10570730624028425, "grad_norm": 1.9599268561293408, "learning_rate": 2.6415094339622644e-06, "loss": 0.1403, "step": 238 }, { "epoch": 0.10615145458583167, "grad_norm": 1.4181236006554954, "learning_rate": 2.6526082130965595e-06, "loss": 0.1377, "step": 239 }, { "epoch": 0.10659560293137908, "grad_norm": 1.9917215037873872, "learning_rate": 2.663706992230855e-06, "loss": 0.1451, "step": 240 }, { "epoch": 0.1070397512769265, "grad_norm": 1.683475509658225, "learning_rate": 2.67480577136515e-06, "loss": 0.1375, "step": 241 }, { "epoch": 0.1074838996224739, "grad_norm": 1.8301967264376793, "learning_rate": 2.685904550499445e-06, "loss": 0.1464, "step": 242 }, { "epoch": 0.10792804796802132, "grad_norm": 1.6846261045693358, "learning_rate": 2.6970033296337406e-06, "loss": 0.1332, "step": 243 }, { "epoch": 0.10837219631356873, "grad_norm": 1.2774719377840502, "learning_rate": 2.708102108768036e-06, "loss": 0.1292, "step": 244 }, { "epoch": 0.10881634465911615, "grad_norm": 1.595633542836346, "learning_rate": 2.7192008879023307e-06, "loss": 0.1386, "step": 245 }, { "epoch": 0.10926049300466356, "grad_norm": 1.4528662121606482, "learning_rate": 2.7302996670366262e-06, "loss": 0.1392, "step": 246 }, { "epoch": 0.10970464135021098, "grad_norm": 1.5309789173048087, "learning_rate": 2.7413984461709213e-06, "loss": 0.1271, "step": 247 }, { "epoch": 0.11014878969575838, "grad_norm": 1.518105261331223, "learning_rate": 2.7524972253052168e-06, "loss": 0.1237, "step": 248 }, { "epoch": 0.1105929380413058, "grad_norm": 1.4676341827820025, "learning_rate": 2.7635960044395123e-06, "loss": 0.1317, "step": 249 }, { "epoch": 0.11103708638685321, "grad_norm": 2.146666992664208, "learning_rate": 2.774694783573807e-06, "loss": 0.1266, "step": 250 }, { "epoch": 0.11148123473240062, "grad_norm": 1.450876621074019, "learning_rate": 2.7857935627081024e-06, "loss": 0.1124, "step": 251 }, { "epoch": 0.11192538307794804, "grad_norm": 1.6447214379715893, "learning_rate": 2.7968923418423975e-06, "loss": 0.1176, "step": 252 }, { "epoch": 0.11236953142349544, "grad_norm": 1.85196408048202, "learning_rate": 2.807991120976693e-06, "loss": 0.1196, "step": 253 }, { "epoch": 0.11281367976904286, "grad_norm": 1.8297536053418253, "learning_rate": 2.819089900110988e-06, "loss": 0.1312, "step": 254 }, { "epoch": 0.11325782811459027, "grad_norm": 1.7773730595281947, "learning_rate": 2.830188679245283e-06, "loss": 0.1343, "step": 255 }, { "epoch": 0.11370197646013769, "grad_norm": 1.5584909202093926, "learning_rate": 2.8412874583795786e-06, "loss": 0.1075, "step": 256 }, { "epoch": 0.1141461248056851, "grad_norm": 1.9920621657295152, "learning_rate": 2.852386237513874e-06, "loss": 0.1591, "step": 257 }, { "epoch": 0.1145902731512325, "grad_norm": 2.0535793857620264, "learning_rate": 2.8634850166481687e-06, "loss": 0.1459, "step": 258 }, { "epoch": 0.11503442149677992, "grad_norm": 1.432401039912359, "learning_rate": 2.8745837957824642e-06, "loss": 0.1235, "step": 259 }, { "epoch": 0.11547856984232734, "grad_norm": 2.3190551632714427, "learning_rate": 2.8856825749167593e-06, "loss": 0.1608, "step": 260 }, { "epoch": 0.11592271818787475, "grad_norm": 1.4461950992247072, "learning_rate": 2.8967813540510548e-06, "loss": 0.1242, "step": 261 }, { "epoch": 0.11636686653342217, "grad_norm": 1.60132726584782, "learning_rate": 2.9078801331853503e-06, "loss": 0.1057, "step": 262 }, { "epoch": 0.11681101487896958, "grad_norm": 2.5527110858786553, "learning_rate": 2.918978912319645e-06, "loss": 0.1735, "step": 263 }, { "epoch": 0.11725516322451698, "grad_norm": 1.587535094612724, "learning_rate": 2.9300776914539404e-06, "loss": 0.1145, "step": 264 }, { "epoch": 0.1176993115700644, "grad_norm": 1.8628254342286168, "learning_rate": 2.9411764705882355e-06, "loss": 0.1475, "step": 265 }, { "epoch": 0.11814345991561181, "grad_norm": 2.029208074213613, "learning_rate": 2.952275249722531e-06, "loss": 0.1448, "step": 266 }, { "epoch": 0.11858760826115923, "grad_norm": 1.788916557618341, "learning_rate": 2.9633740288568256e-06, "loss": 0.1223, "step": 267 }, { "epoch": 0.11903175660670665, "grad_norm": 2.0663932534564147, "learning_rate": 2.974472807991121e-06, "loss": 0.1456, "step": 268 }, { "epoch": 0.11947590495225405, "grad_norm": 1.5929695265003, "learning_rate": 2.9855715871254166e-06, "loss": 0.1427, "step": 269 }, { "epoch": 0.11992005329780146, "grad_norm": 1.8366971238631566, "learning_rate": 2.996670366259712e-06, "loss": 0.1029, "step": 270 }, { "epoch": 0.12036420164334888, "grad_norm": 1.4941973532109012, "learning_rate": 3.0077691453940067e-06, "loss": 0.1115, "step": 271 }, { "epoch": 0.1208083499888963, "grad_norm": 2.1604567983633403, "learning_rate": 3.018867924528302e-06, "loss": 0.11, "step": 272 }, { "epoch": 0.12125249833444371, "grad_norm": 1.5440708951155746, "learning_rate": 3.0299667036625973e-06, "loss": 0.1312, "step": 273 }, { "epoch": 0.12169664667999111, "grad_norm": 1.1320500690500013, "learning_rate": 3.0410654827968928e-06, "loss": 0.1172, "step": 274 }, { "epoch": 0.12214079502553853, "grad_norm": 2.1326279999447326, "learning_rate": 3.0521642619311882e-06, "loss": 0.1099, "step": 275 }, { "epoch": 0.12258494337108594, "grad_norm": 1.8624808272019928, "learning_rate": 3.063263041065483e-06, "loss": 0.1061, "step": 276 }, { "epoch": 0.12302909171663336, "grad_norm": 2.836562163833866, "learning_rate": 3.0743618201997784e-06, "loss": 0.1173, "step": 277 }, { "epoch": 0.12347324006218077, "grad_norm": 1.5779920497766018, "learning_rate": 3.0854605993340734e-06, "loss": 0.1305, "step": 278 }, { "epoch": 0.12391738840772819, "grad_norm": 1.4061427971159166, "learning_rate": 3.096559378468369e-06, "loss": 0.1063, "step": 279 }, { "epoch": 0.12436153675327559, "grad_norm": 1.4735808311660463, "learning_rate": 3.1076581576026636e-06, "loss": 0.1478, "step": 280 }, { "epoch": 0.124805685098823, "grad_norm": 1.9063172097566101, "learning_rate": 3.118756936736959e-06, "loss": 0.152, "step": 281 }, { "epoch": 0.1252498334443704, "grad_norm": 1.2460125667748942, "learning_rate": 3.1298557158712546e-06, "loss": 0.1078, "step": 282 }, { "epoch": 0.12569398178991784, "grad_norm": 1.422370758153891, "learning_rate": 3.1409544950055496e-06, "loss": 0.1143, "step": 283 }, { "epoch": 0.12613813013546524, "grad_norm": 1.3901208342210212, "learning_rate": 3.1520532741398447e-06, "loss": 0.1232, "step": 284 }, { "epoch": 0.12658227848101267, "grad_norm": 1.2571280817929795, "learning_rate": 3.16315205327414e-06, "loss": 0.1183, "step": 285 }, { "epoch": 0.12702642682656007, "grad_norm": 1.7211835588268667, "learning_rate": 3.1742508324084352e-06, "loss": 0.1376, "step": 286 }, { "epoch": 0.1274705751721075, "grad_norm": 1.7352722841271955, "learning_rate": 3.1853496115427307e-06, "loss": 0.1221, "step": 287 }, { "epoch": 0.1279147235176549, "grad_norm": 1.608485934770127, "learning_rate": 3.1964483906770262e-06, "loss": 0.1287, "step": 288 }, { "epoch": 0.1283588718632023, "grad_norm": 1.4070749655660284, "learning_rate": 3.207547169811321e-06, "loss": 0.1107, "step": 289 }, { "epoch": 0.12880302020874973, "grad_norm": 1.8061966734954316, "learning_rate": 3.2186459489456164e-06, "loss": 0.1209, "step": 290 }, { "epoch": 0.12924716855429713, "grad_norm": 1.9290505056364757, "learning_rate": 3.2297447280799114e-06, "loss": 0.1088, "step": 291 }, { "epoch": 0.12969131689984456, "grad_norm": 1.2873703020140206, "learning_rate": 3.240843507214207e-06, "loss": 0.1107, "step": 292 }, { "epoch": 0.13013546524539196, "grad_norm": 2.100967763487988, "learning_rate": 3.2519422863485016e-06, "loss": 0.1746, "step": 293 }, { "epoch": 0.13057961359093936, "grad_norm": 2.9637529933084785, "learning_rate": 3.263041065482797e-06, "loss": 0.1408, "step": 294 }, { "epoch": 0.1310237619364868, "grad_norm": 1.8502103362104685, "learning_rate": 3.2741398446170925e-06, "loss": 0.108, "step": 295 }, { "epoch": 0.1314679102820342, "grad_norm": 1.5072896588301588, "learning_rate": 3.2852386237513876e-06, "loss": 0.1233, "step": 296 }, { "epoch": 0.13191205862758162, "grad_norm": 1.9060937237646072, "learning_rate": 3.2963374028856827e-06, "loss": 0.1137, "step": 297 }, { "epoch": 0.13235620697312903, "grad_norm": 1.4214687758215054, "learning_rate": 3.307436182019978e-06, "loss": 0.1181, "step": 298 }, { "epoch": 0.13280035531867643, "grad_norm": 1.5173189244791243, "learning_rate": 3.3185349611542732e-06, "loss": 0.1221, "step": 299 }, { "epoch": 0.13324450366422386, "grad_norm": 1.4086327372245158, "learning_rate": 3.3296337402885687e-06, "loss": 0.1598, "step": 300 }, { "epoch": 0.13368865200977126, "grad_norm": 1.3949120100912162, "learning_rate": 3.3407325194228642e-06, "loss": 0.0996, "step": 301 }, { "epoch": 0.1341328003553187, "grad_norm": 1.6249998744801628, "learning_rate": 3.351831298557159e-06, "loss": 0.0959, "step": 302 }, { "epoch": 0.1345769487008661, "grad_norm": 1.7178562509007014, "learning_rate": 3.3629300776914543e-06, "loss": 0.1348, "step": 303 }, { "epoch": 0.1350210970464135, "grad_norm": 1.7790098039504103, "learning_rate": 3.3740288568257494e-06, "loss": 0.1011, "step": 304 }, { "epoch": 0.13546524539196092, "grad_norm": 1.4533709920798474, "learning_rate": 3.385127635960045e-06, "loss": 0.1177, "step": 305 }, { "epoch": 0.13590939373750832, "grad_norm": 1.7170638072373428, "learning_rate": 3.3962264150943395e-06, "loss": 0.1264, "step": 306 }, { "epoch": 0.13635354208305575, "grad_norm": 1.1622578542744249, "learning_rate": 3.407325194228635e-06, "loss": 0.1164, "step": 307 }, { "epoch": 0.13679769042860315, "grad_norm": 1.7861497563291042, "learning_rate": 3.4184239733629305e-06, "loss": 0.1328, "step": 308 }, { "epoch": 0.13724183877415055, "grad_norm": 1.2393311320446403, "learning_rate": 3.4295227524972256e-06, "loss": 0.0994, "step": 309 }, { "epoch": 0.13768598711969798, "grad_norm": 1.779362058176627, "learning_rate": 3.4406215316315207e-06, "loss": 0.131, "step": 310 }, { "epoch": 0.13813013546524538, "grad_norm": 1.384763433835653, "learning_rate": 3.4517203107658157e-06, "loss": 0.1011, "step": 311 }, { "epoch": 0.13857428381079281, "grad_norm": 1.5455433688862117, "learning_rate": 3.4628190899001112e-06, "loss": 0.1216, "step": 312 }, { "epoch": 0.13901843215634022, "grad_norm": 1.3658352699008705, "learning_rate": 3.4739178690344067e-06, "loss": 0.123, "step": 313 }, { "epoch": 0.13946258050188762, "grad_norm": 1.3724682796873768, "learning_rate": 3.485016648168702e-06, "loss": 0.1209, "step": 314 }, { "epoch": 0.13990672884743505, "grad_norm": 1.608691375904217, "learning_rate": 3.496115427302997e-06, "loss": 0.1141, "step": 315 }, { "epoch": 0.14035087719298245, "grad_norm": 1.3598637431605427, "learning_rate": 3.5072142064372923e-06, "loss": 0.1528, "step": 316 }, { "epoch": 0.14079502553852988, "grad_norm": 1.0876962111896626, "learning_rate": 3.5183129855715874e-06, "loss": 0.1049, "step": 317 }, { "epoch": 0.14123917388407728, "grad_norm": 1.3385892109435766, "learning_rate": 3.529411764705883e-06, "loss": 0.1037, "step": 318 }, { "epoch": 0.14168332222962468, "grad_norm": 1.7433775937165439, "learning_rate": 3.5405105438401775e-06, "loss": 0.1486, "step": 319 }, { "epoch": 0.1421274705751721, "grad_norm": 1.5533508842477224, "learning_rate": 3.551609322974473e-06, "loss": 0.1049, "step": 320 }, { "epoch": 0.1425716189207195, "grad_norm": 1.2029122587877374, "learning_rate": 3.5627081021087685e-06, "loss": 0.1098, "step": 321 }, { "epoch": 0.14301576726626694, "grad_norm": 1.8995176312013884, "learning_rate": 3.5738068812430636e-06, "loss": 0.0944, "step": 322 }, { "epoch": 0.14345991561181434, "grad_norm": 1.6602519149722867, "learning_rate": 3.5849056603773586e-06, "loss": 0.1243, "step": 323 }, { "epoch": 0.14390406395736177, "grad_norm": 1.6075958194272566, "learning_rate": 3.5960044395116537e-06, "loss": 0.1196, "step": 324 }, { "epoch": 0.14434821230290917, "grad_norm": 1.510226320322185, "learning_rate": 3.607103218645949e-06, "loss": 0.1422, "step": 325 }, { "epoch": 0.14479236064845658, "grad_norm": 1.2802794537606514, "learning_rate": 3.6182019977802447e-06, "loss": 0.1244, "step": 326 }, { "epoch": 0.145236508994004, "grad_norm": 1.1595946058732067, "learning_rate": 3.6293007769145398e-06, "loss": 0.0929, "step": 327 }, { "epoch": 0.1456806573395514, "grad_norm": 1.2381936978069086, "learning_rate": 3.640399556048835e-06, "loss": 0.1002, "step": 328 }, { "epoch": 0.14612480568509884, "grad_norm": 2.4354535778190742, "learning_rate": 3.6514983351831303e-06, "loss": 0.1377, "step": 329 }, { "epoch": 0.14656895403064624, "grad_norm": 1.5157062223087485, "learning_rate": 3.6625971143174254e-06, "loss": 0.1206, "step": 330 }, { "epoch": 0.14701310237619364, "grad_norm": 1.3681542301294034, "learning_rate": 3.673695893451721e-06, "loss": 0.1047, "step": 331 }, { "epoch": 0.14745725072174107, "grad_norm": 1.9762614541590338, "learning_rate": 3.6847946725860155e-06, "loss": 0.1249, "step": 332 }, { "epoch": 0.14790139906728847, "grad_norm": 1.391859368616253, "learning_rate": 3.695893451720311e-06, "loss": 0.1015, "step": 333 }, { "epoch": 0.1483455474128359, "grad_norm": 1.1072542539549668, "learning_rate": 3.7069922308546065e-06, "loss": 0.0931, "step": 334 }, { "epoch": 0.1487896957583833, "grad_norm": 1.4909584737380348, "learning_rate": 3.7180910099889016e-06, "loss": 0.1141, "step": 335 }, { "epoch": 0.1492338441039307, "grad_norm": 1.7478929922992545, "learning_rate": 3.7291897891231966e-06, "loss": 0.0891, "step": 336 }, { "epoch": 0.14967799244947813, "grad_norm": 1.5597867645297776, "learning_rate": 3.7402885682574917e-06, "loss": 0.128, "step": 337 }, { "epoch": 0.15012214079502553, "grad_norm": 1.4083772110340225, "learning_rate": 3.751387347391787e-06, "loss": 0.1337, "step": 338 }, { "epoch": 0.15056628914057296, "grad_norm": 2.011956681151715, "learning_rate": 3.7624861265260827e-06, "loss": 0.1036, "step": 339 }, { "epoch": 0.15101043748612036, "grad_norm": 2.154404794046358, "learning_rate": 3.7735849056603777e-06, "loss": 0.1104, "step": 340 }, { "epoch": 0.15145458583166777, "grad_norm": 2.1106357904010316, "learning_rate": 3.784683684794673e-06, "loss": 0.1686, "step": 341 }, { "epoch": 0.1518987341772152, "grad_norm": 1.6933676844964125, "learning_rate": 3.7957824639289683e-06, "loss": 0.1375, "step": 342 }, { "epoch": 0.1523428825227626, "grad_norm": 2.205537988403305, "learning_rate": 3.8068812430632634e-06, "loss": 0.1513, "step": 343 }, { "epoch": 0.15278703086831003, "grad_norm": 1.2058069327729946, "learning_rate": 3.817980022197559e-06, "loss": 0.091, "step": 344 }, { "epoch": 0.15323117921385743, "grad_norm": 1.400652563163732, "learning_rate": 3.829078801331854e-06, "loss": 0.0826, "step": 345 }, { "epoch": 0.15367532755940483, "grad_norm": 2.057959821320217, "learning_rate": 3.840177580466149e-06, "loss": 0.118, "step": 346 }, { "epoch": 0.15411947590495226, "grad_norm": 1.6604930714816526, "learning_rate": 3.851276359600444e-06, "loss": 0.1301, "step": 347 }, { "epoch": 0.15456362425049966, "grad_norm": 1.033671229980745, "learning_rate": 3.86237513873474e-06, "loss": 0.0924, "step": 348 }, { "epoch": 0.1550077725960471, "grad_norm": 1.17967777436608, "learning_rate": 3.873473917869034e-06, "loss": 0.0978, "step": 349 }, { "epoch": 0.1554519209415945, "grad_norm": 1.650470297879078, "learning_rate": 3.88457269700333e-06, "loss": 0.133, "step": 350 }, { "epoch": 0.1558960692871419, "grad_norm": 1.47519341296619, "learning_rate": 3.895671476137625e-06, "loss": 0.1254, "step": 351 }, { "epoch": 0.15634021763268932, "grad_norm": 1.4035761818876917, "learning_rate": 3.90677025527192e-06, "loss": 0.1183, "step": 352 }, { "epoch": 0.15678436597823672, "grad_norm": 2.2555395385843036, "learning_rate": 3.917869034406216e-06, "loss": 0.1202, "step": 353 }, { "epoch": 0.15722851432378415, "grad_norm": 1.4759637016708067, "learning_rate": 3.92896781354051e-06, "loss": 0.1104, "step": 354 }, { "epoch": 0.15767266266933155, "grad_norm": 2.952687567266444, "learning_rate": 3.940066592674806e-06, "loss": 0.122, "step": 355 }, { "epoch": 0.15811681101487898, "grad_norm": 1.1679111801200106, "learning_rate": 3.951165371809101e-06, "loss": 0.0905, "step": 356 }, { "epoch": 0.15856095936042638, "grad_norm": 1.660729161757867, "learning_rate": 3.962264150943396e-06, "loss": 0.1521, "step": 357 }, { "epoch": 0.15900510770597379, "grad_norm": 1.3321535227222834, "learning_rate": 3.9733629300776915e-06, "loss": 0.1315, "step": 358 }, { "epoch": 0.15944925605152122, "grad_norm": 1.1798749965091133, "learning_rate": 3.9844617092119866e-06, "loss": 0.0852, "step": 359 }, { "epoch": 0.15989340439706862, "grad_norm": 1.198919817717748, "learning_rate": 3.9955604883462825e-06, "loss": 0.1029, "step": 360 }, { "epoch": 0.16033755274261605, "grad_norm": 1.1799761966591196, "learning_rate": 4.0066592674805775e-06, "loss": 0.1009, "step": 361 }, { "epoch": 0.16078170108816345, "grad_norm": 1.3843920770849105, "learning_rate": 4.017758046614873e-06, "loss": 0.1078, "step": 362 }, { "epoch": 0.16122584943371085, "grad_norm": 1.230254034984121, "learning_rate": 4.028856825749168e-06, "loss": 0.1218, "step": 363 }, { "epoch": 0.16166999777925828, "grad_norm": 1.0307816837555734, "learning_rate": 4.039955604883464e-06, "loss": 0.1138, "step": 364 }, { "epoch": 0.16211414612480568, "grad_norm": 1.1369945329768874, "learning_rate": 4.051054384017759e-06, "loss": 0.0917, "step": 365 }, { "epoch": 0.1625582944703531, "grad_norm": 1.3026077981860287, "learning_rate": 4.062153163152054e-06, "loss": 0.1209, "step": 366 }, { "epoch": 0.1630024428159005, "grad_norm": 1.8570215631965663, "learning_rate": 4.073251942286349e-06, "loss": 0.103, "step": 367 }, { "epoch": 0.1634465911614479, "grad_norm": 1.4261357373143895, "learning_rate": 4.084350721420644e-06, "loss": 0.1117, "step": 368 }, { "epoch": 0.16389073950699534, "grad_norm": 1.2842977064487127, "learning_rate": 4.09544950055494e-06, "loss": 0.0945, "step": 369 }, { "epoch": 0.16433488785254274, "grad_norm": 1.7942409210847103, "learning_rate": 4.106548279689235e-06, "loss": 0.1658, "step": 370 }, { "epoch": 0.16477903619809017, "grad_norm": 1.2570893224456356, "learning_rate": 4.11764705882353e-06, "loss": 0.1238, "step": 371 }, { "epoch": 0.16522318454363757, "grad_norm": 1.0685833693361235, "learning_rate": 4.128745837957825e-06, "loss": 0.0964, "step": 372 }, { "epoch": 0.16566733288918498, "grad_norm": 1.2815898248385025, "learning_rate": 4.13984461709212e-06, "loss": 0.105, "step": 373 }, { "epoch": 0.1661114812347324, "grad_norm": 1.64061710866675, "learning_rate": 4.150943396226416e-06, "loss": 0.1096, "step": 374 }, { "epoch": 0.1665556295802798, "grad_norm": 1.3088826974530428, "learning_rate": 4.16204217536071e-06, "loss": 0.1211, "step": 375 }, { "epoch": 0.16699977792582724, "grad_norm": 1.3003151707348597, "learning_rate": 4.173140954495006e-06, "loss": 0.1111, "step": 376 }, { "epoch": 0.16744392627137464, "grad_norm": 1.3124364780409907, "learning_rate": 4.184239733629301e-06, "loss": 0.1085, "step": 377 }, { "epoch": 0.16788807461692204, "grad_norm": 1.3347095844879298, "learning_rate": 4.195338512763596e-06, "loss": 0.1228, "step": 378 }, { "epoch": 0.16833222296246947, "grad_norm": 1.3151237942729317, "learning_rate": 4.206437291897892e-06, "loss": 0.1041, "step": 379 }, { "epoch": 0.16877637130801687, "grad_norm": 1.6082774402292863, "learning_rate": 4.217536071032186e-06, "loss": 0.0915, "step": 380 }, { "epoch": 0.1692205196535643, "grad_norm": 1.2544764918032303, "learning_rate": 4.228634850166482e-06, "loss": 0.0903, "step": 381 }, { "epoch": 0.1696646679991117, "grad_norm": 1.4709993024116135, "learning_rate": 4.239733629300777e-06, "loss": 0.1095, "step": 382 }, { "epoch": 0.1701088163446591, "grad_norm": 1.090059538683836, "learning_rate": 4.250832408435072e-06, "loss": 0.09, "step": 383 }, { "epoch": 0.17055296469020653, "grad_norm": 1.936001842072003, "learning_rate": 4.2619311875693675e-06, "loss": 0.125, "step": 384 }, { "epoch": 0.17099711303575393, "grad_norm": 1.7118256852712324, "learning_rate": 4.2730299667036625e-06, "loss": 0.095, "step": 385 }, { "epoch": 0.17144126138130136, "grad_norm": 2.0311105495228268, "learning_rate": 4.2841287458379584e-06, "loss": 0.1029, "step": 386 }, { "epoch": 0.17188540972684876, "grad_norm": 1.444490027117435, "learning_rate": 4.2952275249722535e-06, "loss": 0.1232, "step": 387 }, { "epoch": 0.1723295580723962, "grad_norm": 1.4378003146516516, "learning_rate": 4.3063263041065486e-06, "loss": 0.0932, "step": 388 }, { "epoch": 0.1727737064179436, "grad_norm": 1.1043097695262434, "learning_rate": 4.317425083240844e-06, "loss": 0.0994, "step": 389 }, { "epoch": 0.173217854763491, "grad_norm": 1.1954032097891434, "learning_rate": 4.328523862375139e-06, "loss": 0.101, "step": 390 }, { "epoch": 0.17366200310903843, "grad_norm": 1.6447959614931191, "learning_rate": 4.339622641509435e-06, "loss": 0.1343, "step": 391 }, { "epoch": 0.17410615145458583, "grad_norm": 1.5660886998105679, "learning_rate": 4.35072142064373e-06, "loss": 0.1052, "step": 392 }, { "epoch": 0.17455029980013326, "grad_norm": 1.129037326613576, "learning_rate": 4.361820199778025e-06, "loss": 0.0949, "step": 393 }, { "epoch": 0.17499444814568066, "grad_norm": 0.9299986418563283, "learning_rate": 4.37291897891232e-06, "loss": 0.091, "step": 394 }, { "epoch": 0.17543859649122806, "grad_norm": 1.5748400271295198, "learning_rate": 4.384017758046616e-06, "loss": 0.1688, "step": 395 }, { "epoch": 0.1758827448367755, "grad_norm": 0.8778899838246078, "learning_rate": 4.395116537180911e-06, "loss": 0.0983, "step": 396 }, { "epoch": 0.1763268931823229, "grad_norm": 1.1516100437124952, "learning_rate": 4.406215316315206e-06, "loss": 0.1201, "step": 397 }, { "epoch": 0.17677104152787032, "grad_norm": 0.9883123409205935, "learning_rate": 4.417314095449501e-06, "loss": 0.0693, "step": 398 }, { "epoch": 0.17721518987341772, "grad_norm": 1.367128447993114, "learning_rate": 4.428412874583796e-06, "loss": 0.1009, "step": 399 }, { "epoch": 0.17765933821896512, "grad_norm": 1.2639479044263988, "learning_rate": 4.439511653718092e-06, "loss": 0.093, "step": 400 }, { "epoch": 0.17810348656451255, "grad_norm": 1.284067214420426, "learning_rate": 4.450610432852386e-06, "loss": 0.1029, "step": 401 }, { "epoch": 0.17854763491005995, "grad_norm": 0.9723989283162116, "learning_rate": 4.461709211986682e-06, "loss": 0.093, "step": 402 }, { "epoch": 0.17899178325560738, "grad_norm": 1.1814808630811793, "learning_rate": 4.472807991120977e-06, "loss": 0.1004, "step": 403 }, { "epoch": 0.17943593160115479, "grad_norm": 1.2100875095234747, "learning_rate": 4.483906770255272e-06, "loss": 0.1165, "step": 404 }, { "epoch": 0.1798800799467022, "grad_norm": 1.2480781051362362, "learning_rate": 4.495005549389568e-06, "loss": 0.1086, "step": 405 }, { "epoch": 0.18032422829224962, "grad_norm": 1.4659555944931602, "learning_rate": 4.506104328523862e-06, "loss": 0.0966, "step": 406 }, { "epoch": 0.18076837663779702, "grad_norm": 1.2905749286816455, "learning_rate": 4.517203107658158e-06, "loss": 0.1308, "step": 407 }, { "epoch": 0.18121252498334445, "grad_norm": 1.2083135245558752, "learning_rate": 4.528301886792453e-06, "loss": 0.139, "step": 408 }, { "epoch": 0.18165667332889185, "grad_norm": 1.2677821305643562, "learning_rate": 4.539400665926748e-06, "loss": 0.1522, "step": 409 }, { "epoch": 0.18210082167443925, "grad_norm": 0.8309366049632025, "learning_rate": 4.5504994450610434e-06, "loss": 0.0955, "step": 410 }, { "epoch": 0.18254497001998668, "grad_norm": 1.6708204443941888, "learning_rate": 4.5615982241953385e-06, "loss": 0.0983, "step": 411 }, { "epoch": 0.18298911836553408, "grad_norm": 1.8028863536801523, "learning_rate": 4.572697003329634e-06, "loss": 0.1141, "step": 412 }, { "epoch": 0.1834332667110815, "grad_norm": 1.6039168610165708, "learning_rate": 4.5837957824639295e-06, "loss": 0.14, "step": 413 }, { "epoch": 0.1838774150566289, "grad_norm": 1.3909894579389195, "learning_rate": 4.5948945615982245e-06, "loss": 0.0924, "step": 414 }, { "epoch": 0.1843215634021763, "grad_norm": 1.444002762358773, "learning_rate": 4.60599334073252e-06, "loss": 0.0882, "step": 415 }, { "epoch": 0.18476571174772374, "grad_norm": 1.3517930787179577, "learning_rate": 4.617092119866815e-06, "loss": 0.1198, "step": 416 }, { "epoch": 0.18520986009327114, "grad_norm": 0.9651918865616642, "learning_rate": 4.628190899001111e-06, "loss": 0.0794, "step": 417 }, { "epoch": 0.18565400843881857, "grad_norm": 1.3087574560794024, "learning_rate": 4.639289678135406e-06, "loss": 0.1166, "step": 418 }, { "epoch": 0.18609815678436598, "grad_norm": 1.1029726712835008, "learning_rate": 4.650388457269701e-06, "loss": 0.106, "step": 419 }, { "epoch": 0.18654230512991338, "grad_norm": 1.481760865085893, "learning_rate": 4.661487236403996e-06, "loss": 0.1225, "step": 420 }, { "epoch": 0.1869864534754608, "grad_norm": 1.125656650735202, "learning_rate": 4.672586015538291e-06, "loss": 0.0894, "step": 421 }, { "epoch": 0.1874306018210082, "grad_norm": 1.1643460761006563, "learning_rate": 4.683684794672587e-06, "loss": 0.1049, "step": 422 }, { "epoch": 0.18787475016655564, "grad_norm": 1.2081839286402132, "learning_rate": 4.694783573806882e-06, "loss": 0.0984, "step": 423 }, { "epoch": 0.18831889851210304, "grad_norm": 1.0952768068994485, "learning_rate": 4.705882352941177e-06, "loss": 0.1083, "step": 424 }, { "epoch": 0.18876304685765047, "grad_norm": 1.993217784490899, "learning_rate": 4.716981132075472e-06, "loss": 0.1198, "step": 425 }, { "epoch": 0.18920719520319787, "grad_norm": 1.4709865866324339, "learning_rate": 4.728079911209768e-06, "loss": 0.1004, "step": 426 }, { "epoch": 0.18965134354874527, "grad_norm": 1.1800698065694786, "learning_rate": 4.739178690344062e-06, "loss": 0.0942, "step": 427 }, { "epoch": 0.1900954918942927, "grad_norm": 0.9004404155709735, "learning_rate": 4.750277469478358e-06, "loss": 0.0879, "step": 428 }, { "epoch": 0.1905396402398401, "grad_norm": 1.3988615855599944, "learning_rate": 4.761376248612653e-06, "loss": 0.1313, "step": 429 }, { "epoch": 0.19098378858538753, "grad_norm": 1.2097973380846636, "learning_rate": 4.772475027746948e-06, "loss": 0.1192, "step": 430 }, { "epoch": 0.19142793693093493, "grad_norm": 1.795772492183597, "learning_rate": 4.783573806881244e-06, "loss": 0.1319, "step": 431 }, { "epoch": 0.19187208527648233, "grad_norm": 1.5438576982175642, "learning_rate": 4.794672586015538e-06, "loss": 0.156, "step": 432 }, { "epoch": 0.19231623362202976, "grad_norm": 1.4462148356085274, "learning_rate": 4.805771365149834e-06, "loss": 0.0977, "step": 433 }, { "epoch": 0.19276038196757717, "grad_norm": 1.1013117212825567, "learning_rate": 4.816870144284129e-06, "loss": 0.123, "step": 434 }, { "epoch": 0.1932045303131246, "grad_norm": 1.473729194266236, "learning_rate": 4.827968923418424e-06, "loss": 0.1219, "step": 435 }, { "epoch": 0.193648678658672, "grad_norm": 1.060585950094457, "learning_rate": 4.839067702552719e-06, "loss": 0.0867, "step": 436 }, { "epoch": 0.1940928270042194, "grad_norm": 1.1939948263664504, "learning_rate": 4.8501664816870145e-06, "loss": 0.1221, "step": 437 }, { "epoch": 0.19453697534976683, "grad_norm": 1.0411525410331977, "learning_rate": 4.86126526082131e-06, "loss": 0.0955, "step": 438 }, { "epoch": 0.19498112369531423, "grad_norm": 1.3074349171537876, "learning_rate": 4.8723640399556054e-06, "loss": 0.1092, "step": 439 }, { "epoch": 0.19542527204086166, "grad_norm": 2.556790721991839, "learning_rate": 4.8834628190899005e-06, "loss": 0.1144, "step": 440 }, { "epoch": 0.19586942038640906, "grad_norm": 1.5219706664834065, "learning_rate": 4.894561598224196e-06, "loss": 0.1353, "step": 441 }, { "epoch": 0.19631356873195646, "grad_norm": 1.5771009476283926, "learning_rate": 4.905660377358491e-06, "loss": 0.1139, "step": 442 }, { "epoch": 0.1967577170775039, "grad_norm": 1.7247841251255984, "learning_rate": 4.9167591564927866e-06, "loss": 0.0969, "step": 443 }, { "epoch": 0.1972018654230513, "grad_norm": 1.118860834302166, "learning_rate": 4.927857935627082e-06, "loss": 0.0955, "step": 444 }, { "epoch": 0.19764601376859872, "grad_norm": 1.4715029598755733, "learning_rate": 4.938956714761377e-06, "loss": 0.1001, "step": 445 }, { "epoch": 0.19809016211414612, "grad_norm": 1.5680153762667055, "learning_rate": 4.950055493895672e-06, "loss": 0.1222, "step": 446 }, { "epoch": 0.19853431045969352, "grad_norm": 1.7877852329091748, "learning_rate": 4.961154273029967e-06, "loss": 0.1057, "step": 447 }, { "epoch": 0.19897845880524095, "grad_norm": 1.1650327458270198, "learning_rate": 4.972253052164263e-06, "loss": 0.1242, "step": 448 }, { "epoch": 0.19942260715078836, "grad_norm": 1.0753006649574548, "learning_rate": 4.983351831298557e-06, "loss": 0.0912, "step": 449 }, { "epoch": 0.19986675549633579, "grad_norm": 1.546863382694724, "learning_rate": 4.994450610432853e-06, "loss": 0.123, "step": 450 }, { "epoch": 0.2003109038418832, "grad_norm": 1.3219021070497294, "learning_rate": 5.005549389567148e-06, "loss": 0.1167, "step": 451 }, { "epoch": 0.2007550521874306, "grad_norm": 1.3154364501774296, "learning_rate": 5.016648168701444e-06, "loss": 0.1023, "step": 452 }, { "epoch": 0.20119920053297802, "grad_norm": 1.2732051452073168, "learning_rate": 5.027746947835739e-06, "loss": 0.0717, "step": 453 }, { "epoch": 0.20164334887852542, "grad_norm": 1.3206735580145712, "learning_rate": 5.038845726970034e-06, "loss": 0.1095, "step": 454 }, { "epoch": 0.20208749722407285, "grad_norm": 1.0332614998616947, "learning_rate": 5.049944506104328e-06, "loss": 0.0845, "step": 455 }, { "epoch": 0.20253164556962025, "grad_norm": 1.211744462644113, "learning_rate": 5.061043285238624e-06, "loss": 0.1066, "step": 456 }, { "epoch": 0.20297579391516768, "grad_norm": 1.3376009797970827, "learning_rate": 5.072142064372919e-06, "loss": 0.0905, "step": 457 }, { "epoch": 0.20341994226071508, "grad_norm": 1.2905140024192596, "learning_rate": 5.083240843507214e-06, "loss": 0.1011, "step": 458 }, { "epoch": 0.20386409060626248, "grad_norm": 1.1671668945164224, "learning_rate": 5.09433962264151e-06, "loss": 0.1027, "step": 459 }, { "epoch": 0.2043082389518099, "grad_norm": 2.1761489486755363, "learning_rate": 5.105438401775805e-06, "loss": 0.1093, "step": 460 }, { "epoch": 0.2047523872973573, "grad_norm": 1.5496188187186781, "learning_rate": 5.1165371809101e-06, "loss": 0.1241, "step": 461 }, { "epoch": 0.20519653564290474, "grad_norm": 1.555761458410687, "learning_rate": 5.127635960044396e-06, "loss": 0.1119, "step": 462 }, { "epoch": 0.20564068398845214, "grad_norm": 1.4018925493874805, "learning_rate": 5.138734739178691e-06, "loss": 0.149, "step": 463 }, { "epoch": 0.20608483233399955, "grad_norm": 1.2908877898264874, "learning_rate": 5.1498335183129855e-06, "loss": 0.1255, "step": 464 }, { "epoch": 0.20652898067954698, "grad_norm": 1.2526168814511602, "learning_rate": 5.1609322974472806e-06, "loss": 0.1145, "step": 465 }, { "epoch": 0.20697312902509438, "grad_norm": 1.1641091501138674, "learning_rate": 5.1720310765815765e-06, "loss": 0.1136, "step": 466 }, { "epoch": 0.2074172773706418, "grad_norm": 1.1082496575030598, "learning_rate": 5.1831298557158716e-06, "loss": 0.0897, "step": 467 }, { "epoch": 0.2078614257161892, "grad_norm": 1.174903547638247, "learning_rate": 5.194228634850167e-06, "loss": 0.1121, "step": 468 }, { "epoch": 0.2083055740617366, "grad_norm": 1.0600460968712155, "learning_rate": 5.2053274139844625e-06, "loss": 0.0819, "step": 469 }, { "epoch": 0.20874972240728404, "grad_norm": 1.3263800742507028, "learning_rate": 5.216426193118758e-06, "loss": 0.084, "step": 470 }, { "epoch": 0.20919387075283144, "grad_norm": 1.604108989764318, "learning_rate": 5.227524972253053e-06, "loss": 0.1337, "step": 471 }, { "epoch": 0.20963801909837887, "grad_norm": 0.9363413921696734, "learning_rate": 5.238623751387349e-06, "loss": 0.0863, "step": 472 }, { "epoch": 0.21008216744392627, "grad_norm": 2.0569016097307578, "learning_rate": 5.249722530521643e-06, "loss": 0.1218, "step": 473 }, { "epoch": 0.21052631578947367, "grad_norm": 2.0907599336843674, "learning_rate": 5.260821309655938e-06, "loss": 0.1032, "step": 474 }, { "epoch": 0.2109704641350211, "grad_norm": 1.03795187986254, "learning_rate": 5.271920088790233e-06, "loss": 0.1002, "step": 475 }, { "epoch": 0.2114146124805685, "grad_norm": 1.6168543989206048, "learning_rate": 5.283018867924529e-06, "loss": 0.1292, "step": 476 }, { "epoch": 0.21185876082611593, "grad_norm": 1.555835178594497, "learning_rate": 5.294117647058824e-06, "loss": 0.0956, "step": 477 }, { "epoch": 0.21230290917166333, "grad_norm": 1.4750911257012305, "learning_rate": 5.305216426193119e-06, "loss": 0.1025, "step": 478 }, { "epoch": 0.21274705751721074, "grad_norm": 1.1903302332979158, "learning_rate": 5.316315205327415e-06, "loss": 0.0927, "step": 479 }, { "epoch": 0.21319120586275817, "grad_norm": 1.799907699381259, "learning_rate": 5.32741398446171e-06, "loss": 0.1554, "step": 480 }, { "epoch": 0.21363535420830557, "grad_norm": 1.1084539632711692, "learning_rate": 5.338512763596004e-06, "loss": 0.0963, "step": 481 }, { "epoch": 0.214079502553853, "grad_norm": 1.0531894918937277, "learning_rate": 5.3496115427303e-06, "loss": 0.0808, "step": 482 }, { "epoch": 0.2145236508994004, "grad_norm": 1.4912911807270512, "learning_rate": 5.360710321864595e-06, "loss": 0.1053, "step": 483 }, { "epoch": 0.2149677992449478, "grad_norm": 1.1603558318841067, "learning_rate": 5.37180910099889e-06, "loss": 0.0859, "step": 484 }, { "epoch": 0.21541194759049523, "grad_norm": 1.2647862898402058, "learning_rate": 5.382907880133186e-06, "loss": 0.1165, "step": 485 }, { "epoch": 0.21585609593604263, "grad_norm": 1.3294179512793625, "learning_rate": 5.394006659267481e-06, "loss": 0.1006, "step": 486 }, { "epoch": 0.21630024428159006, "grad_norm": 1.288870748090468, "learning_rate": 5.405105438401776e-06, "loss": 0.1174, "step": 487 }, { "epoch": 0.21674439262713746, "grad_norm": 1.6526073205290404, "learning_rate": 5.416204217536072e-06, "loss": 0.1278, "step": 488 }, { "epoch": 0.2171885409726849, "grad_norm": 1.1625080059645232, "learning_rate": 5.427302996670367e-06, "loss": 0.0962, "step": 489 }, { "epoch": 0.2176326893182323, "grad_norm": 1.0239562074655422, "learning_rate": 5.4384017758046615e-06, "loss": 0.1137, "step": 490 }, { "epoch": 0.2180768376637797, "grad_norm": 1.1775834154919058, "learning_rate": 5.4495005549389565e-06, "loss": 0.1167, "step": 491 }, { "epoch": 0.21852098600932712, "grad_norm": 1.0527464833567977, "learning_rate": 5.4605993340732525e-06, "loss": 0.1035, "step": 492 }, { "epoch": 0.21896513435487452, "grad_norm": 1.0639212306477968, "learning_rate": 5.4716981132075475e-06, "loss": 0.0798, "step": 493 }, { "epoch": 0.21940928270042195, "grad_norm": 1.4130768874633084, "learning_rate": 5.482796892341843e-06, "loss": 0.1191, "step": 494 }, { "epoch": 0.21985343104596936, "grad_norm": 1.1493987018132557, "learning_rate": 5.4938956714761385e-06, "loss": 0.0933, "step": 495 }, { "epoch": 0.22029757939151676, "grad_norm": 1.5099978612143894, "learning_rate": 5.5049944506104336e-06, "loss": 0.1312, "step": 496 }, { "epoch": 0.2207417277370642, "grad_norm": 1.7153041600616725, "learning_rate": 5.516093229744729e-06, "loss": 0.1631, "step": 497 }, { "epoch": 0.2211858760826116, "grad_norm": 0.9681698333927948, "learning_rate": 5.5271920088790245e-06, "loss": 0.0811, "step": 498 }, { "epoch": 0.22163002442815902, "grad_norm": 1.2720779336432644, "learning_rate": 5.538290788013319e-06, "loss": 0.1305, "step": 499 }, { "epoch": 0.22207417277370642, "grad_norm": 1.2223556006920575, "learning_rate": 5.549389567147614e-06, "loss": 0.0942, "step": 500 }, { "epoch": 0.22251832111925382, "grad_norm": 1.0558717161626225, "learning_rate": 5.560488346281909e-06, "loss": 0.1077, "step": 501 }, { "epoch": 0.22296246946480125, "grad_norm": 1.1170017223897397, "learning_rate": 5.571587125416205e-06, "loss": 0.1309, "step": 502 }, { "epoch": 0.22340661781034865, "grad_norm": 0.9964488397086242, "learning_rate": 5.5826859045505e-06, "loss": 0.0834, "step": 503 }, { "epoch": 0.22385076615589608, "grad_norm": 1.2815734363162072, "learning_rate": 5.593784683684795e-06, "loss": 0.0901, "step": 504 }, { "epoch": 0.22429491450144348, "grad_norm": 1.0732922534576572, "learning_rate": 5.604883462819091e-06, "loss": 0.0918, "step": 505 }, { "epoch": 0.22473906284699088, "grad_norm": 1.0346342912485391, "learning_rate": 5.615982241953386e-06, "loss": 0.0983, "step": 506 }, { "epoch": 0.2251832111925383, "grad_norm": 1.2353611959723767, "learning_rate": 5.62708102108768e-06, "loss": 0.1066, "step": 507 }, { "epoch": 0.22562735953808571, "grad_norm": 1.5571341771119367, "learning_rate": 5.638179800221976e-06, "loss": 0.1197, "step": 508 }, { "epoch": 0.22607150788363314, "grad_norm": 1.8593279138829277, "learning_rate": 5.649278579356271e-06, "loss": 0.1539, "step": 509 }, { "epoch": 0.22651565622918055, "grad_norm": 1.3210670897027246, "learning_rate": 5.660377358490566e-06, "loss": 0.0972, "step": 510 }, { "epoch": 0.22695980457472795, "grad_norm": 1.555152024387198, "learning_rate": 5.671476137624862e-06, "loss": 0.1017, "step": 511 }, { "epoch": 0.22740395292027538, "grad_norm": 1.1784666679583122, "learning_rate": 5.682574916759157e-06, "loss": 0.0896, "step": 512 }, { "epoch": 0.22784810126582278, "grad_norm": 0.897552288214822, "learning_rate": 5.693673695893452e-06, "loss": 0.0748, "step": 513 }, { "epoch": 0.2282922496113702, "grad_norm": 0.9984908709528371, "learning_rate": 5.704772475027748e-06, "loss": 0.0962, "step": 514 }, { "epoch": 0.2287363979569176, "grad_norm": 1.1522533946932125, "learning_rate": 5.715871254162043e-06, "loss": 0.1175, "step": 515 }, { "epoch": 0.229180546302465, "grad_norm": 1.0543599181977559, "learning_rate": 5.7269700332963374e-06, "loss": 0.1386, "step": 516 }, { "epoch": 0.22962469464801244, "grad_norm": 0.8788825627877698, "learning_rate": 5.7380688124306325e-06, "loss": 0.0947, "step": 517 }, { "epoch": 0.23006884299355984, "grad_norm": 0.8916267274537737, "learning_rate": 5.7491675915649284e-06, "loss": 0.0875, "step": 518 }, { "epoch": 0.23051299133910727, "grad_norm": 1.3741271628041778, "learning_rate": 5.7602663706992235e-06, "loss": 0.1431, "step": 519 }, { "epoch": 0.23095713968465467, "grad_norm": 1.158157014144474, "learning_rate": 5.7713651498335186e-06, "loss": 0.1079, "step": 520 }, { "epoch": 0.2314012880302021, "grad_norm": 0.9669483042814996, "learning_rate": 5.7824639289678145e-06, "loss": 0.0756, "step": 521 }, { "epoch": 0.2318454363757495, "grad_norm": 0.9858439751900236, "learning_rate": 5.7935627081021095e-06, "loss": 0.1091, "step": 522 }, { "epoch": 0.2322895847212969, "grad_norm": 1.0356934556615558, "learning_rate": 5.804661487236405e-06, "loss": 0.0899, "step": 523 }, { "epoch": 0.23273373306684433, "grad_norm": 3.900766522495628, "learning_rate": 5.8157602663707005e-06, "loss": 0.0926, "step": 524 }, { "epoch": 0.23317788141239174, "grad_norm": 1.1976168770257003, "learning_rate": 5.826859045504995e-06, "loss": 0.0963, "step": 525 }, { "epoch": 0.23362202975793916, "grad_norm": 1.2441372325560522, "learning_rate": 5.83795782463929e-06, "loss": 0.1105, "step": 526 }, { "epoch": 0.23406617810348657, "grad_norm": 1.379627337877928, "learning_rate": 5.849056603773585e-06, "loss": 0.0851, "step": 527 }, { "epoch": 0.23451032644903397, "grad_norm": 1.0553207077914868, "learning_rate": 5.860155382907881e-06, "loss": 0.0876, "step": 528 }, { "epoch": 0.2349544747945814, "grad_norm": 1.0117503811303656, "learning_rate": 5.871254162042176e-06, "loss": 0.0925, "step": 529 }, { "epoch": 0.2353986231401288, "grad_norm": 1.2155239899129966, "learning_rate": 5.882352941176471e-06, "loss": 0.1177, "step": 530 }, { "epoch": 0.23584277148567623, "grad_norm": 1.0134886033118073, "learning_rate": 5.893451720310767e-06, "loss": 0.0868, "step": 531 }, { "epoch": 0.23628691983122363, "grad_norm": 1.2415034220382457, "learning_rate": 5.904550499445062e-06, "loss": 0.1016, "step": 532 }, { "epoch": 0.23673106817677103, "grad_norm": 0.9266174346376681, "learning_rate": 5.915649278579356e-06, "loss": 0.1001, "step": 533 }, { "epoch": 0.23717521652231846, "grad_norm": 1.3120341529200943, "learning_rate": 5.926748057713651e-06, "loss": 0.1248, "step": 534 }, { "epoch": 0.23761936486786586, "grad_norm": 1.426598559638218, "learning_rate": 5.937846836847947e-06, "loss": 0.0932, "step": 535 }, { "epoch": 0.2380635132134133, "grad_norm": 1.0182331505869047, "learning_rate": 5.948945615982242e-06, "loss": 0.1009, "step": 536 }, { "epoch": 0.2385076615589607, "grad_norm": 1.4554113324351339, "learning_rate": 5.960044395116537e-06, "loss": 0.0998, "step": 537 }, { "epoch": 0.2389518099045081, "grad_norm": 1.6220655384523863, "learning_rate": 5.971143174250833e-06, "loss": 0.105, "step": 538 }, { "epoch": 0.23939595825005552, "grad_norm": 1.9400783770338148, "learning_rate": 5.982241953385128e-06, "loss": 0.1524, "step": 539 }, { "epoch": 0.23984010659560293, "grad_norm": 1.4635091103881346, "learning_rate": 5.993340732519424e-06, "loss": 0.1233, "step": 540 }, { "epoch": 0.24028425494115035, "grad_norm": 1.2937114895643234, "learning_rate": 6.004439511653719e-06, "loss": 0.1169, "step": 541 }, { "epoch": 0.24072840328669776, "grad_norm": 1.2221129883336008, "learning_rate": 6.015538290788013e-06, "loss": 0.1049, "step": 542 }, { "epoch": 0.24117255163224516, "grad_norm": 1.0735565355706922, "learning_rate": 6.0266370699223085e-06, "loss": 0.0886, "step": 543 }, { "epoch": 0.2416166999777926, "grad_norm": 1.3261214209776406, "learning_rate": 6.037735849056604e-06, "loss": 0.1245, "step": 544 }, { "epoch": 0.24206084832334, "grad_norm": 1.2122447330179282, "learning_rate": 6.0488346281908995e-06, "loss": 0.0824, "step": 545 }, { "epoch": 0.24250499666888742, "grad_norm": 1.1700065718816541, "learning_rate": 6.0599334073251945e-06, "loss": 0.1062, "step": 546 }, { "epoch": 0.24294914501443482, "grad_norm": 1.0126080671178852, "learning_rate": 6.0710321864594904e-06, "loss": 0.0876, "step": 547 }, { "epoch": 0.24339329335998222, "grad_norm": 1.185450234336463, "learning_rate": 6.0821309655937855e-06, "loss": 0.1057, "step": 548 }, { "epoch": 0.24383744170552965, "grad_norm": 0.94702847545811, "learning_rate": 6.0932297447280806e-06, "loss": 0.0704, "step": 549 }, { "epoch": 0.24428159005107705, "grad_norm": 1.1538838071376252, "learning_rate": 6.1043285238623765e-06, "loss": 0.1062, "step": 550 }, { "epoch": 0.24472573839662448, "grad_norm": 1.2787034775849095, "learning_rate": 6.115427302996671e-06, "loss": 0.1053, "step": 551 }, { "epoch": 0.24516988674217188, "grad_norm": 2.111386576675479, "learning_rate": 6.126526082130966e-06, "loss": 0.1423, "step": 552 }, { "epoch": 0.24561403508771928, "grad_norm": 0.8396851122657492, "learning_rate": 6.137624861265261e-06, "loss": 0.0857, "step": 553 }, { "epoch": 0.24605818343326671, "grad_norm": 1.222223712186514, "learning_rate": 6.148723640399557e-06, "loss": 0.1241, "step": 554 }, { "epoch": 0.24650233177881412, "grad_norm": 1.3542921033853719, "learning_rate": 6.159822419533852e-06, "loss": 0.1113, "step": 555 }, { "epoch": 0.24694648012436154, "grad_norm": 0.921783149485065, "learning_rate": 6.170921198668147e-06, "loss": 0.0954, "step": 556 }, { "epoch": 0.24739062846990895, "grad_norm": 1.170580173743532, "learning_rate": 6.182019977802443e-06, "loss": 0.0994, "step": 557 }, { "epoch": 0.24783477681545638, "grad_norm": 1.4092773954441822, "learning_rate": 6.193118756936738e-06, "loss": 0.0931, "step": 558 }, { "epoch": 0.24827892516100378, "grad_norm": 1.0651408845659296, "learning_rate": 6.204217536071032e-06, "loss": 0.1083, "step": 559 }, { "epoch": 0.24872307350655118, "grad_norm": 1.2143455853200855, "learning_rate": 6.215316315205327e-06, "loss": 0.0952, "step": 560 }, { "epoch": 0.2491672218520986, "grad_norm": 2.64991339168012, "learning_rate": 6.226415094339623e-06, "loss": 0.1145, "step": 561 }, { "epoch": 0.249611370197646, "grad_norm": 0.7270989844733312, "learning_rate": 6.237513873473918e-06, "loss": 0.0861, "step": 562 }, { "epoch": 0.2500555185431934, "grad_norm": 1.1431426285342126, "learning_rate": 6.248612652608213e-06, "loss": 0.1162, "step": 563 }, { "epoch": 0.2504996668887408, "grad_norm": 0.860711327757446, "learning_rate": 6.259711431742509e-06, "loss": 0.0935, "step": 564 }, { "epoch": 0.25094381523428827, "grad_norm": 0.8073003696766194, "learning_rate": 6.270810210876804e-06, "loss": 0.1079, "step": 565 }, { "epoch": 0.25138796357983567, "grad_norm": 1.1101487411387352, "learning_rate": 6.281908990011099e-06, "loss": 0.1174, "step": 566 }, { "epoch": 0.2518321119253831, "grad_norm": 1.0549286990788267, "learning_rate": 6.293007769145395e-06, "loss": 0.1051, "step": 567 }, { "epoch": 0.2522762602709305, "grad_norm": 1.2019614422517741, "learning_rate": 6.304106548279689e-06, "loss": 0.1063, "step": 568 }, { "epoch": 0.2527204086164779, "grad_norm": 0.9611079197173829, "learning_rate": 6.3152053274139845e-06, "loss": 0.0974, "step": 569 }, { "epoch": 0.25316455696202533, "grad_norm": 1.1961495715533894, "learning_rate": 6.32630410654828e-06, "loss": 0.0991, "step": 570 }, { "epoch": 0.25360870530757273, "grad_norm": 1.0576689858819426, "learning_rate": 6.3374028856825754e-06, "loss": 0.0953, "step": 571 }, { "epoch": 0.25405285365312014, "grad_norm": 1.2656186745282128, "learning_rate": 6.3485016648168705e-06, "loss": 0.1083, "step": 572 }, { "epoch": 0.25449700199866754, "grad_norm": 1.1776166049192853, "learning_rate": 6.359600443951166e-06, "loss": 0.1118, "step": 573 }, { "epoch": 0.254941150344215, "grad_norm": 0.8664088781289958, "learning_rate": 6.3706992230854615e-06, "loss": 0.0863, "step": 574 }, { "epoch": 0.2553852986897624, "grad_norm": 0.9925662850080711, "learning_rate": 6.3817980022197565e-06, "loss": 0.1058, "step": 575 }, { "epoch": 0.2558294470353098, "grad_norm": 1.0606625159824246, "learning_rate": 6.3928967813540525e-06, "loss": 0.1075, "step": 576 }, { "epoch": 0.2562735953808572, "grad_norm": 0.9900660508183841, "learning_rate": 6.403995560488347e-06, "loss": 0.0829, "step": 577 }, { "epoch": 0.2567177437264046, "grad_norm": 0.8582749721826495, "learning_rate": 6.415094339622642e-06, "loss": 0.0914, "step": 578 }, { "epoch": 0.25716189207195206, "grad_norm": 1.1617033243365338, "learning_rate": 6.426193118756937e-06, "loss": 0.1116, "step": 579 }, { "epoch": 0.25760604041749946, "grad_norm": 1.0362694278635598, "learning_rate": 6.437291897891233e-06, "loss": 0.1145, "step": 580 }, { "epoch": 0.25805018876304686, "grad_norm": 0.8723418719473582, "learning_rate": 6.448390677025528e-06, "loss": 0.0885, "step": 581 }, { "epoch": 0.25849433710859426, "grad_norm": 1.0870215330646826, "learning_rate": 6.459489456159823e-06, "loss": 0.0894, "step": 582 }, { "epoch": 0.25893848545414166, "grad_norm": 1.2407360085708667, "learning_rate": 6.470588235294119e-06, "loss": 0.0812, "step": 583 }, { "epoch": 0.2593826337996891, "grad_norm": 1.8006078173259896, "learning_rate": 6.481687014428414e-06, "loss": 0.1429, "step": 584 }, { "epoch": 0.2598267821452365, "grad_norm": 1.4748519282887915, "learning_rate": 6.492785793562708e-06, "loss": 0.0867, "step": 585 }, { "epoch": 0.2602709304907839, "grad_norm": 1.0424639232757755, "learning_rate": 6.503884572697003e-06, "loss": 0.1146, "step": 586 }, { "epoch": 0.2607150788363313, "grad_norm": 1.2786124769190668, "learning_rate": 6.514983351831299e-06, "loss": 0.1192, "step": 587 }, { "epoch": 0.26115922718187873, "grad_norm": 0.9288875670879717, "learning_rate": 6.526082130965594e-06, "loss": 0.101, "step": 588 }, { "epoch": 0.2616033755274262, "grad_norm": 0.6884103445991109, "learning_rate": 6.537180910099889e-06, "loss": 0.0743, "step": 589 }, { "epoch": 0.2620475238729736, "grad_norm": 1.0193990545252207, "learning_rate": 6.548279689234185e-06, "loss": 0.0826, "step": 590 }, { "epoch": 0.262491672218521, "grad_norm": 1.2098967104098979, "learning_rate": 6.55937846836848e-06, "loss": 0.116, "step": 591 }, { "epoch": 0.2629358205640684, "grad_norm": 0.86346939462905, "learning_rate": 6.570477247502775e-06, "loss": 0.0833, "step": 592 }, { "epoch": 0.2633799689096158, "grad_norm": 0.8577523664034169, "learning_rate": 6.581576026637071e-06, "loss": 0.0802, "step": 593 }, { "epoch": 0.26382411725516325, "grad_norm": 0.7836334305793592, "learning_rate": 6.592674805771365e-06, "loss": 0.079, "step": 594 }, { "epoch": 0.26426826560071065, "grad_norm": 0.9292792710410797, "learning_rate": 6.60377358490566e-06, "loss": 0.0841, "step": 595 }, { "epoch": 0.26471241394625805, "grad_norm": 0.949730314196452, "learning_rate": 6.614872364039956e-06, "loss": 0.0779, "step": 596 }, { "epoch": 0.26515656229180545, "grad_norm": 1.067206699117055, "learning_rate": 6.625971143174251e-06, "loss": 0.0813, "step": 597 }, { "epoch": 0.26560071063735285, "grad_norm": 1.11737459145968, "learning_rate": 6.6370699223085465e-06, "loss": 0.0855, "step": 598 }, { "epoch": 0.2660448589829003, "grad_norm": 1.3191268336017996, "learning_rate": 6.648168701442842e-06, "loss": 0.117, "step": 599 }, { "epoch": 0.2664890073284477, "grad_norm": 1.0102499350799081, "learning_rate": 6.6592674805771374e-06, "loss": 0.0878, "step": 600 }, { "epoch": 0.2669331556739951, "grad_norm": 0.878524576308141, "learning_rate": 6.6703662597114325e-06, "loss": 0.0886, "step": 601 }, { "epoch": 0.2673773040195425, "grad_norm": 1.0385994736274717, "learning_rate": 6.6814650388457284e-06, "loss": 0.0895, "step": 602 }, { "epoch": 0.2678214523650899, "grad_norm": 2.1746856925927305, "learning_rate": 6.692563817980023e-06, "loss": 0.1218, "step": 603 }, { "epoch": 0.2682656007106374, "grad_norm": 1.1284346260514344, "learning_rate": 6.703662597114318e-06, "loss": 0.0984, "step": 604 }, { "epoch": 0.2687097490561848, "grad_norm": 0.8795466679323811, "learning_rate": 6.714761376248613e-06, "loss": 0.1076, "step": 605 }, { "epoch": 0.2691538974017322, "grad_norm": 0.7889851290279695, "learning_rate": 6.725860155382909e-06, "loss": 0.0809, "step": 606 }, { "epoch": 0.2695980457472796, "grad_norm": 1.1495901760021123, "learning_rate": 6.736958934517204e-06, "loss": 0.1339, "step": 607 }, { "epoch": 0.270042194092827, "grad_norm": 0.9404581532626506, "learning_rate": 6.748057713651499e-06, "loss": 0.0835, "step": 608 }, { "epoch": 0.27048634243837444, "grad_norm": 1.1614393549481714, "learning_rate": 6.759156492785795e-06, "loss": 0.1022, "step": 609 }, { "epoch": 0.27093049078392184, "grad_norm": 0.7805043342922547, "learning_rate": 6.77025527192009e-06, "loss": 0.0872, "step": 610 }, { "epoch": 0.27137463912946924, "grad_norm": 1.1270320700845156, "learning_rate": 6.781354051054384e-06, "loss": 0.106, "step": 611 }, { "epoch": 0.27181878747501664, "grad_norm": 0.8511881872825724, "learning_rate": 6.792452830188679e-06, "loss": 0.077, "step": 612 }, { "epoch": 0.27226293582056404, "grad_norm": 0.8653445546881817, "learning_rate": 6.803551609322975e-06, "loss": 0.0703, "step": 613 }, { "epoch": 0.2727070841661115, "grad_norm": 0.9249325812470924, "learning_rate": 6.81465038845727e-06, "loss": 0.1239, "step": 614 }, { "epoch": 0.2731512325116589, "grad_norm": 1.2035733700107387, "learning_rate": 6.825749167591565e-06, "loss": 0.0803, "step": 615 }, { "epoch": 0.2735953808572063, "grad_norm": 1.058685859950284, "learning_rate": 6.836847946725861e-06, "loss": 0.101, "step": 616 }, { "epoch": 0.2740395292027537, "grad_norm": 0.9291079298055421, "learning_rate": 6.847946725860156e-06, "loss": 0.0907, "step": 617 }, { "epoch": 0.2744836775483011, "grad_norm": 1.1511507883949077, "learning_rate": 6.859045504994451e-06, "loss": 0.1077, "step": 618 }, { "epoch": 0.27492782589384857, "grad_norm": 1.0786240033108858, "learning_rate": 6.870144284128747e-06, "loss": 0.1251, "step": 619 }, { "epoch": 0.27537197423939597, "grad_norm": 0.9285679879223824, "learning_rate": 6.881243063263041e-06, "loss": 0.079, "step": 620 }, { "epoch": 0.27581612258494337, "grad_norm": 0.8117578679372189, "learning_rate": 6.892341842397336e-06, "loss": 0.0772, "step": 621 }, { "epoch": 0.27626027093049077, "grad_norm": 1.0138147773665356, "learning_rate": 6.9034406215316315e-06, "loss": 0.1238, "step": 622 }, { "epoch": 0.27670441927603817, "grad_norm": 0.988212212065645, "learning_rate": 6.914539400665927e-06, "loss": 0.0726, "step": 623 }, { "epoch": 0.27714856762158563, "grad_norm": 1.2358456192045688, "learning_rate": 6.9256381798002224e-06, "loss": 0.1125, "step": 624 }, { "epoch": 0.27759271596713303, "grad_norm": 1.2887749392067072, "learning_rate": 6.9367369589345175e-06, "loss": 0.1003, "step": 625 }, { "epoch": 0.27803686431268043, "grad_norm": 0.9864604958886025, "learning_rate": 6.947835738068813e-06, "loss": 0.0831, "step": 626 }, { "epoch": 0.27848101265822783, "grad_norm": 1.088865911862852, "learning_rate": 6.9589345172031085e-06, "loss": 0.101, "step": 627 }, { "epoch": 0.27892516100377523, "grad_norm": 1.409906763744686, "learning_rate": 6.970033296337404e-06, "loss": 0.104, "step": 628 }, { "epoch": 0.2793693093493227, "grad_norm": 1.2302985416041377, "learning_rate": 6.981132075471699e-06, "loss": 0.0969, "step": 629 }, { "epoch": 0.2798134576948701, "grad_norm": 1.2642567912432223, "learning_rate": 6.992230854605994e-06, "loss": 0.1116, "step": 630 }, { "epoch": 0.2802576060404175, "grad_norm": 1.2962548485310728, "learning_rate": 7.003329633740289e-06, "loss": 0.1144, "step": 631 }, { "epoch": 0.2807017543859649, "grad_norm": 0.9021212064599634, "learning_rate": 7.014428412874585e-06, "loss": 0.0794, "step": 632 }, { "epoch": 0.2811459027315123, "grad_norm": 0.9822662924030562, "learning_rate": 7.02552719200888e-06, "loss": 0.081, "step": 633 }, { "epoch": 0.28159005107705976, "grad_norm": 0.8568051773621045, "learning_rate": 7.036625971143175e-06, "loss": 0.0838, "step": 634 }, { "epoch": 0.28203419942260716, "grad_norm": 0.9134125872525329, "learning_rate": 7.047724750277471e-06, "loss": 0.0895, "step": 635 }, { "epoch": 0.28247834776815456, "grad_norm": 1.0203574319172648, "learning_rate": 7.058823529411766e-06, "loss": 0.0872, "step": 636 }, { "epoch": 0.28292249611370196, "grad_norm": 0.8320232571139462, "learning_rate": 7.06992230854606e-06, "loss": 0.0778, "step": 637 }, { "epoch": 0.28336664445924936, "grad_norm": 1.1019251570363913, "learning_rate": 7.081021087680355e-06, "loss": 0.101, "step": 638 }, { "epoch": 0.2838107928047968, "grad_norm": 0.9010932622870097, "learning_rate": 7.092119866814651e-06, "loss": 0.0852, "step": 639 }, { "epoch": 0.2842549411503442, "grad_norm": 1.0180000788163226, "learning_rate": 7.103218645948946e-06, "loss": 0.0955, "step": 640 }, { "epoch": 0.2846990894958916, "grad_norm": 0.9251835556617967, "learning_rate": 7.114317425083241e-06, "loss": 0.0871, "step": 641 }, { "epoch": 0.285143237841439, "grad_norm": 0.8901698566560228, "learning_rate": 7.125416204217537e-06, "loss": 0.0706, "step": 642 }, { "epoch": 0.2855873861869865, "grad_norm": 1.0154369539307455, "learning_rate": 7.136514983351832e-06, "loss": 0.0887, "step": 643 }, { "epoch": 0.2860315345325339, "grad_norm": 1.321020159204264, "learning_rate": 7.147613762486127e-06, "loss": 0.0865, "step": 644 }, { "epoch": 0.2864756828780813, "grad_norm": 2.535826739420158, "learning_rate": 7.158712541620423e-06, "loss": 0.1096, "step": 645 }, { "epoch": 0.2869198312236287, "grad_norm": 1.0227251731125295, "learning_rate": 7.169811320754717e-06, "loss": 0.0977, "step": 646 }, { "epoch": 0.2873639795691761, "grad_norm": 0.8089367869895305, "learning_rate": 7.180910099889012e-06, "loss": 0.0792, "step": 647 }, { "epoch": 0.28780812791472354, "grad_norm": 0.9374423150392824, "learning_rate": 7.1920088790233074e-06, "loss": 0.1055, "step": 648 }, { "epoch": 0.28825227626027095, "grad_norm": 1.07643087941822, "learning_rate": 7.203107658157603e-06, "loss": 0.0957, "step": 649 }, { "epoch": 0.28869642460581835, "grad_norm": 0.9052355958500774, "learning_rate": 7.214206437291898e-06, "loss": 0.097, "step": 650 }, { "epoch": 0.28914057295136575, "grad_norm": 1.1103580563722126, "learning_rate": 7.2253052164261935e-06, "loss": 0.1111, "step": 651 }, { "epoch": 0.28958472129691315, "grad_norm": 1.7388144567675092, "learning_rate": 7.236403995560489e-06, "loss": 0.0939, "step": 652 }, { "epoch": 0.2900288696424606, "grad_norm": 1.078770189866491, "learning_rate": 7.2475027746947845e-06, "loss": 0.0845, "step": 653 }, { "epoch": 0.290473017988008, "grad_norm": 1.2137588838688103, "learning_rate": 7.2586015538290795e-06, "loss": 0.0868, "step": 654 }, { "epoch": 0.2909171663335554, "grad_norm": 1.3727496776393755, "learning_rate": 7.269700332963375e-06, "loss": 0.0922, "step": 655 }, { "epoch": 0.2913613146791028, "grad_norm": 1.2519592442230572, "learning_rate": 7.28079911209767e-06, "loss": 0.126, "step": 656 }, { "epoch": 0.2918054630246502, "grad_norm": 0.8673663222128764, "learning_rate": 7.291897891231965e-06, "loss": 0.0778, "step": 657 }, { "epoch": 0.29224961137019767, "grad_norm": 1.1245127772074948, "learning_rate": 7.302996670366261e-06, "loss": 0.094, "step": 658 }, { "epoch": 0.29269375971574507, "grad_norm": 1.156956668230168, "learning_rate": 7.314095449500556e-06, "loss": 0.108, "step": 659 }, { "epoch": 0.2931379080612925, "grad_norm": 0.8322393629574484, "learning_rate": 7.325194228634851e-06, "loss": 0.0933, "step": 660 }, { "epoch": 0.2935820564068399, "grad_norm": 1.1303584309903236, "learning_rate": 7.336293007769147e-06, "loss": 0.1144, "step": 661 }, { "epoch": 0.2940262047523873, "grad_norm": 1.1516682415849495, "learning_rate": 7.347391786903442e-06, "loss": 0.1161, "step": 662 }, { "epoch": 0.29447035309793473, "grad_norm": 0.6573879684754841, "learning_rate": 7.358490566037736e-06, "loss": 0.0743, "step": 663 }, { "epoch": 0.29491450144348214, "grad_norm": 0.7675390269253073, "learning_rate": 7.369589345172031e-06, "loss": 0.0733, "step": 664 }, { "epoch": 0.29535864978902954, "grad_norm": 0.9852104843013336, "learning_rate": 7.380688124306327e-06, "loss": 0.1057, "step": 665 }, { "epoch": 0.29580279813457694, "grad_norm": 1.3140910564234118, "learning_rate": 7.391786903440622e-06, "loss": 0.1013, "step": 666 }, { "epoch": 0.29624694648012434, "grad_norm": 1.1588394898060206, "learning_rate": 7.402885682574917e-06, "loss": 0.1089, "step": 667 }, { "epoch": 0.2966910948256718, "grad_norm": 1.1857494935699013, "learning_rate": 7.413984461709213e-06, "loss": 0.0865, "step": 668 }, { "epoch": 0.2971352431712192, "grad_norm": 0.861410639016016, "learning_rate": 7.425083240843508e-06, "loss": 0.0771, "step": 669 }, { "epoch": 0.2975793915167666, "grad_norm": 1.293845000506648, "learning_rate": 7.436182019977803e-06, "loss": 0.1212, "step": 670 }, { "epoch": 0.298023539862314, "grad_norm": 1.0407435776743181, "learning_rate": 7.447280799112099e-06, "loss": 0.0948, "step": 671 }, { "epoch": 0.2984676882078614, "grad_norm": 0.9347120808526156, "learning_rate": 7.458379578246393e-06, "loss": 0.0779, "step": 672 }, { "epoch": 0.29891183655340886, "grad_norm": 0.9205867276903889, "learning_rate": 7.469478357380688e-06, "loss": 0.0838, "step": 673 }, { "epoch": 0.29935598489895626, "grad_norm": 1.388402805970494, "learning_rate": 7.480577136514983e-06, "loss": 0.0924, "step": 674 }, { "epoch": 0.29980013324450366, "grad_norm": 1.6172485445367795, "learning_rate": 7.491675915649279e-06, "loss": 0.1132, "step": 675 }, { "epoch": 0.30024428159005107, "grad_norm": 0.9561370366379224, "learning_rate": 7.502774694783574e-06, "loss": 0.1032, "step": 676 }, { "epoch": 0.30068842993559847, "grad_norm": 1.0171256621828135, "learning_rate": 7.5138734739178694e-06, "loss": 0.0763, "step": 677 }, { "epoch": 0.3011325782811459, "grad_norm": 1.0399008828044272, "learning_rate": 7.524972253052165e-06, "loss": 0.0763, "step": 678 }, { "epoch": 0.3015767266266933, "grad_norm": 1.433840414738971, "learning_rate": 7.5360710321864604e-06, "loss": 0.1028, "step": 679 }, { "epoch": 0.3020208749722407, "grad_norm": 1.0023463068044798, "learning_rate": 7.5471698113207555e-06, "loss": 0.0899, "step": 680 }, { "epoch": 0.30246502331778813, "grad_norm": 0.8983226693149738, "learning_rate": 7.55826859045505e-06, "loss": 0.0906, "step": 681 }, { "epoch": 0.30290917166333553, "grad_norm": 1.0458814923796536, "learning_rate": 7.569367369589346e-06, "loss": 0.087, "step": 682 }, { "epoch": 0.303353320008883, "grad_norm": 1.317009232453199, "learning_rate": 7.580466148723641e-06, "loss": 0.1185, "step": 683 }, { "epoch": 0.3037974683544304, "grad_norm": 0.9515944992821098, "learning_rate": 7.591564927857937e-06, "loss": 0.0784, "step": 684 }, { "epoch": 0.3042416166999778, "grad_norm": 0.8794173564696354, "learning_rate": 7.602663706992232e-06, "loss": 0.0689, "step": 685 }, { "epoch": 0.3046857650455252, "grad_norm": 1.3164826313969946, "learning_rate": 7.613762486126527e-06, "loss": 0.109, "step": 686 }, { "epoch": 0.3051299133910726, "grad_norm": 1.208021624972249, "learning_rate": 7.624861265260823e-06, "loss": 0.1006, "step": 687 }, { "epoch": 0.30557406173662005, "grad_norm": 0.9049457634328961, "learning_rate": 7.635960044395118e-06, "loss": 0.0898, "step": 688 }, { "epoch": 0.30601821008216745, "grad_norm": 1.715228836512271, "learning_rate": 7.647058823529411e-06, "loss": 0.0954, "step": 689 }, { "epoch": 0.30646235842771485, "grad_norm": 0.8243753589934399, "learning_rate": 7.658157602663708e-06, "loss": 0.0775, "step": 690 }, { "epoch": 0.30690650677326226, "grad_norm": 1.322320542517841, "learning_rate": 7.669256381798003e-06, "loss": 0.1016, "step": 691 }, { "epoch": 0.30735065511880966, "grad_norm": 0.9618609396789024, "learning_rate": 7.680355160932298e-06, "loss": 0.0973, "step": 692 }, { "epoch": 0.3077948034643571, "grad_norm": 0.9164782000859841, "learning_rate": 7.691453940066593e-06, "loss": 0.0625, "step": 693 }, { "epoch": 0.3082389518099045, "grad_norm": 0.9162081625393108, "learning_rate": 7.702552719200888e-06, "loss": 0.089, "step": 694 }, { "epoch": 0.3086831001554519, "grad_norm": 1.0274536861399304, "learning_rate": 7.713651498335183e-06, "loss": 0.0938, "step": 695 }, { "epoch": 0.3091272485009993, "grad_norm": 1.2217184648988348, "learning_rate": 7.72475027746948e-06, "loss": 0.0944, "step": 696 }, { "epoch": 0.3095713968465467, "grad_norm": 1.2610843560867633, "learning_rate": 7.735849056603775e-06, "loss": 0.1108, "step": 697 }, { "epoch": 0.3100155451920942, "grad_norm": 0.9157424175535596, "learning_rate": 7.746947835738068e-06, "loss": 0.0955, "step": 698 }, { "epoch": 0.3104596935376416, "grad_norm": 0.8026621451546831, "learning_rate": 7.758046614872365e-06, "loss": 0.0659, "step": 699 }, { "epoch": 0.310903841883189, "grad_norm": 0.8085721007510808, "learning_rate": 7.76914539400666e-06, "loss": 0.0677, "step": 700 }, { "epoch": 0.3113479902287364, "grad_norm": 0.9769745515486552, "learning_rate": 7.780244173140955e-06, "loss": 0.0896, "step": 701 }, { "epoch": 0.3117921385742838, "grad_norm": 1.0969240019260509, "learning_rate": 7.79134295227525e-06, "loss": 0.1038, "step": 702 }, { "epoch": 0.31223628691983124, "grad_norm": 0.6785759710369018, "learning_rate": 7.802441731409545e-06, "loss": 0.0559, "step": 703 }, { "epoch": 0.31268043526537864, "grad_norm": 0.9042685928894448, "learning_rate": 7.81354051054384e-06, "loss": 0.0843, "step": 704 }, { "epoch": 0.31312458361092604, "grad_norm": 0.8411568659072639, "learning_rate": 7.824639289678137e-06, "loss": 0.1247, "step": 705 }, { "epoch": 0.31356873195647345, "grad_norm": 0.9154910517915918, "learning_rate": 7.835738068812432e-06, "loss": 0.0964, "step": 706 }, { "epoch": 0.3140128803020209, "grad_norm": 1.31944775400599, "learning_rate": 7.846836847946726e-06, "loss": 0.0986, "step": 707 }, { "epoch": 0.3144570286475683, "grad_norm": 1.0518670866955138, "learning_rate": 7.85793562708102e-06, "loss": 0.095, "step": 708 }, { "epoch": 0.3149011769931157, "grad_norm": 1.580342102818051, "learning_rate": 7.869034406215318e-06, "loss": 0.0979, "step": 709 }, { "epoch": 0.3153453253386631, "grad_norm": 0.9955202031527373, "learning_rate": 7.880133185349613e-06, "loss": 0.0941, "step": 710 }, { "epoch": 0.3157894736842105, "grad_norm": 0.9719600353134535, "learning_rate": 7.891231964483908e-06, "loss": 0.0929, "step": 711 }, { "epoch": 0.31623362202975797, "grad_norm": 1.1433817316569035, "learning_rate": 7.902330743618203e-06, "loss": 0.0771, "step": 712 }, { "epoch": 0.31667777037530537, "grad_norm": 1.3256882881979881, "learning_rate": 7.913429522752498e-06, "loss": 0.0926, "step": 713 }, { "epoch": 0.31712191872085277, "grad_norm": 1.0742097800389574, "learning_rate": 7.924528301886793e-06, "loss": 0.111, "step": 714 }, { "epoch": 0.31756606706640017, "grad_norm": 0.8425793522673473, "learning_rate": 7.935627081021088e-06, "loss": 0.08, "step": 715 }, { "epoch": 0.31801021541194757, "grad_norm": 1.2662850654524707, "learning_rate": 7.946725860155383e-06, "loss": 0.1285, "step": 716 }, { "epoch": 0.31845436375749503, "grad_norm": 1.0639958089530621, "learning_rate": 7.957824639289678e-06, "loss": 0.0911, "step": 717 }, { "epoch": 0.31889851210304243, "grad_norm": 0.9354143320657179, "learning_rate": 7.968923418423973e-06, "loss": 0.0937, "step": 718 }, { "epoch": 0.31934266044858983, "grad_norm": 0.864109073064804, "learning_rate": 7.98002219755827e-06, "loss": 0.07, "step": 719 }, { "epoch": 0.31978680879413723, "grad_norm": 0.9458708669966496, "learning_rate": 7.991120976692565e-06, "loss": 0.0906, "step": 720 }, { "epoch": 0.32023095713968464, "grad_norm": 0.9008487720151787, "learning_rate": 8.00221975582686e-06, "loss": 0.0707, "step": 721 }, { "epoch": 0.3206751054852321, "grad_norm": 1.395224062553485, "learning_rate": 8.013318534961155e-06, "loss": 0.1097, "step": 722 }, { "epoch": 0.3211192538307795, "grad_norm": 1.1228102547899144, "learning_rate": 8.02441731409545e-06, "loss": 0.103, "step": 723 }, { "epoch": 0.3215634021763269, "grad_norm": 0.8489026661679754, "learning_rate": 8.035516093229745e-06, "loss": 0.0737, "step": 724 }, { "epoch": 0.3220075505218743, "grad_norm": 1.116811038077338, "learning_rate": 8.04661487236404e-06, "loss": 0.0861, "step": 725 }, { "epoch": 0.3224516988674217, "grad_norm": 1.29632952775896, "learning_rate": 8.057713651498335e-06, "loss": 0.1178, "step": 726 }, { "epoch": 0.32289584721296916, "grad_norm": 0.9812002112145898, "learning_rate": 8.06881243063263e-06, "loss": 0.0832, "step": 727 }, { "epoch": 0.32333999555851656, "grad_norm": 1.4524326080678385, "learning_rate": 8.079911209766927e-06, "loss": 0.1157, "step": 728 }, { "epoch": 0.32378414390406396, "grad_norm": 1.004750311195787, "learning_rate": 8.091009988901222e-06, "loss": 0.0817, "step": 729 }, { "epoch": 0.32422829224961136, "grad_norm": 0.9522750516409738, "learning_rate": 8.102108768035517e-06, "loss": 0.0731, "step": 730 }, { "epoch": 0.32467244059515876, "grad_norm": 1.0736358930231333, "learning_rate": 8.113207547169812e-06, "loss": 0.0994, "step": 731 }, { "epoch": 0.3251165889407062, "grad_norm": 1.8191859683022853, "learning_rate": 8.124306326304107e-06, "loss": 0.0937, "step": 732 }, { "epoch": 0.3255607372862536, "grad_norm": 1.5487234761111215, "learning_rate": 8.135405105438403e-06, "loss": 0.094, "step": 733 }, { "epoch": 0.326004885631801, "grad_norm": 1.0058113971822433, "learning_rate": 8.146503884572698e-06, "loss": 0.0887, "step": 734 }, { "epoch": 0.3264490339773484, "grad_norm": 2.7718590597147963, "learning_rate": 8.157602663706993e-06, "loss": 0.1335, "step": 735 }, { "epoch": 0.3268931823228958, "grad_norm": 2.05367085918422, "learning_rate": 8.168701442841288e-06, "loss": 0.1106, "step": 736 }, { "epoch": 0.3273373306684433, "grad_norm": 1.115500020772734, "learning_rate": 8.179800221975583e-06, "loss": 0.0722, "step": 737 }, { "epoch": 0.3277814790139907, "grad_norm": 1.1750795388816762, "learning_rate": 8.19089900110988e-06, "loss": 0.0625, "step": 738 }, { "epoch": 0.3282256273595381, "grad_norm": 0.9103834127535441, "learning_rate": 8.201997780244175e-06, "loss": 0.0796, "step": 739 }, { "epoch": 0.3286697757050855, "grad_norm": 0.9850210452729228, "learning_rate": 8.21309655937847e-06, "loss": 0.0658, "step": 740 }, { "epoch": 0.3291139240506329, "grad_norm": 0.599291679194688, "learning_rate": 8.224195338512763e-06, "loss": 0.0528, "step": 741 }, { "epoch": 0.32955807239618035, "grad_norm": 1.1762184481853932, "learning_rate": 8.23529411764706e-06, "loss": 0.101, "step": 742 }, { "epoch": 0.33000222074172775, "grad_norm": 1.4905112857899239, "learning_rate": 8.246392896781355e-06, "loss": 0.1072, "step": 743 }, { "epoch": 0.33044636908727515, "grad_norm": 0.9022874687789386, "learning_rate": 8.25749167591565e-06, "loss": 0.0948, "step": 744 }, { "epoch": 0.33089051743282255, "grad_norm": 0.7870509929638926, "learning_rate": 8.268590455049945e-06, "loss": 0.0968, "step": 745 }, { "epoch": 0.33133466577836995, "grad_norm": 0.9927750763334663, "learning_rate": 8.27968923418424e-06, "loss": 0.0944, "step": 746 }, { "epoch": 0.3317788141239174, "grad_norm": 0.6875980968846429, "learning_rate": 8.290788013318535e-06, "loss": 0.0636, "step": 747 }, { "epoch": 0.3322229624694648, "grad_norm": 0.8442026433419401, "learning_rate": 8.301886792452832e-06, "loss": 0.0831, "step": 748 }, { "epoch": 0.3326671108150122, "grad_norm": 0.9412453312595589, "learning_rate": 8.312985571587127e-06, "loss": 0.0776, "step": 749 }, { "epoch": 0.3331112591605596, "grad_norm": 0.9364463887563221, "learning_rate": 8.32408435072142e-06, "loss": 0.0887, "step": 750 }, { "epoch": 0.333555407506107, "grad_norm": 0.7227541011442813, "learning_rate": 8.335183129855715e-06, "loss": 0.0857, "step": 751 }, { "epoch": 0.3339995558516545, "grad_norm": 0.9456038972922441, "learning_rate": 8.346281908990012e-06, "loss": 0.0831, "step": 752 }, { "epoch": 0.3344437041972019, "grad_norm": 0.8779308403041339, "learning_rate": 8.357380688124307e-06, "loss": 0.1193, "step": 753 }, { "epoch": 0.3348878525427493, "grad_norm": 0.9670324236179916, "learning_rate": 8.368479467258602e-06, "loss": 0.0701, "step": 754 }, { "epoch": 0.3353320008882967, "grad_norm": 0.661653951539699, "learning_rate": 8.379578246392897e-06, "loss": 0.0601, "step": 755 }, { "epoch": 0.3357761492338441, "grad_norm": 0.7315432781058985, "learning_rate": 8.390677025527192e-06, "loss": 0.0931, "step": 756 }, { "epoch": 0.33622029757939154, "grad_norm": 0.821857058746102, "learning_rate": 8.40177580466149e-06, "loss": 0.0853, "step": 757 }, { "epoch": 0.33666444592493894, "grad_norm": 0.8172880203687719, "learning_rate": 8.412874583795784e-06, "loss": 0.1047, "step": 758 }, { "epoch": 0.33710859427048634, "grad_norm": 1.0390709540622176, "learning_rate": 8.423973362930078e-06, "loss": 0.0954, "step": 759 }, { "epoch": 0.33755274261603374, "grad_norm": 0.854833081193649, "learning_rate": 8.435072142064373e-06, "loss": 0.07, "step": 760 }, { "epoch": 0.33799689096158114, "grad_norm": 1.0343320589862304, "learning_rate": 8.44617092119867e-06, "loss": 0.1111, "step": 761 }, { "epoch": 0.3384410393071286, "grad_norm": 0.9843875498635718, "learning_rate": 8.457269700332965e-06, "loss": 0.0885, "step": 762 }, { "epoch": 0.338885187652676, "grad_norm": 1.301338158343054, "learning_rate": 8.46836847946726e-06, "loss": 0.0978, "step": 763 }, { "epoch": 0.3393293359982234, "grad_norm": 1.8891259339303466, "learning_rate": 8.479467258601555e-06, "loss": 0.1068, "step": 764 }, { "epoch": 0.3397734843437708, "grad_norm": 0.8759783579336403, "learning_rate": 8.49056603773585e-06, "loss": 0.0702, "step": 765 }, { "epoch": 0.3402176326893182, "grad_norm": 1.0288011027453567, "learning_rate": 8.501664816870145e-06, "loss": 0.1032, "step": 766 }, { "epoch": 0.34066178103486566, "grad_norm": 1.0389974353449243, "learning_rate": 8.51276359600444e-06, "loss": 0.0876, "step": 767 }, { "epoch": 0.34110592938041306, "grad_norm": 0.8136721936839324, "learning_rate": 8.523862375138735e-06, "loss": 0.0825, "step": 768 }, { "epoch": 0.34155007772596047, "grad_norm": 0.7788745832938482, "learning_rate": 8.53496115427303e-06, "loss": 0.0841, "step": 769 }, { "epoch": 0.34199422607150787, "grad_norm": 0.7903184210262955, "learning_rate": 8.546059933407325e-06, "loss": 0.0798, "step": 770 }, { "epoch": 0.34243837441705527, "grad_norm": 0.9106664043393771, "learning_rate": 8.557158712541622e-06, "loss": 0.073, "step": 771 }, { "epoch": 0.3428825227626027, "grad_norm": 1.129345042296519, "learning_rate": 8.568257491675917e-06, "loss": 0.1122, "step": 772 }, { "epoch": 0.3433266711081501, "grad_norm": 1.0693131682338626, "learning_rate": 8.579356270810212e-06, "loss": 0.0963, "step": 773 }, { "epoch": 0.34377081945369753, "grad_norm": 0.9775496598799794, "learning_rate": 8.590455049944507e-06, "loss": 0.0917, "step": 774 }, { "epoch": 0.34421496779924493, "grad_norm": 1.734237792325461, "learning_rate": 8.601553829078802e-06, "loss": 0.0964, "step": 775 }, { "epoch": 0.3446591161447924, "grad_norm": 0.8386031594806228, "learning_rate": 8.612652608213097e-06, "loss": 0.0808, "step": 776 }, { "epoch": 0.3451032644903398, "grad_norm": 1.0606813630969132, "learning_rate": 8.623751387347392e-06, "loss": 0.0689, "step": 777 }, { "epoch": 0.3455474128358872, "grad_norm": 0.9348314863875357, "learning_rate": 8.634850166481687e-06, "loss": 0.0732, "step": 778 }, { "epoch": 0.3459915611814346, "grad_norm": 1.3147018531805297, "learning_rate": 8.645948945615982e-06, "loss": 0.1498, "step": 779 }, { "epoch": 0.346435709526982, "grad_norm": 0.6686475142401406, "learning_rate": 8.657047724750277e-06, "loss": 0.0643, "step": 780 }, { "epoch": 0.34687985787252945, "grad_norm": 1.4294668088170326, "learning_rate": 8.668146503884574e-06, "loss": 0.0973, "step": 781 }, { "epoch": 0.34732400621807685, "grad_norm": 1.0334036928446257, "learning_rate": 8.67924528301887e-06, "loss": 0.0907, "step": 782 }, { "epoch": 0.34776815456362425, "grad_norm": 0.787480188698369, "learning_rate": 8.690344062153164e-06, "loss": 0.0965, "step": 783 }, { "epoch": 0.34821230290917166, "grad_norm": 1.1443565433110983, "learning_rate": 8.70144284128746e-06, "loss": 0.1128, "step": 784 }, { "epoch": 0.34865645125471906, "grad_norm": 1.4871900340672362, "learning_rate": 8.712541620421754e-06, "loss": 0.1402, "step": 785 }, { "epoch": 0.3491005996002665, "grad_norm": 1.2517565722671151, "learning_rate": 8.72364039955605e-06, "loss": 0.0937, "step": 786 }, { "epoch": 0.3495447479458139, "grad_norm": 1.2066836215604748, "learning_rate": 8.734739178690345e-06, "loss": 0.1142, "step": 787 }, { "epoch": 0.3499888962913613, "grad_norm": 0.9107920976880283, "learning_rate": 8.74583795782464e-06, "loss": 0.0885, "step": 788 }, { "epoch": 0.3504330446369087, "grad_norm": 0.9724494230585177, "learning_rate": 8.756936736958935e-06, "loss": 0.0957, "step": 789 }, { "epoch": 0.3508771929824561, "grad_norm": 0.8845187547684428, "learning_rate": 8.768035516093231e-06, "loss": 0.0713, "step": 790 }, { "epoch": 0.3513213413280036, "grad_norm": 0.9555294870332015, "learning_rate": 8.779134295227527e-06, "loss": 0.0709, "step": 791 }, { "epoch": 0.351765489673551, "grad_norm": 0.9971954202922753, "learning_rate": 8.790233074361822e-06, "loss": 0.086, "step": 792 }, { "epoch": 0.3522096380190984, "grad_norm": 0.8799552486323357, "learning_rate": 8.801331853496115e-06, "loss": 0.0878, "step": 793 }, { "epoch": 0.3526537863646458, "grad_norm": 1.1586612806863954, "learning_rate": 8.812430632630412e-06, "loss": 0.1017, "step": 794 }, { "epoch": 0.3530979347101932, "grad_norm": 1.0618777970371058, "learning_rate": 8.823529411764707e-06, "loss": 0.0932, "step": 795 }, { "epoch": 0.35354208305574064, "grad_norm": 1.0717541227428644, "learning_rate": 8.834628190899002e-06, "loss": 0.095, "step": 796 }, { "epoch": 0.35398623140128804, "grad_norm": 1.1705694879290995, "learning_rate": 8.845726970033297e-06, "loss": 0.1118, "step": 797 }, { "epoch": 0.35443037974683544, "grad_norm": 1.1829108047712253, "learning_rate": 8.856825749167592e-06, "loss": 0.1028, "step": 798 }, { "epoch": 0.35487452809238285, "grad_norm": 0.7445292910176993, "learning_rate": 8.867924528301887e-06, "loss": 0.0639, "step": 799 }, { "epoch": 0.35531867643793025, "grad_norm": 1.0475116637450934, "learning_rate": 8.879023307436184e-06, "loss": 0.0706, "step": 800 }, { "epoch": 0.3557628247834777, "grad_norm": 1.1028250244486228, "learning_rate": 8.890122086570479e-06, "loss": 0.0819, "step": 801 }, { "epoch": 0.3562069731290251, "grad_norm": 1.0807445455171478, "learning_rate": 8.901220865704772e-06, "loss": 0.0888, "step": 802 }, { "epoch": 0.3566511214745725, "grad_norm": 1.0684985644998497, "learning_rate": 8.912319644839067e-06, "loss": 0.0822, "step": 803 }, { "epoch": 0.3570952698201199, "grad_norm": 1.3191397870753145, "learning_rate": 8.923418423973364e-06, "loss": 0.1026, "step": 804 }, { "epoch": 0.3575394181656673, "grad_norm": 1.0262852726585219, "learning_rate": 8.93451720310766e-06, "loss": 0.0776, "step": 805 }, { "epoch": 0.35798356651121477, "grad_norm": 1.0652219347423726, "learning_rate": 8.945615982241954e-06, "loss": 0.0907, "step": 806 }, { "epoch": 0.35842771485676217, "grad_norm": 1.0116135773749395, "learning_rate": 8.95671476137625e-06, "loss": 0.0959, "step": 807 }, { "epoch": 0.35887186320230957, "grad_norm": 0.7818161469235702, "learning_rate": 8.967813540510544e-06, "loss": 0.0687, "step": 808 }, { "epoch": 0.359316011547857, "grad_norm": 1.2323283633104487, "learning_rate": 8.97891231964484e-06, "loss": 0.1036, "step": 809 }, { "epoch": 0.3597601598934044, "grad_norm": 1.0483052311963612, "learning_rate": 8.990011098779136e-06, "loss": 0.0899, "step": 810 }, { "epoch": 0.36020430823895183, "grad_norm": 0.8215378781764873, "learning_rate": 9.00110987791343e-06, "loss": 0.089, "step": 811 }, { "epoch": 0.36064845658449923, "grad_norm": 1.2325336486761531, "learning_rate": 9.012208657047725e-06, "loss": 0.1338, "step": 812 }, { "epoch": 0.36109260493004663, "grad_norm": 1.0281228699327587, "learning_rate": 9.02330743618202e-06, "loss": 0.0928, "step": 813 }, { "epoch": 0.36153675327559404, "grad_norm": 0.9331667240788679, "learning_rate": 9.034406215316316e-06, "loss": 0.0795, "step": 814 }, { "epoch": 0.36198090162114144, "grad_norm": 0.9152493219085753, "learning_rate": 9.045504994450612e-06, "loss": 0.1015, "step": 815 }, { "epoch": 0.3624250499666889, "grad_norm": 0.8370218994947535, "learning_rate": 9.056603773584907e-06, "loss": 0.0873, "step": 816 }, { "epoch": 0.3628691983122363, "grad_norm": 0.7774818767814435, "learning_rate": 9.067702552719202e-06, "loss": 0.0822, "step": 817 }, { "epoch": 0.3633133466577837, "grad_norm": 1.6384223947850474, "learning_rate": 9.078801331853497e-06, "loss": 0.081, "step": 818 }, { "epoch": 0.3637574950033311, "grad_norm": 0.991313719663541, "learning_rate": 9.089900110987792e-06, "loss": 0.0837, "step": 819 }, { "epoch": 0.3642016433488785, "grad_norm": 0.8596136661125425, "learning_rate": 9.100998890122087e-06, "loss": 0.0918, "step": 820 }, { "epoch": 0.36464579169442596, "grad_norm": 1.0812055701714487, "learning_rate": 9.112097669256382e-06, "loss": 0.0825, "step": 821 }, { "epoch": 0.36508994003997336, "grad_norm": 0.8070207504890635, "learning_rate": 9.123196448390677e-06, "loss": 0.0843, "step": 822 }, { "epoch": 0.36553408838552076, "grad_norm": 1.1679552400138797, "learning_rate": 9.134295227524974e-06, "loss": 0.0914, "step": 823 }, { "epoch": 0.36597823673106816, "grad_norm": 0.9627855966278458, "learning_rate": 9.145394006659269e-06, "loss": 0.08, "step": 824 }, { "epoch": 0.36642238507661556, "grad_norm": 0.8861606234396192, "learning_rate": 9.156492785793564e-06, "loss": 0.0921, "step": 825 }, { "epoch": 0.366866533422163, "grad_norm": 1.0771512576751088, "learning_rate": 9.167591564927859e-06, "loss": 0.0981, "step": 826 }, { "epoch": 0.3673106817677104, "grad_norm": 0.7667307978284066, "learning_rate": 9.178690344062154e-06, "loss": 0.0763, "step": 827 }, { "epoch": 0.3677548301132578, "grad_norm": 0.8789921021859833, "learning_rate": 9.189789123196449e-06, "loss": 0.0969, "step": 828 }, { "epoch": 0.3681989784588052, "grad_norm": 1.100607694945035, "learning_rate": 9.200887902330744e-06, "loss": 0.0945, "step": 829 }, { "epoch": 0.3686431268043526, "grad_norm": 0.8889918245566973, "learning_rate": 9.21198668146504e-06, "loss": 0.1027, "step": 830 }, { "epoch": 0.3690872751499001, "grad_norm": 0.9919838528287025, "learning_rate": 9.223085460599334e-06, "loss": 0.0876, "step": 831 }, { "epoch": 0.3695314234954475, "grad_norm": 0.7322878388650527, "learning_rate": 9.23418423973363e-06, "loss": 0.0827, "step": 832 }, { "epoch": 0.3699755718409949, "grad_norm": 0.8334148632374379, "learning_rate": 9.245283018867926e-06, "loss": 0.0836, "step": 833 }, { "epoch": 0.3704197201865423, "grad_norm": 1.2813561868661674, "learning_rate": 9.256381798002221e-06, "loss": 0.0889, "step": 834 }, { "epoch": 0.3708638685320897, "grad_norm": 1.339839760620212, "learning_rate": 9.267480577136516e-06, "loss": 0.1145, "step": 835 }, { "epoch": 0.37130801687763715, "grad_norm": 0.9154079084300466, "learning_rate": 9.278579356270811e-06, "loss": 0.0816, "step": 836 }, { "epoch": 0.37175216522318455, "grad_norm": 0.6790049839601768, "learning_rate": 9.289678135405106e-06, "loss": 0.052, "step": 837 }, { "epoch": 0.37219631356873195, "grad_norm": 1.2902350399135438, "learning_rate": 9.300776914539401e-06, "loss": 0.1205, "step": 838 }, { "epoch": 0.37264046191427935, "grad_norm": 0.6777630897746993, "learning_rate": 9.311875693673697e-06, "loss": 0.0626, "step": 839 }, { "epoch": 0.37308461025982675, "grad_norm": 0.974153136606222, "learning_rate": 9.322974472807992e-06, "loss": 0.0872, "step": 840 }, { "epoch": 0.3735287586053742, "grad_norm": 0.8802239921866019, "learning_rate": 9.334073251942287e-06, "loss": 0.0705, "step": 841 }, { "epoch": 0.3739729069509216, "grad_norm": 0.8051312485970141, "learning_rate": 9.345172031076582e-06, "loss": 0.0878, "step": 842 }, { "epoch": 0.374417055296469, "grad_norm": 1.160749633212372, "learning_rate": 9.356270810210878e-06, "loss": 0.1112, "step": 843 }, { "epoch": 0.3748612036420164, "grad_norm": 0.8346210460548523, "learning_rate": 9.367369589345174e-06, "loss": 0.1035, "step": 844 }, { "epoch": 0.3753053519875639, "grad_norm": 1.0464653523849117, "learning_rate": 9.378468368479467e-06, "loss": 0.0901, "step": 845 }, { "epoch": 0.3757495003331113, "grad_norm": 0.7515018279210384, "learning_rate": 9.389567147613764e-06, "loss": 0.0618, "step": 846 }, { "epoch": 0.3761936486786587, "grad_norm": 1.1240058131240411, "learning_rate": 9.400665926748059e-06, "loss": 0.0682, "step": 847 }, { "epoch": 0.3766377970242061, "grad_norm": 0.8330871505734428, "learning_rate": 9.411764705882354e-06, "loss": 0.0815, "step": 848 }, { "epoch": 0.3770819453697535, "grad_norm": 0.6597157022162015, "learning_rate": 9.422863485016649e-06, "loss": 0.0705, "step": 849 }, { "epoch": 0.37752609371530094, "grad_norm": 0.870783524840563, "learning_rate": 9.433962264150944e-06, "loss": 0.0756, "step": 850 }, { "epoch": 0.37797024206084834, "grad_norm": 0.7718200428803087, "learning_rate": 9.445061043285239e-06, "loss": 0.0688, "step": 851 }, { "epoch": 0.37841439040639574, "grad_norm": 1.0565515531045717, "learning_rate": 9.456159822419536e-06, "loss": 0.096, "step": 852 }, { "epoch": 0.37885853875194314, "grad_norm": 0.9116267321999116, "learning_rate": 9.46725860155383e-06, "loss": 0.1219, "step": 853 }, { "epoch": 0.37930268709749054, "grad_norm": 0.614891198569231, "learning_rate": 9.478357380688124e-06, "loss": 0.0618, "step": 854 }, { "epoch": 0.379746835443038, "grad_norm": 0.6996810655829085, "learning_rate": 9.48945615982242e-06, "loss": 0.0733, "step": 855 }, { "epoch": 0.3801909837885854, "grad_norm": 0.8742171165547441, "learning_rate": 9.500554938956716e-06, "loss": 0.0943, "step": 856 }, { "epoch": 0.3806351321341328, "grad_norm": 0.6941177838819981, "learning_rate": 9.511653718091011e-06, "loss": 0.0979, "step": 857 }, { "epoch": 0.3810792804796802, "grad_norm": 0.9912966659857255, "learning_rate": 9.522752497225306e-06, "loss": 0.1146, "step": 858 }, { "epoch": 0.3815234288252276, "grad_norm": 0.8052784568872317, "learning_rate": 9.533851276359601e-06, "loss": 0.0765, "step": 859 }, { "epoch": 0.38196757717077506, "grad_norm": 0.6650776701454606, "learning_rate": 9.544950055493896e-06, "loss": 0.0598, "step": 860 }, { "epoch": 0.38241172551632246, "grad_norm": 0.9207926874686408, "learning_rate": 9.556048834628191e-06, "loss": 0.0812, "step": 861 }, { "epoch": 0.38285587386186987, "grad_norm": 0.9371867517767639, "learning_rate": 9.567147613762488e-06, "loss": 0.1016, "step": 862 }, { "epoch": 0.38330002220741727, "grad_norm": 0.8381824885212333, "learning_rate": 9.578246392896782e-06, "loss": 0.0975, "step": 863 }, { "epoch": 0.38374417055296467, "grad_norm": 0.8006129424637977, "learning_rate": 9.589345172031077e-06, "loss": 0.0768, "step": 864 }, { "epoch": 0.3841883188985121, "grad_norm": 1.5462086828607524, "learning_rate": 9.600443951165372e-06, "loss": 0.0973, "step": 865 }, { "epoch": 0.38463246724405953, "grad_norm": 0.9098699947791947, "learning_rate": 9.611542730299668e-06, "loss": 0.0862, "step": 866 }, { "epoch": 0.38507661558960693, "grad_norm": 0.6758969469373516, "learning_rate": 9.622641509433963e-06, "loss": 0.0926, "step": 867 }, { "epoch": 0.38552076393515433, "grad_norm": 0.8013932877888278, "learning_rate": 9.633740288568259e-06, "loss": 0.0788, "step": 868 }, { "epoch": 0.38596491228070173, "grad_norm": 0.8330625416886887, "learning_rate": 9.644839067702554e-06, "loss": 0.0908, "step": 869 }, { "epoch": 0.3864090606262492, "grad_norm": 0.8653830292054833, "learning_rate": 9.655937846836849e-06, "loss": 0.0845, "step": 870 }, { "epoch": 0.3868532089717966, "grad_norm": 0.9667174949251367, "learning_rate": 9.667036625971144e-06, "loss": 0.0661, "step": 871 }, { "epoch": 0.387297357317344, "grad_norm": 0.9162313384818003, "learning_rate": 9.678135405105439e-06, "loss": 0.0839, "step": 872 }, { "epoch": 0.3877415056628914, "grad_norm": 1.05316579924942, "learning_rate": 9.689234184239734e-06, "loss": 0.0884, "step": 873 }, { "epoch": 0.3881856540084388, "grad_norm": 0.6947987167903426, "learning_rate": 9.700332963374029e-06, "loss": 0.0676, "step": 874 }, { "epoch": 0.38862980235398625, "grad_norm": 0.7637218765216399, "learning_rate": 9.711431742508326e-06, "loss": 0.0855, "step": 875 }, { "epoch": 0.38907395069953365, "grad_norm": 0.8049041095871476, "learning_rate": 9.72253052164262e-06, "loss": 0.0783, "step": 876 }, { "epoch": 0.38951809904508106, "grad_norm": 1.2616209833129097, "learning_rate": 9.733629300776916e-06, "loss": 0.0994, "step": 877 }, { "epoch": 0.38996224739062846, "grad_norm": 0.8501327438099852, "learning_rate": 9.744728079911211e-06, "loss": 0.0865, "step": 878 }, { "epoch": 0.39040639573617586, "grad_norm": 0.8980289128819791, "learning_rate": 9.755826859045506e-06, "loss": 0.0644, "step": 879 }, { "epoch": 0.3908505440817233, "grad_norm": 0.7464217413483473, "learning_rate": 9.766925638179801e-06, "loss": 0.0751, "step": 880 }, { "epoch": 0.3912946924272707, "grad_norm": 1.0405400428578027, "learning_rate": 9.778024417314096e-06, "loss": 0.0862, "step": 881 }, { "epoch": 0.3917388407728181, "grad_norm": 0.7773928325897406, "learning_rate": 9.789123196448391e-06, "loss": 0.0877, "step": 882 }, { "epoch": 0.3921829891183655, "grad_norm": 0.6831878151686481, "learning_rate": 9.800221975582686e-06, "loss": 0.0613, "step": 883 }, { "epoch": 0.3926271374639129, "grad_norm": 0.8414451878698297, "learning_rate": 9.811320754716981e-06, "loss": 0.0742, "step": 884 }, { "epoch": 0.3930712858094604, "grad_norm": 0.8370630047631802, "learning_rate": 9.822419533851278e-06, "loss": 0.0788, "step": 885 }, { "epoch": 0.3935154341550078, "grad_norm": 0.66195315447052, "learning_rate": 9.833518312985573e-06, "loss": 0.0673, "step": 886 }, { "epoch": 0.3939595825005552, "grad_norm": 0.5443847344778766, "learning_rate": 9.844617092119868e-06, "loss": 0.065, "step": 887 }, { "epoch": 0.3944037308461026, "grad_norm": 0.7436541471597082, "learning_rate": 9.855715871254163e-06, "loss": 0.0718, "step": 888 }, { "epoch": 0.39484787919165, "grad_norm": 0.9048716378979127, "learning_rate": 9.866814650388458e-06, "loss": 0.0647, "step": 889 }, { "epoch": 0.39529202753719744, "grad_norm": 0.887466655199119, "learning_rate": 9.877913429522753e-06, "loss": 0.0742, "step": 890 }, { "epoch": 0.39573617588274485, "grad_norm": 0.5558633344968636, "learning_rate": 9.889012208657048e-06, "loss": 0.0547, "step": 891 }, { "epoch": 0.39618032422829225, "grad_norm": 0.6342481865802312, "learning_rate": 9.900110987791344e-06, "loss": 0.0652, "step": 892 }, { "epoch": 0.39662447257383965, "grad_norm": 0.8609031233478771, "learning_rate": 9.911209766925639e-06, "loss": 0.0837, "step": 893 }, { "epoch": 0.39706862091938705, "grad_norm": 0.8888916230177222, "learning_rate": 9.922308546059934e-06, "loss": 0.0892, "step": 894 }, { "epoch": 0.3975127692649345, "grad_norm": 0.8338604612800079, "learning_rate": 9.93340732519423e-06, "loss": 0.101, "step": 895 }, { "epoch": 0.3979569176104819, "grad_norm": 0.689341320326161, "learning_rate": 9.944506104328525e-06, "loss": 0.0778, "step": 896 }, { "epoch": 0.3984010659560293, "grad_norm": 0.7629364933455176, "learning_rate": 9.955604883462819e-06, "loss": 0.0702, "step": 897 }, { "epoch": 0.3988452143015767, "grad_norm": 0.7205533738637083, "learning_rate": 9.966703662597114e-06, "loss": 0.0712, "step": 898 }, { "epoch": 0.3992893626471241, "grad_norm": 1.1452213067609647, "learning_rate": 9.97780244173141e-06, "loss": 0.0868, "step": 899 }, { "epoch": 0.39973351099267157, "grad_norm": 1.7583678128183422, "learning_rate": 9.988901220865706e-06, "loss": 0.0866, "step": 900 }, { "epoch": 0.40017765933821897, "grad_norm": 1.0237065065243787, "learning_rate": 1e-05, "loss": 0.0966, "step": 901 }, { "epoch": 0.4006218076837664, "grad_norm": 0.9116552545055456, "learning_rate": 9.999999624207532e-06, "loss": 0.0752, "step": 902 }, { "epoch": 0.4010659560293138, "grad_norm": 0.9745084022934988, "learning_rate": 9.999998496830188e-06, "loss": 0.0816, "step": 903 }, { "epoch": 0.4015101043748612, "grad_norm": 0.8729574496188063, "learning_rate": 9.999996617868132e-06, "loss": 0.096, "step": 904 }, { "epoch": 0.40195425272040863, "grad_norm": 0.8195978110162689, "learning_rate": 9.999993987321651e-06, "loss": 0.0848, "step": 905 }, { "epoch": 0.40239840106595604, "grad_norm": 1.0642057405652703, "learning_rate": 9.999990605191136e-06, "loss": 0.0734, "step": 906 }, { "epoch": 0.40284254941150344, "grad_norm": 1.055742691312144, "learning_rate": 9.9999864714771e-06, "loss": 0.1099, "step": 907 }, { "epoch": 0.40328669775705084, "grad_norm": 1.2115140501137451, "learning_rate": 9.999981586180161e-06, "loss": 0.0742, "step": 908 }, { "epoch": 0.4037308461025983, "grad_norm": 0.899219389432472, "learning_rate": 9.999975949301057e-06, "loss": 0.078, "step": 909 }, { "epoch": 0.4041749944481457, "grad_norm": 0.8631042166204541, "learning_rate": 9.99996956084063e-06, "loss": 0.0721, "step": 910 }, { "epoch": 0.4046191427936931, "grad_norm": 0.90232933367849, "learning_rate": 9.999962420799846e-06, "loss": 0.0855, "step": 911 }, { "epoch": 0.4050632911392405, "grad_norm": 0.9832129529060516, "learning_rate": 9.999954529179773e-06, "loss": 0.0884, "step": 912 }, { "epoch": 0.4055074394847879, "grad_norm": 0.6541793575769728, "learning_rate": 9.999945885981603e-06, "loss": 0.0649, "step": 913 }, { "epoch": 0.40595158783033536, "grad_norm": 0.7878273299354217, "learning_rate": 9.999936491206631e-06, "loss": 0.0806, "step": 914 }, { "epoch": 0.40639573617588276, "grad_norm": 1.1901788044990722, "learning_rate": 9.99992634485627e-06, "loss": 0.0926, "step": 915 }, { "epoch": 0.40683988452143016, "grad_norm": 1.14952509907395, "learning_rate": 9.999915446932045e-06, "loss": 0.1071, "step": 916 }, { "epoch": 0.40728403286697756, "grad_norm": 0.7798635545238589, "learning_rate": 9.999903797435596e-06, "loss": 0.0771, "step": 917 }, { "epoch": 0.40772818121252496, "grad_norm": 1.0917620513168562, "learning_rate": 9.999891396368672e-06, "loss": 0.0949, "step": 918 }, { "epoch": 0.4081723295580724, "grad_norm": 1.0952645685085969, "learning_rate": 9.999878243733138e-06, "loss": 0.0817, "step": 919 }, { "epoch": 0.4086164779036198, "grad_norm": 0.8880652433088206, "learning_rate": 9.99986433953097e-06, "loss": 0.0776, "step": 920 }, { "epoch": 0.4090606262491672, "grad_norm": 0.8333025562604276, "learning_rate": 9.99984968376426e-06, "loss": 0.0956, "step": 921 }, { "epoch": 0.4095047745947146, "grad_norm": 1.0999527872907782, "learning_rate": 9.99983427643521e-06, "loss": 0.0782, "step": 922 }, { "epoch": 0.40994892294026203, "grad_norm": 1.0065680465940938, "learning_rate": 9.999818117546135e-06, "loss": 0.0862, "step": 923 }, { "epoch": 0.4103930712858095, "grad_norm": 0.7446846870231809, "learning_rate": 9.999801207099464e-06, "loss": 0.0693, "step": 924 }, { "epoch": 0.4108372196313569, "grad_norm": 0.7664119308596476, "learning_rate": 9.99978354509774e-06, "loss": 0.062, "step": 925 }, { "epoch": 0.4112813679769043, "grad_norm": 0.6216063626642743, "learning_rate": 9.99976513154362e-06, "loss": 0.0855, "step": 926 }, { "epoch": 0.4117255163224517, "grad_norm": 0.7583401455167236, "learning_rate": 9.99974596643987e-06, "loss": 0.0902, "step": 927 }, { "epoch": 0.4121696646679991, "grad_norm": 0.8236694673839551, "learning_rate": 9.999726049789367e-06, "loss": 0.094, "step": 928 }, { "epoch": 0.41261381301354655, "grad_norm": 1.0278078628460734, "learning_rate": 9.999705381595111e-06, "loss": 0.1162, "step": 929 }, { "epoch": 0.41305796135909395, "grad_norm": 0.9106208454624181, "learning_rate": 9.999683961860205e-06, "loss": 0.0832, "step": 930 }, { "epoch": 0.41350210970464135, "grad_norm": 0.9583162458878111, "learning_rate": 9.99966179058787e-06, "loss": 0.0622, "step": 931 }, { "epoch": 0.41394625805018875, "grad_norm": 1.02310895201938, "learning_rate": 9.999638867781437e-06, "loss": 0.0747, "step": 932 }, { "epoch": 0.41439040639573615, "grad_norm": 0.9574965389717746, "learning_rate": 9.999615193444354e-06, "loss": 0.0826, "step": 933 }, { "epoch": 0.4148345547412836, "grad_norm": 0.6820969947085748, "learning_rate": 9.99959076758018e-06, "loss": 0.0794, "step": 934 }, { "epoch": 0.415278703086831, "grad_norm": 0.8271166638389166, "learning_rate": 9.999565590192584e-06, "loss": 0.0677, "step": 935 }, { "epoch": 0.4157228514323784, "grad_norm": 0.7913919823378441, "learning_rate": 9.999539661285354e-06, "loss": 0.0899, "step": 936 }, { "epoch": 0.4161669997779258, "grad_norm": 0.6933241808880691, "learning_rate": 9.999512980862382e-06, "loss": 0.0762, "step": 937 }, { "epoch": 0.4166111481234732, "grad_norm": 1.0464472300779635, "learning_rate": 9.999485548927686e-06, "loss": 0.0879, "step": 938 }, { "epoch": 0.4170552964690207, "grad_norm": 0.9713981902628005, "learning_rate": 9.999457365485383e-06, "loss": 0.0859, "step": 939 }, { "epoch": 0.4174994448145681, "grad_norm": 0.7157769729378113, "learning_rate": 9.999428430539713e-06, "loss": 0.063, "step": 940 }, { "epoch": 0.4179435931601155, "grad_norm": 1.3110820742809257, "learning_rate": 9.999398744095024e-06, "loss": 0.1083, "step": 941 }, { "epoch": 0.4183877415056629, "grad_norm": 0.9278738003250622, "learning_rate": 9.999368306155778e-06, "loss": 0.0682, "step": 942 }, { "epoch": 0.4188318898512103, "grad_norm": 1.081752257204933, "learning_rate": 9.999337116726555e-06, "loss": 0.082, "step": 943 }, { "epoch": 0.41927603819675774, "grad_norm": 0.7655308857156353, "learning_rate": 9.999305175812035e-06, "loss": 0.0824, "step": 944 }, { "epoch": 0.41972018654230514, "grad_norm": 0.843733636393823, "learning_rate": 9.999272483417027e-06, "loss": 0.0583, "step": 945 }, { "epoch": 0.42016433488785254, "grad_norm": 1.0473704718025856, "learning_rate": 9.99923903954644e-06, "loss": 0.0893, "step": 946 }, { "epoch": 0.42060848323339994, "grad_norm": 0.5121181388849327, "learning_rate": 9.999204844205304e-06, "loss": 0.0599, "step": 947 }, { "epoch": 0.42105263157894735, "grad_norm": 0.9066795499623488, "learning_rate": 9.999169897398757e-06, "loss": 0.1344, "step": 948 }, { "epoch": 0.4214967799244948, "grad_norm": 0.910793967335512, "learning_rate": 9.999134199132054e-06, "loss": 0.0669, "step": 949 }, { "epoch": 0.4219409282700422, "grad_norm": 0.5923909271714048, "learning_rate": 9.999097749410561e-06, "loss": 0.0739, "step": 950 }, { "epoch": 0.4223850766155896, "grad_norm": 0.8626478233669734, "learning_rate": 9.999060548239757e-06, "loss": 0.085, "step": 951 }, { "epoch": 0.422829224961137, "grad_norm": 1.2157299829969963, "learning_rate": 9.999022595625233e-06, "loss": 0.0927, "step": 952 }, { "epoch": 0.4232733733066844, "grad_norm": 0.8683032499306118, "learning_rate": 9.998983891572693e-06, "loss": 0.0857, "step": 953 }, { "epoch": 0.42371752165223187, "grad_norm": 0.7958109904631795, "learning_rate": 9.998944436087956e-06, "loss": 0.0848, "step": 954 }, { "epoch": 0.42416166999777927, "grad_norm": 0.8911128877165261, "learning_rate": 9.998904229176955e-06, "loss": 0.0684, "step": 955 }, { "epoch": 0.42460581834332667, "grad_norm": 0.8480526510619464, "learning_rate": 9.998863270845731e-06, "loss": 0.0819, "step": 956 }, { "epoch": 0.42504996668887407, "grad_norm": 1.1640161965795854, "learning_rate": 9.99882156110044e-06, "loss": 0.1114, "step": 957 }, { "epoch": 0.42549411503442147, "grad_norm": 0.7232203452809441, "learning_rate": 9.998779099947356e-06, "loss": 0.0598, "step": 958 }, { "epoch": 0.42593826337996893, "grad_norm": 0.9706452342240306, "learning_rate": 9.998735887392858e-06, "loss": 0.083, "step": 959 }, { "epoch": 0.42638241172551633, "grad_norm": 0.9335641087127532, "learning_rate": 9.998691923443442e-06, "loss": 0.0866, "step": 960 }, { "epoch": 0.42682656007106373, "grad_norm": 0.6482813505555395, "learning_rate": 9.998647208105717e-06, "loss": 0.0792, "step": 961 }, { "epoch": 0.42727070841661113, "grad_norm": 1.0124751766766096, "learning_rate": 9.998601741386404e-06, "loss": 0.0981, "step": 962 }, { "epoch": 0.42771485676215854, "grad_norm": 0.8713994807005938, "learning_rate": 9.998555523292338e-06, "loss": 0.112, "step": 963 }, { "epoch": 0.428159005107706, "grad_norm": 0.7341323526220324, "learning_rate": 9.998508553830468e-06, "loss": 0.079, "step": 964 }, { "epoch": 0.4286031534532534, "grad_norm": 0.7729393765262612, "learning_rate": 9.99846083300785e-06, "loss": 0.0783, "step": 965 }, { "epoch": 0.4290473017988008, "grad_norm": 0.9156169394277074, "learning_rate": 9.99841236083166e-06, "loss": 0.0806, "step": 966 }, { "epoch": 0.4294914501443482, "grad_norm": 0.8551663881257657, "learning_rate": 9.998363137309187e-06, "loss": 0.0618, "step": 967 }, { "epoch": 0.4299355984898956, "grad_norm": 0.5370934826053735, "learning_rate": 9.998313162447824e-06, "loss": 0.0652, "step": 968 }, { "epoch": 0.43037974683544306, "grad_norm": 0.7190678146601934, "learning_rate": 9.998262436255087e-06, "loss": 0.0574, "step": 969 }, { "epoch": 0.43082389518099046, "grad_norm": 0.790706849866094, "learning_rate": 9.998210958738601e-06, "loss": 0.0805, "step": 970 }, { "epoch": 0.43126804352653786, "grad_norm": 0.7463621831379682, "learning_rate": 9.998158729906102e-06, "loss": 0.0712, "step": 971 }, { "epoch": 0.43171219187208526, "grad_norm": 1.0561578156156401, "learning_rate": 9.998105749765444e-06, "loss": 0.0901, "step": 972 }, { "epoch": 0.43215634021763266, "grad_norm": 0.7180209621862901, "learning_rate": 9.998052018324586e-06, "loss": 0.0687, "step": 973 }, { "epoch": 0.4326004885631801, "grad_norm": 0.8430240030491031, "learning_rate": 9.99799753559161e-06, "loss": 0.0763, "step": 974 }, { "epoch": 0.4330446369087275, "grad_norm": 0.6157610719315397, "learning_rate": 9.997942301574701e-06, "loss": 0.0679, "step": 975 }, { "epoch": 0.4334887852542749, "grad_norm": 0.8284454213431681, "learning_rate": 9.997886316282167e-06, "loss": 0.0901, "step": 976 }, { "epoch": 0.4339329335998223, "grad_norm": 0.6138908846381919, "learning_rate": 9.997829579722418e-06, "loss": 0.068, "step": 977 }, { "epoch": 0.4343770819453698, "grad_norm": 0.6977507297268527, "learning_rate": 9.997772091903984e-06, "loss": 0.0719, "step": 978 }, { "epoch": 0.4348212302909172, "grad_norm": 0.672879000533416, "learning_rate": 9.997713852835509e-06, "loss": 0.0859, "step": 979 }, { "epoch": 0.4352653786364646, "grad_norm": 1.0953915290421439, "learning_rate": 9.997654862525746e-06, "loss": 0.0984, "step": 980 }, { "epoch": 0.435709526982012, "grad_norm": 0.7214317597556656, "learning_rate": 9.997595120983561e-06, "loss": 0.0788, "step": 981 }, { "epoch": 0.4361536753275594, "grad_norm": 0.6240169495789859, "learning_rate": 9.997534628217935e-06, "loss": 0.0533, "step": 982 }, { "epoch": 0.43659782367310684, "grad_norm": 1.0463824152225416, "learning_rate": 9.997473384237962e-06, "loss": 0.1256, "step": 983 }, { "epoch": 0.43704197201865425, "grad_norm": 0.6075177918870873, "learning_rate": 9.997411389052846e-06, "loss": 0.0907, "step": 984 }, { "epoch": 0.43748612036420165, "grad_norm": 0.8030490613184201, "learning_rate": 9.997348642671906e-06, "loss": 0.075, "step": 985 }, { "epoch": 0.43793026870974905, "grad_norm": 0.6431009442762712, "learning_rate": 9.997285145104578e-06, "loss": 0.0681, "step": 986 }, { "epoch": 0.43837441705529645, "grad_norm": 0.7359898616321635, "learning_rate": 9.997220896360402e-06, "loss": 0.0668, "step": 987 }, { "epoch": 0.4388185654008439, "grad_norm": 0.9059235668805204, "learning_rate": 9.997155896449037e-06, "loss": 0.1006, "step": 988 }, { "epoch": 0.4392627137463913, "grad_norm": 0.5647469272415054, "learning_rate": 9.997090145380253e-06, "loss": 0.0699, "step": 989 }, { "epoch": 0.4397068620919387, "grad_norm": 0.7349393202965936, "learning_rate": 9.997023643163937e-06, "loss": 0.0731, "step": 990 }, { "epoch": 0.4401510104374861, "grad_norm": 0.6226852410738429, "learning_rate": 9.996956389810082e-06, "loss": 0.0663, "step": 991 }, { "epoch": 0.4405951587830335, "grad_norm": 0.6263169289849984, "learning_rate": 9.996888385328798e-06, "loss": 0.0733, "step": 992 }, { "epoch": 0.44103930712858097, "grad_norm": 0.7290496435666083, "learning_rate": 9.996819629730305e-06, "loss": 0.0664, "step": 993 }, { "epoch": 0.4414834554741284, "grad_norm": 0.6744456519687402, "learning_rate": 9.996750123024943e-06, "loss": 0.0637, "step": 994 }, { "epoch": 0.4419276038196758, "grad_norm": 0.9231205047824739, "learning_rate": 9.996679865223157e-06, "loss": 0.0598, "step": 995 }, { "epoch": 0.4423717521652232, "grad_norm": 0.6917241860836084, "learning_rate": 9.99660885633551e-06, "loss": 0.07, "step": 996 }, { "epoch": 0.4428159005107706, "grad_norm": 1.0027314643568717, "learning_rate": 9.996537096372672e-06, "loss": 0.0998, "step": 997 }, { "epoch": 0.44326004885631803, "grad_norm": 0.7262325240442806, "learning_rate": 9.996464585345433e-06, "loss": 0.0703, "step": 998 }, { "epoch": 0.44370419720186544, "grad_norm": 0.7680770645535525, "learning_rate": 9.996391323264693e-06, "loss": 0.0884, "step": 999 }, { "epoch": 0.44414834554741284, "grad_norm": 0.7559490106951665, "learning_rate": 9.996317310141462e-06, "loss": 0.0885, "step": 1000 }, { "epoch": 0.44459249389296024, "grad_norm": 0.648213091365164, "learning_rate": 9.996242545986868e-06, "loss": 0.0657, "step": 1001 }, { "epoch": 0.44503664223850764, "grad_norm": 0.941859001821441, "learning_rate": 9.996167030812146e-06, "loss": 0.0771, "step": 1002 }, { "epoch": 0.4454807905840551, "grad_norm": 0.9336722754874243, "learning_rate": 9.996090764628649e-06, "loss": 0.062, "step": 1003 }, { "epoch": 0.4459249389296025, "grad_norm": 0.6689537581560581, "learning_rate": 9.996013747447844e-06, "loss": 0.0676, "step": 1004 }, { "epoch": 0.4463690872751499, "grad_norm": 0.6868609219124203, "learning_rate": 9.995935979281304e-06, "loss": 0.0698, "step": 1005 }, { "epoch": 0.4468132356206973, "grad_norm": 0.7598513445760847, "learning_rate": 9.995857460140719e-06, "loss": 0.0663, "step": 1006 }, { "epoch": 0.4472573839662447, "grad_norm": 0.8425846771969293, "learning_rate": 9.995778190037893e-06, "loss": 0.0709, "step": 1007 }, { "epoch": 0.44770153231179216, "grad_norm": 0.7973802310676283, "learning_rate": 9.995698168984743e-06, "loss": 0.0669, "step": 1008 }, { "epoch": 0.44814568065733956, "grad_norm": 0.8057530371483355, "learning_rate": 9.995617396993297e-06, "loss": 0.105, "step": 1009 }, { "epoch": 0.44858982900288696, "grad_norm": 0.698450418542813, "learning_rate": 9.995535874075692e-06, "loss": 0.0886, "step": 1010 }, { "epoch": 0.44903397734843437, "grad_norm": 0.6685872712573397, "learning_rate": 9.99545360024419e-06, "loss": 0.097, "step": 1011 }, { "epoch": 0.44947812569398177, "grad_norm": 0.7945297170711836, "learning_rate": 9.995370575511151e-06, "loss": 0.0841, "step": 1012 }, { "epoch": 0.4499222740395292, "grad_norm": 0.6520316868875402, "learning_rate": 9.99528679988906e-06, "loss": 0.0787, "step": 1013 }, { "epoch": 0.4503664223850766, "grad_norm": 0.572444172084648, "learning_rate": 9.995202273390505e-06, "loss": 0.0594, "step": 1014 }, { "epoch": 0.450810570730624, "grad_norm": 0.6063370258668184, "learning_rate": 9.995116996028197e-06, "loss": 0.0683, "step": 1015 }, { "epoch": 0.45125471907617143, "grad_norm": 0.7022055879284854, "learning_rate": 9.995030967814952e-06, "loss": 0.0828, "step": 1016 }, { "epoch": 0.45169886742171883, "grad_norm": 0.6787805656005202, "learning_rate": 9.994944188763701e-06, "loss": 0.0886, "step": 1017 }, { "epoch": 0.4521430157672663, "grad_norm": 0.6743594234424202, "learning_rate": 9.994856658887491e-06, "loss": 0.0858, "step": 1018 }, { "epoch": 0.4525871641128137, "grad_norm": 0.701751582175111, "learning_rate": 9.994768378199476e-06, "loss": 0.0877, "step": 1019 }, { "epoch": 0.4530313124583611, "grad_norm": 0.7777579093243342, "learning_rate": 9.994679346712927e-06, "loss": 0.0809, "step": 1020 }, { "epoch": 0.4534754608039085, "grad_norm": 0.7407021400306011, "learning_rate": 9.994589564441229e-06, "loss": 0.0667, "step": 1021 }, { "epoch": 0.4539196091494559, "grad_norm": 0.8640591667177118, "learning_rate": 9.994499031397874e-06, "loss": 0.0779, "step": 1022 }, { "epoch": 0.45436375749500335, "grad_norm": 0.741626283806636, "learning_rate": 9.994407747596474e-06, "loss": 0.0598, "step": 1023 }, { "epoch": 0.45480790584055075, "grad_norm": 0.7426285442565452, "learning_rate": 9.994315713050749e-06, "loss": 0.0763, "step": 1024 }, { "epoch": 0.45525205418609815, "grad_norm": 1.1459706806907142, "learning_rate": 9.994222927774535e-06, "loss": 0.0928, "step": 1025 }, { "epoch": 0.45569620253164556, "grad_norm": 0.9102603719521771, "learning_rate": 9.994129391781777e-06, "loss": 0.0695, "step": 1026 }, { "epoch": 0.45614035087719296, "grad_norm": 0.8406791604735616, "learning_rate": 9.994035105086536e-06, "loss": 0.0655, "step": 1027 }, { "epoch": 0.4565844992227404, "grad_norm": 0.6764230445649027, "learning_rate": 9.993940067702985e-06, "loss": 0.0566, "step": 1028 }, { "epoch": 0.4570286475682878, "grad_norm": 1.1241952343326866, "learning_rate": 9.993844279645411e-06, "loss": 0.1138, "step": 1029 }, { "epoch": 0.4574727959138352, "grad_norm": 0.8203260077691763, "learning_rate": 9.993747740928207e-06, "loss": 0.0639, "step": 1030 }, { "epoch": 0.4579169442593826, "grad_norm": 0.9349985900242503, "learning_rate": 9.993650451565892e-06, "loss": 0.078, "step": 1031 }, { "epoch": 0.45836109260493, "grad_norm": 0.9978670067027727, "learning_rate": 9.993552411573088e-06, "loss": 0.0952, "step": 1032 }, { "epoch": 0.4588052409504775, "grad_norm": 1.1441082866740984, "learning_rate": 9.993453620964529e-06, "loss": 0.0897, "step": 1033 }, { "epoch": 0.4592493892960249, "grad_norm": 1.0696466066343144, "learning_rate": 9.993354079755066e-06, "loss": 0.0903, "step": 1034 }, { "epoch": 0.4596935376415723, "grad_norm": 0.6531078727573577, "learning_rate": 9.993253787959664e-06, "loss": 0.0673, "step": 1035 }, { "epoch": 0.4601376859871197, "grad_norm": 1.4198972391798237, "learning_rate": 9.993152745593398e-06, "loss": 0.083, "step": 1036 }, { "epoch": 0.4605818343326671, "grad_norm": 1.070478952947314, "learning_rate": 9.993050952671453e-06, "loss": 0.0687, "step": 1037 }, { "epoch": 0.46102598267821454, "grad_norm": 0.7417628375073222, "learning_rate": 9.992948409209134e-06, "loss": 0.0802, "step": 1038 }, { "epoch": 0.46147013102376194, "grad_norm": 0.6201081139409721, "learning_rate": 9.992845115221855e-06, "loss": 0.0612, "step": 1039 }, { "epoch": 0.46191427936930934, "grad_norm": 0.8344957050457038, "learning_rate": 9.992741070725137e-06, "loss": 0.0897, "step": 1040 }, { "epoch": 0.46235842771485675, "grad_norm": 0.5628453325687238, "learning_rate": 9.992636275734629e-06, "loss": 0.069, "step": 1041 }, { "epoch": 0.4628025760604042, "grad_norm": 0.6969650084016832, "learning_rate": 9.992530730266078e-06, "loss": 0.073, "step": 1042 }, { "epoch": 0.4632467244059516, "grad_norm": 1.0860876745728663, "learning_rate": 9.992424434335348e-06, "loss": 0.0921, "step": 1043 }, { "epoch": 0.463690872751499, "grad_norm": 0.5795193360907794, "learning_rate": 9.99231738795842e-06, "loss": 0.0504, "step": 1044 }, { "epoch": 0.4641350210970464, "grad_norm": 0.6583352279848995, "learning_rate": 9.992209591151386e-06, "loss": 0.0831, "step": 1045 }, { "epoch": 0.4645791694425938, "grad_norm": 0.5908927184246742, "learning_rate": 9.992101043930444e-06, "loss": 0.0996, "step": 1046 }, { "epoch": 0.46502331778814127, "grad_norm": 0.7629134106102797, "learning_rate": 9.991991746311916e-06, "loss": 0.0595, "step": 1047 }, { "epoch": 0.46546746613368867, "grad_norm": 0.8529582668397104, "learning_rate": 9.991881698312229e-06, "loss": 0.076, "step": 1048 }, { "epoch": 0.46591161447923607, "grad_norm": 0.6804037477070127, "learning_rate": 9.991770899947925e-06, "loss": 0.075, "step": 1049 }, { "epoch": 0.46635576282478347, "grad_norm": 0.8612466864991256, "learning_rate": 9.991659351235662e-06, "loss": 0.0796, "step": 1050 }, { "epoch": 0.4667999111703309, "grad_norm": 0.6453565757322578, "learning_rate": 9.991547052192203e-06, "loss": 0.0625, "step": 1051 }, { "epoch": 0.46724405951587833, "grad_norm": 1.7133852156999947, "learning_rate": 9.99143400283443e-06, "loss": 0.097, "step": 1052 }, { "epoch": 0.46768820786142573, "grad_norm": 0.7043109079290237, "learning_rate": 9.991320203179338e-06, "loss": 0.0556, "step": 1053 }, { "epoch": 0.46813235620697313, "grad_norm": 0.9119065164949084, "learning_rate": 9.991205653244032e-06, "loss": 0.0618, "step": 1054 }, { "epoch": 0.46857650455252053, "grad_norm": 0.5740417394473022, "learning_rate": 9.991090353045729e-06, "loss": 0.0492, "step": 1055 }, { "epoch": 0.46902065289806794, "grad_norm": 0.8813556475853607, "learning_rate": 9.990974302601762e-06, "loss": 0.0603, "step": 1056 }, { "epoch": 0.4694648012436154, "grad_norm": 0.6951634771023119, "learning_rate": 9.990857501929577e-06, "loss": 0.0641, "step": 1057 }, { "epoch": 0.4699089495891628, "grad_norm": 0.9703272601786429, "learning_rate": 9.990739951046729e-06, "loss": 0.1064, "step": 1058 }, { "epoch": 0.4703530979347102, "grad_norm": 1.1076310196170094, "learning_rate": 9.99062164997089e-06, "loss": 0.077, "step": 1059 }, { "epoch": 0.4707972462802576, "grad_norm": 0.813678875992106, "learning_rate": 9.990502598719837e-06, "loss": 0.0579, "step": 1060 }, { "epoch": 0.471241394625805, "grad_norm": 0.8445949631628347, "learning_rate": 9.990382797311474e-06, "loss": 0.0922, "step": 1061 }, { "epoch": 0.47168554297135246, "grad_norm": 0.6432477880049939, "learning_rate": 9.990262245763802e-06, "loss": 0.066, "step": 1062 }, { "epoch": 0.47212969131689986, "grad_norm": 1.0190163753358932, "learning_rate": 9.990140944094946e-06, "loss": 0.1059, "step": 1063 }, { "epoch": 0.47257383966244726, "grad_norm": 0.9736504358272349, "learning_rate": 9.990018892323138e-06, "loss": 0.0716, "step": 1064 }, { "epoch": 0.47301798800799466, "grad_norm": 0.8551545600086438, "learning_rate": 9.989896090466725e-06, "loss": 0.0721, "step": 1065 }, { "epoch": 0.47346213635354206, "grad_norm": 0.7846245356864615, "learning_rate": 9.989772538544167e-06, "loss": 0.0837, "step": 1066 }, { "epoch": 0.4739062846990895, "grad_norm": 0.6875499187875691, "learning_rate": 9.989648236574035e-06, "loss": 0.0604, "step": 1067 }, { "epoch": 0.4743504330446369, "grad_norm": 1.010871584074772, "learning_rate": 9.989523184575013e-06, "loss": 0.0836, "step": 1068 }, { "epoch": 0.4747945813901843, "grad_norm": 0.8312720125551346, "learning_rate": 9.989397382565898e-06, "loss": 0.0712, "step": 1069 }, { "epoch": 0.4752387297357317, "grad_norm": 0.96789104638543, "learning_rate": 9.989270830565603e-06, "loss": 0.0784, "step": 1070 }, { "epoch": 0.4756828780812791, "grad_norm": 0.7404878797473026, "learning_rate": 9.989143528593149e-06, "loss": 0.0872, "step": 1071 }, { "epoch": 0.4761270264268266, "grad_norm": 0.6280381360800792, "learning_rate": 9.98901547666767e-06, "loss": 0.0661, "step": 1072 }, { "epoch": 0.476571174772374, "grad_norm": 0.8234412511968835, "learning_rate": 9.988886674808418e-06, "loss": 0.0621, "step": 1073 }, { "epoch": 0.4770153231179214, "grad_norm": 0.5856951522424453, "learning_rate": 9.988757123034753e-06, "loss": 0.0657, "step": 1074 }, { "epoch": 0.4774594714634688, "grad_norm": 0.9863728037623454, "learning_rate": 9.988626821366147e-06, "loss": 0.0787, "step": 1075 }, { "epoch": 0.4779036198090162, "grad_norm": 0.8043968430169456, "learning_rate": 9.988495769822188e-06, "loss": 0.0712, "step": 1076 }, { "epoch": 0.47834776815456365, "grad_norm": 0.7330127963179667, "learning_rate": 9.988363968422577e-06, "loss": 0.0749, "step": 1077 }, { "epoch": 0.47879191650011105, "grad_norm": 0.8101029322049607, "learning_rate": 9.988231417187122e-06, "loss": 0.0901, "step": 1078 }, { "epoch": 0.47923606484565845, "grad_norm": 0.9467235647843469, "learning_rate": 9.98809811613575e-06, "loss": 0.0923, "step": 1079 }, { "epoch": 0.47968021319120585, "grad_norm": 0.7955200227969057, "learning_rate": 9.9879640652885e-06, "loss": 0.0802, "step": 1080 }, { "epoch": 0.48012436153675325, "grad_norm": 1.0074849821784144, "learning_rate": 9.987829264665518e-06, "loss": 0.1319, "step": 1081 }, { "epoch": 0.4805685098823007, "grad_norm": 0.680242996169655, "learning_rate": 9.98769371428707e-06, "loss": 0.0659, "step": 1082 }, { "epoch": 0.4810126582278481, "grad_norm": 1.1307628357255757, "learning_rate": 9.98755741417353e-06, "loss": 0.0962, "step": 1083 }, { "epoch": 0.4814568065733955, "grad_norm": 0.7637117418782801, "learning_rate": 9.987420364345388e-06, "loss": 0.0596, "step": 1084 }, { "epoch": 0.4819009549189429, "grad_norm": 0.8434389453780773, "learning_rate": 9.987282564823242e-06, "loss": 0.0832, "step": 1085 }, { "epoch": 0.4823451032644903, "grad_norm": 1.3802825505619392, "learning_rate": 9.98714401562781e-06, "loss": 0.1092, "step": 1086 }, { "epoch": 0.4827892516100378, "grad_norm": 0.7526559659659204, "learning_rate": 9.987004716779914e-06, "loss": 0.0649, "step": 1087 }, { "epoch": 0.4832333999555852, "grad_norm": 0.8564554305307791, "learning_rate": 9.986864668300494e-06, "loss": 0.0745, "step": 1088 }, { "epoch": 0.4836775483011326, "grad_norm": 0.6121214143253134, "learning_rate": 9.986723870210605e-06, "loss": 0.0648, "step": 1089 }, { "epoch": 0.48412169664668, "grad_norm": 1.2737874840599017, "learning_rate": 9.986582322531406e-06, "loss": 0.1167, "step": 1090 }, { "epoch": 0.4845658449922274, "grad_norm": 1.1300452790006459, "learning_rate": 9.986440025284177e-06, "loss": 0.1004, "step": 1091 }, { "epoch": 0.48500999333777484, "grad_norm": 0.695366891600787, "learning_rate": 9.986296978490308e-06, "loss": 0.0631, "step": 1092 }, { "epoch": 0.48545414168332224, "grad_norm": 0.907315227077863, "learning_rate": 9.9861531821713e-06, "loss": 0.0975, "step": 1093 }, { "epoch": 0.48589829002886964, "grad_norm": 0.8452739013500794, "learning_rate": 9.986008636348771e-06, "loss": 0.0725, "step": 1094 }, { "epoch": 0.48634243837441704, "grad_norm": 0.7641747681242534, "learning_rate": 9.985863341044444e-06, "loss": 0.0745, "step": 1095 }, { "epoch": 0.48678658671996444, "grad_norm": 1.0767424752885635, "learning_rate": 9.985717296280165e-06, "loss": 0.1343, "step": 1096 }, { "epoch": 0.4872307350655119, "grad_norm": 1.082068622065411, "learning_rate": 9.985570502077881e-06, "loss": 0.0709, "step": 1097 }, { "epoch": 0.4876748834110593, "grad_norm": 1.1385183378270511, "learning_rate": 9.98542295845966e-06, "loss": 0.1136, "step": 1098 }, { "epoch": 0.4881190317566067, "grad_norm": 0.8790710582646716, "learning_rate": 9.985274665447682e-06, "loss": 0.0796, "step": 1099 }, { "epoch": 0.4885631801021541, "grad_norm": 0.7645639481923898, "learning_rate": 9.985125623064238e-06, "loss": 0.1039, "step": 1100 }, { "epoch": 0.4890073284477015, "grad_norm": 0.9695248420140831, "learning_rate": 9.98497583133173e-06, "loss": 0.0667, "step": 1101 }, { "epoch": 0.48945147679324896, "grad_norm": 0.8791125306416613, "learning_rate": 9.984825290272673e-06, "loss": 0.0703, "step": 1102 }, { "epoch": 0.48989562513879636, "grad_norm": 0.6608187335865532, "learning_rate": 9.984673999909698e-06, "loss": 0.0607, "step": 1103 }, { "epoch": 0.49033977348434377, "grad_norm": 0.7238746077559932, "learning_rate": 9.984521960265545e-06, "loss": 0.0582, "step": 1104 }, { "epoch": 0.49078392182989117, "grad_norm": 0.7543190655536871, "learning_rate": 9.98436917136307e-06, "loss": 0.075, "step": 1105 }, { "epoch": 0.49122807017543857, "grad_norm": 0.9831510577501872, "learning_rate": 9.98421563322524e-06, "loss": 0.0832, "step": 1106 }, { "epoch": 0.491672218520986, "grad_norm": 1.0953392860217965, "learning_rate": 9.984061345875133e-06, "loss": 0.1055, "step": 1107 }, { "epoch": 0.49211636686653343, "grad_norm": 1.0004839390300868, "learning_rate": 9.983906309335942e-06, "loss": 0.0882, "step": 1108 }, { "epoch": 0.49256051521208083, "grad_norm": 0.7389464810035322, "learning_rate": 9.98375052363097e-06, "loss": 0.0608, "step": 1109 }, { "epoch": 0.49300466355762823, "grad_norm": 0.6469655367019491, "learning_rate": 9.983593988783634e-06, "loss": 0.0687, "step": 1110 }, { "epoch": 0.4934488119031757, "grad_norm": 0.7833813336297293, "learning_rate": 9.983436704817466e-06, "loss": 0.0902, "step": 1111 }, { "epoch": 0.4938929602487231, "grad_norm": 1.1676871219264449, "learning_rate": 9.983278671756107e-06, "loss": 0.0741, "step": 1112 }, { "epoch": 0.4943371085942705, "grad_norm": 0.5948813388699934, "learning_rate": 9.983119889623314e-06, "loss": 0.0584, "step": 1113 }, { "epoch": 0.4947812569398179, "grad_norm": 1.0932192784955042, "learning_rate": 9.982960358442952e-06, "loss": 0.0814, "step": 1114 }, { "epoch": 0.4952254052853653, "grad_norm": 0.9576191702524788, "learning_rate": 9.982800078239004e-06, "loss": 0.0939, "step": 1115 }, { "epoch": 0.49566955363091275, "grad_norm": 0.8091517385774816, "learning_rate": 9.982639049035559e-06, "loss": 0.0894, "step": 1116 }, { "epoch": 0.49611370197646015, "grad_norm": 1.1318080237230808, "learning_rate": 9.982477270856827e-06, "loss": 0.0743, "step": 1117 }, { "epoch": 0.49655785032200755, "grad_norm": 1.2019606615618712, "learning_rate": 9.982314743727121e-06, "loss": 0.107, "step": 1118 }, { "epoch": 0.49700199866755496, "grad_norm": 0.9414473018138827, "learning_rate": 9.982151467670876e-06, "loss": 0.0746, "step": 1119 }, { "epoch": 0.49744614701310236, "grad_norm": 0.7821900901787131, "learning_rate": 9.981987442712634e-06, "loss": 0.062, "step": 1120 }, { "epoch": 0.4978902953586498, "grad_norm": 0.8874834331056065, "learning_rate": 9.981822668877048e-06, "loss": 0.0964, "step": 1121 }, { "epoch": 0.4983344437041972, "grad_norm": 0.763549676147823, "learning_rate": 9.98165714618889e-06, "loss": 0.0683, "step": 1122 }, { "epoch": 0.4987785920497446, "grad_norm": 0.7237088556818837, "learning_rate": 9.98149087467304e-06, "loss": 0.0706, "step": 1123 }, { "epoch": 0.499222740395292, "grad_norm": 0.5389621815014336, "learning_rate": 9.98132385435449e-06, "loss": 0.0739, "step": 1124 }, { "epoch": 0.4996668887408394, "grad_norm": 0.5780461463044186, "learning_rate": 9.981156085258347e-06, "loss": 0.0532, "step": 1125 }, { "epoch": 0.5001110370863868, "grad_norm": 0.9337676739733405, "learning_rate": 9.980987567409829e-06, "loss": 0.0728, "step": 1126 }, { "epoch": 0.5005551854319342, "grad_norm": 0.7121233826282255, "learning_rate": 9.980818300834267e-06, "loss": 0.0684, "step": 1127 }, { "epoch": 0.5009993337774816, "grad_norm": 0.7441687384799486, "learning_rate": 9.980648285557106e-06, "loss": 0.0692, "step": 1128 }, { "epoch": 0.5014434821230291, "grad_norm": 0.6423868990281854, "learning_rate": 9.980477521603901e-06, "loss": 0.0677, "step": 1129 }, { "epoch": 0.5018876304685765, "grad_norm": 0.5495871418316753, "learning_rate": 9.98030600900032e-06, "loss": 0.0728, "step": 1130 }, { "epoch": 0.5023317788141239, "grad_norm": 0.6241490380000925, "learning_rate": 9.980133747772148e-06, "loss": 0.0662, "step": 1131 }, { "epoch": 0.5027759271596713, "grad_norm": 0.7343652098087937, "learning_rate": 9.979960737945273e-06, "loss": 0.0678, "step": 1132 }, { "epoch": 0.5032200755052187, "grad_norm": 1.1015252671394224, "learning_rate": 9.979786979545704e-06, "loss": 0.0993, "step": 1133 }, { "epoch": 0.5036642238507661, "grad_norm": 0.8816390886875364, "learning_rate": 9.979612472599563e-06, "loss": 0.0839, "step": 1134 }, { "epoch": 0.5041083721963135, "grad_norm": 0.7833322958956014, "learning_rate": 9.979437217133077e-06, "loss": 0.082, "step": 1135 }, { "epoch": 0.504552520541861, "grad_norm": 0.9520003222392113, "learning_rate": 9.979261213172592e-06, "loss": 0.0768, "step": 1136 }, { "epoch": 0.5049966688874084, "grad_norm": 0.6650156355994222, "learning_rate": 9.979084460744563e-06, "loss": 0.0594, "step": 1137 }, { "epoch": 0.5054408172329558, "grad_norm": 0.7702494870526182, "learning_rate": 9.97890695987556e-06, "loss": 0.0744, "step": 1138 }, { "epoch": 0.5058849655785033, "grad_norm": 0.6441998434508551, "learning_rate": 9.978728710592265e-06, "loss": 0.0639, "step": 1139 }, { "epoch": 0.5063291139240507, "grad_norm": 0.8638339516257324, "learning_rate": 9.97854971292147e-06, "loss": 0.0737, "step": 1140 }, { "epoch": 0.5067732622695981, "grad_norm": 0.6599486193027585, "learning_rate": 9.978369966890082e-06, "loss": 0.0677, "step": 1141 }, { "epoch": 0.5072174106151455, "grad_norm": 0.5684535397639588, "learning_rate": 9.978189472525121e-06, "loss": 0.0667, "step": 1142 }, { "epoch": 0.5076615589606929, "grad_norm": 0.7930545362878336, "learning_rate": 9.978008229853717e-06, "loss": 0.0768, "step": 1143 }, { "epoch": 0.5081057073062403, "grad_norm": 1.013824145483496, "learning_rate": 9.977826238903116e-06, "loss": 0.0878, "step": 1144 }, { "epoch": 0.5085498556517877, "grad_norm": 0.7663207754691962, "learning_rate": 9.97764349970067e-06, "loss": 0.0757, "step": 1145 }, { "epoch": 0.5089940039973351, "grad_norm": 0.7907421181118522, "learning_rate": 9.977460012273854e-06, "loss": 0.0784, "step": 1146 }, { "epoch": 0.5094381523428825, "grad_norm": 0.6096514964784717, "learning_rate": 9.977275776650244e-06, "loss": 0.0806, "step": 1147 }, { "epoch": 0.50988230068843, "grad_norm": 0.5159120034000472, "learning_rate": 9.977090792857536e-06, "loss": 0.0598, "step": 1148 }, { "epoch": 0.5103264490339774, "grad_norm": 0.7102391658769992, "learning_rate": 9.976905060923536e-06, "loss": 0.077, "step": 1149 }, { "epoch": 0.5107705973795248, "grad_norm": 0.7376867178754969, "learning_rate": 9.97671858087616e-06, "loss": 0.0726, "step": 1150 }, { "epoch": 0.5112147457250722, "grad_norm": 0.7790796748709151, "learning_rate": 9.976531352743445e-06, "loss": 0.0806, "step": 1151 }, { "epoch": 0.5116588940706196, "grad_norm": 0.618833561044351, "learning_rate": 9.97634337655353e-06, "loss": 0.0654, "step": 1152 }, { "epoch": 0.512103042416167, "grad_norm": 0.808662756842487, "learning_rate": 9.976154652334673e-06, "loss": 0.1081, "step": 1153 }, { "epoch": 0.5125471907617144, "grad_norm": 1.0180790521960883, "learning_rate": 9.97596518011524e-06, "loss": 0.1018, "step": 1154 }, { "epoch": 0.5129913391072618, "grad_norm": 0.638695304240369, "learning_rate": 9.975774959923717e-06, "loss": 0.0655, "step": 1155 }, { "epoch": 0.5134354874528092, "grad_norm": 0.7566614134333154, "learning_rate": 9.975583991788691e-06, "loss": 0.0658, "step": 1156 }, { "epoch": 0.5138796357983566, "grad_norm": 0.7034717435901088, "learning_rate": 9.97539227573887e-06, "loss": 0.0815, "step": 1157 }, { "epoch": 0.5143237841439041, "grad_norm": 0.7342548435589596, "learning_rate": 9.975199811803073e-06, "loss": 0.0755, "step": 1158 }, { "epoch": 0.5147679324894515, "grad_norm": 0.881634196160518, "learning_rate": 9.975006600010233e-06, "loss": 0.0649, "step": 1159 }, { "epoch": 0.5152120808349989, "grad_norm": 0.82644127591562, "learning_rate": 9.97481264038939e-06, "loss": 0.0589, "step": 1160 }, { "epoch": 0.5156562291805463, "grad_norm": 0.8462515915216339, "learning_rate": 9.974617932969697e-06, "loss": 0.0735, "step": 1161 }, { "epoch": 0.5161003775260937, "grad_norm": 0.6116708783372564, "learning_rate": 9.974422477780426e-06, "loss": 0.0593, "step": 1162 }, { "epoch": 0.5165445258716411, "grad_norm": 0.7646830820557489, "learning_rate": 9.974226274850956e-06, "loss": 0.0866, "step": 1163 }, { "epoch": 0.5169886742171885, "grad_norm": 0.6551362409279565, "learning_rate": 9.97402932421078e-06, "loss": 0.0742, "step": 1164 }, { "epoch": 0.5174328225627359, "grad_norm": 0.5323199794273931, "learning_rate": 9.973831625889501e-06, "loss": 0.0599, "step": 1165 }, { "epoch": 0.5178769709082833, "grad_norm": 0.694095375162108, "learning_rate": 9.97363317991684e-06, "loss": 0.0646, "step": 1166 }, { "epoch": 0.5183211192538307, "grad_norm": 0.9040017314033096, "learning_rate": 9.973433986322625e-06, "loss": 0.0755, "step": 1167 }, { "epoch": 0.5187652675993782, "grad_norm": 0.7059881945266824, "learning_rate": 9.973234045136798e-06, "loss": 0.0869, "step": 1168 }, { "epoch": 0.5192094159449256, "grad_norm": 0.7791362124843618, "learning_rate": 9.973033356389412e-06, "loss": 0.0912, "step": 1169 }, { "epoch": 0.519653564290473, "grad_norm": 0.7295114970239928, "learning_rate": 9.972831920110635e-06, "loss": 0.0769, "step": 1170 }, { "epoch": 0.5200977126360204, "grad_norm": 0.5612528230166707, "learning_rate": 9.972629736330748e-06, "loss": 0.0679, "step": 1171 }, { "epoch": 0.5205418609815678, "grad_norm": 0.7254809001904728, "learning_rate": 9.972426805080141e-06, "loss": 0.0715, "step": 1172 }, { "epoch": 0.5209860093271153, "grad_norm": 0.7275192896210665, "learning_rate": 9.97222312638932e-06, "loss": 0.0841, "step": 1173 }, { "epoch": 0.5214301576726627, "grad_norm": 0.8132581046482065, "learning_rate": 9.972018700288898e-06, "loss": 0.0715, "step": 1174 }, { "epoch": 0.52187430601821, "grad_norm": 0.9966008387669913, "learning_rate": 9.971813526809606e-06, "loss": 0.0844, "step": 1175 }, { "epoch": 0.5223184543637575, "grad_norm": 0.621117239882631, "learning_rate": 9.971607605982285e-06, "loss": 0.0572, "step": 1176 }, { "epoch": 0.5227626027093049, "grad_norm": 0.8375409793845516, "learning_rate": 9.971400937837887e-06, "loss": 0.0728, "step": 1177 }, { "epoch": 0.5232067510548524, "grad_norm": 0.9177354811192968, "learning_rate": 9.97119352240748e-06, "loss": 0.0878, "step": 1178 }, { "epoch": 0.5236508994003998, "grad_norm": 0.7412042248938593, "learning_rate": 9.97098535972224e-06, "loss": 0.0649, "step": 1179 }, { "epoch": 0.5240950477459472, "grad_norm": 0.75392664768525, "learning_rate": 9.970776449813457e-06, "loss": 0.0682, "step": 1180 }, { "epoch": 0.5245391960914946, "grad_norm": 0.7437991387327917, "learning_rate": 9.970566792712537e-06, "loss": 0.0646, "step": 1181 }, { "epoch": 0.524983344437042, "grad_norm": 0.708247278848663, "learning_rate": 9.970356388450992e-06, "loss": 0.0781, "step": 1182 }, { "epoch": 0.5254274927825894, "grad_norm": 0.508933410042531, "learning_rate": 9.97014523706045e-06, "loss": 0.0535, "step": 1183 }, { "epoch": 0.5258716411281368, "grad_norm": 1.0021738372524498, "learning_rate": 9.96993333857265e-06, "loss": 0.0925, "step": 1184 }, { "epoch": 0.5263157894736842, "grad_norm": 0.8285101542802605, "learning_rate": 9.969720693019447e-06, "loss": 0.0653, "step": 1185 }, { "epoch": 0.5267599378192316, "grad_norm": 0.6010536869971059, "learning_rate": 9.9695073004328e-06, "loss": 0.0601, "step": 1186 }, { "epoch": 0.527204086164779, "grad_norm": 0.43111203380102636, "learning_rate": 9.969293160844793e-06, "loss": 0.0435, "step": 1187 }, { "epoch": 0.5276482345103265, "grad_norm": 0.7272050961589046, "learning_rate": 9.969078274287607e-06, "loss": 0.0582, "step": 1188 }, { "epoch": 0.5280923828558739, "grad_norm": 1.0434151274290433, "learning_rate": 9.968862640793547e-06, "loss": 0.0882, "step": 1189 }, { "epoch": 0.5285365312014213, "grad_norm": 0.6610761510472204, "learning_rate": 9.968646260395027e-06, "loss": 0.0701, "step": 1190 }, { "epoch": 0.5289806795469687, "grad_norm": 0.7703015492278306, "learning_rate": 9.96842913312457e-06, "loss": 0.0755, "step": 1191 }, { "epoch": 0.5294248278925161, "grad_norm": 1.1334603205830818, "learning_rate": 9.968211259014817e-06, "loss": 0.102, "step": 1192 }, { "epoch": 0.5298689762380635, "grad_norm": 0.5680812506173959, "learning_rate": 9.967992638098517e-06, "loss": 0.0592, "step": 1193 }, { "epoch": 0.5303131245836109, "grad_norm": 0.7128376996533422, "learning_rate": 9.96777327040853e-06, "loss": 0.0757, "step": 1194 }, { "epoch": 0.5307572729291583, "grad_norm": 0.6547153588619065, "learning_rate": 9.967553155977833e-06, "loss": 0.0513, "step": 1195 }, { "epoch": 0.5312014212747057, "grad_norm": 0.8276344454564539, "learning_rate": 9.967332294839514e-06, "loss": 0.0722, "step": 1196 }, { "epoch": 0.5316455696202531, "grad_norm": 0.5942789130977645, "learning_rate": 9.967110687026769e-06, "loss": 0.0735, "step": 1197 }, { "epoch": 0.5320897179658006, "grad_norm": 0.858420102793841, "learning_rate": 9.966888332572913e-06, "loss": 0.0888, "step": 1198 }, { "epoch": 0.532533866311348, "grad_norm": 0.5306419562771065, "learning_rate": 9.966665231511367e-06, "loss": 0.0735, "step": 1199 }, { "epoch": 0.5329780146568954, "grad_norm": 0.8818696703835058, "learning_rate": 9.96644138387567e-06, "loss": 0.0836, "step": 1200 }, { "epoch": 0.5334221630024428, "grad_norm": 0.6953800899178525, "learning_rate": 9.966216789699466e-06, "loss": 0.0609, "step": 1201 }, { "epoch": 0.5338663113479902, "grad_norm": 0.6141704558915728, "learning_rate": 9.965991449016517e-06, "loss": 0.0593, "step": 1202 }, { "epoch": 0.5343104596935376, "grad_norm": 0.7784143890008246, "learning_rate": 9.965765361860696e-06, "loss": 0.0682, "step": 1203 }, { "epoch": 0.534754608039085, "grad_norm": 0.7591572831776785, "learning_rate": 9.965538528265986e-06, "loss": 0.0713, "step": 1204 }, { "epoch": 0.5351987563846324, "grad_norm": 1.0112012318529477, "learning_rate": 9.965310948266488e-06, "loss": 0.0885, "step": 1205 }, { "epoch": 0.5356429047301798, "grad_norm": 0.540867345550289, "learning_rate": 9.965082621896407e-06, "loss": 0.0683, "step": 1206 }, { "epoch": 0.5360870530757272, "grad_norm": 0.6987948599480821, "learning_rate": 9.964853549190067e-06, "loss": 0.0623, "step": 1207 }, { "epoch": 0.5365312014212748, "grad_norm": 1.0949702611838528, "learning_rate": 9.9646237301819e-06, "loss": 0.0715, "step": 1208 }, { "epoch": 0.5369753497668222, "grad_norm": 0.5434822885792773, "learning_rate": 9.964393164906452e-06, "loss": 0.077, "step": 1209 }, { "epoch": 0.5374194981123696, "grad_norm": 0.785351221183312, "learning_rate": 9.964161853398381e-06, "loss": 0.07, "step": 1210 }, { "epoch": 0.537863646457917, "grad_norm": 0.5762754098127977, "learning_rate": 9.963929795692458e-06, "loss": 0.0553, "step": 1211 }, { "epoch": 0.5383077948034644, "grad_norm": 0.7778031310614973, "learning_rate": 9.963696991823563e-06, "loss": 0.0671, "step": 1212 }, { "epoch": 0.5387519431490118, "grad_norm": 0.7236031416359869, "learning_rate": 9.963463441826693e-06, "loss": 0.0861, "step": 1213 }, { "epoch": 0.5391960914945592, "grad_norm": 0.8183548985521056, "learning_rate": 9.963229145736952e-06, "loss": 0.0905, "step": 1214 }, { "epoch": 0.5396402398401066, "grad_norm": 0.8238577196866539, "learning_rate": 9.96299410358956e-06, "loss": 0.0791, "step": 1215 }, { "epoch": 0.540084388185654, "grad_norm": 0.620826351541747, "learning_rate": 9.962758315419847e-06, "loss": 0.0627, "step": 1216 }, { "epoch": 0.5405285365312015, "grad_norm": 0.8069644430296897, "learning_rate": 9.962521781263259e-06, "loss": 0.0846, "step": 1217 }, { "epoch": 0.5409726848767489, "grad_norm": 0.6551620736321753, "learning_rate": 9.962284501155347e-06, "loss": 0.0626, "step": 1218 }, { "epoch": 0.5414168332222963, "grad_norm": 0.7163185717239511, "learning_rate": 9.96204647513178e-06, "loss": 0.0742, "step": 1219 }, { "epoch": 0.5418609815678437, "grad_norm": 0.6464830809225138, "learning_rate": 9.96180770322834e-06, "loss": 0.0597, "step": 1220 }, { "epoch": 0.5423051299133911, "grad_norm": 0.4702533761313221, "learning_rate": 9.961568185480912e-06, "loss": 0.0663, "step": 1221 }, { "epoch": 0.5427492782589385, "grad_norm": 0.5377074866144479, "learning_rate": 9.961327921925506e-06, "loss": 0.0682, "step": 1222 }, { "epoch": 0.5431934266044859, "grad_norm": 0.6242182477426378, "learning_rate": 9.961086912598232e-06, "loss": 0.0558, "step": 1223 }, { "epoch": 0.5436375749500333, "grad_norm": 0.919842928693314, "learning_rate": 9.960845157535324e-06, "loss": 0.0979, "step": 1224 }, { "epoch": 0.5440817232955807, "grad_norm": 0.4865673297809684, "learning_rate": 9.960602656773118e-06, "loss": 0.0591, "step": 1225 }, { "epoch": 0.5445258716411281, "grad_norm": 0.5994196236588832, "learning_rate": 9.960359410348066e-06, "loss": 0.06, "step": 1226 }, { "epoch": 0.5449700199866756, "grad_norm": 0.6813026653368286, "learning_rate": 9.960115418296734e-06, "loss": 0.0563, "step": 1227 }, { "epoch": 0.545414168332223, "grad_norm": 0.4200726079302436, "learning_rate": 9.959870680655797e-06, "loss": 0.0448, "step": 1228 }, { "epoch": 0.5458583166777704, "grad_norm": 1.3484707743369218, "learning_rate": 9.959625197462042e-06, "loss": 0.058, "step": 1229 }, { "epoch": 0.5463024650233178, "grad_norm": 0.91837601072641, "learning_rate": 9.959378968752371e-06, "loss": 0.1046, "step": 1230 }, { "epoch": 0.5467466133688652, "grad_norm": 0.7675630782674875, "learning_rate": 9.959131994563795e-06, "loss": 0.0738, "step": 1231 }, { "epoch": 0.5471907617144126, "grad_norm": 0.5684812066450396, "learning_rate": 9.958884274933442e-06, "loss": 0.0699, "step": 1232 }, { "epoch": 0.54763491005996, "grad_norm": 0.6471523630810303, "learning_rate": 9.958635809898544e-06, "loss": 0.0713, "step": 1233 }, { "epoch": 0.5480790584055074, "grad_norm": 0.6967540553492192, "learning_rate": 9.95838659949645e-06, "loss": 0.0776, "step": 1234 }, { "epoch": 0.5485232067510548, "grad_norm": 0.6469245863206669, "learning_rate": 9.958136643764624e-06, "loss": 0.0764, "step": 1235 }, { "epoch": 0.5489673550966022, "grad_norm": 0.8210874368087515, "learning_rate": 9.957885942740635e-06, "loss": 0.0779, "step": 1236 }, { "epoch": 0.5494115034421497, "grad_norm": 0.6891702613627725, "learning_rate": 9.957634496462169e-06, "loss": 0.0575, "step": 1237 }, { "epoch": 0.5498556517876971, "grad_norm": 0.5415934981486168, "learning_rate": 9.957382304967024e-06, "loss": 0.0653, "step": 1238 }, { "epoch": 0.5502998001332445, "grad_norm": 0.606098328678092, "learning_rate": 9.957129368293108e-06, "loss": 0.0578, "step": 1239 }, { "epoch": 0.5507439484787919, "grad_norm": 0.5816870040354246, "learning_rate": 9.95687568647844e-06, "loss": 0.0543, "step": 1240 }, { "epoch": 0.5511880968243393, "grad_norm": 0.7853352727315064, "learning_rate": 9.956621259561152e-06, "loss": 0.0731, "step": 1241 }, { "epoch": 0.5516322451698867, "grad_norm": 0.6165368112917523, "learning_rate": 9.956366087579492e-06, "loss": 0.0688, "step": 1242 }, { "epoch": 0.5520763935154341, "grad_norm": 0.8107389760561104, "learning_rate": 9.956110170571816e-06, "loss": 0.0823, "step": 1243 }, { "epoch": 0.5525205418609815, "grad_norm": 0.6920576051280234, "learning_rate": 9.95585350857659e-06, "loss": 0.0661, "step": 1244 }, { "epoch": 0.5529646902065289, "grad_norm": 0.6032293972841873, "learning_rate": 9.9555961016324e-06, "loss": 0.0629, "step": 1245 }, { "epoch": 0.5534088385520763, "grad_norm": 0.7423430185463719, "learning_rate": 9.955337949777931e-06, "loss": 0.0781, "step": 1246 }, { "epoch": 0.5538529868976239, "grad_norm": 0.7004081423242674, "learning_rate": 9.955079053051992e-06, "loss": 0.0695, "step": 1247 }, { "epoch": 0.5542971352431713, "grad_norm": 0.6468930634930644, "learning_rate": 9.9548194114935e-06, "loss": 0.0549, "step": 1248 }, { "epoch": 0.5547412835887187, "grad_norm": 0.4810505823476616, "learning_rate": 9.954559025141484e-06, "loss": 0.0514, "step": 1249 }, { "epoch": 0.5551854319342661, "grad_norm": 0.802783907262409, "learning_rate": 9.95429789403508e-06, "loss": 0.1085, "step": 1250 }, { "epoch": 0.5556295802798135, "grad_norm": 0.5353100797059109, "learning_rate": 9.954036018213548e-06, "loss": 0.0448, "step": 1251 }, { "epoch": 0.5560737286253609, "grad_norm": 0.5946904373037035, "learning_rate": 9.953773397716247e-06, "loss": 0.0724, "step": 1252 }, { "epoch": 0.5565178769709083, "grad_norm": 1.0490812743358358, "learning_rate": 9.953510032582652e-06, "loss": 0.0675, "step": 1253 }, { "epoch": 0.5569620253164557, "grad_norm": 0.5087723175679687, "learning_rate": 9.953245922852355e-06, "loss": 0.0599, "step": 1254 }, { "epoch": 0.5574061736620031, "grad_norm": 0.5243033641589454, "learning_rate": 9.952981068565055e-06, "loss": 0.0569, "step": 1255 }, { "epoch": 0.5578503220075505, "grad_norm": 0.7607475927289213, "learning_rate": 9.952715469760566e-06, "loss": 0.0775, "step": 1256 }, { "epoch": 0.558294470353098, "grad_norm": 0.8559494882081514, "learning_rate": 9.952449126478808e-06, "loss": 0.0866, "step": 1257 }, { "epoch": 0.5587386186986454, "grad_norm": 0.7656790089546245, "learning_rate": 9.952182038759818e-06, "loss": 0.0722, "step": 1258 }, { "epoch": 0.5591827670441928, "grad_norm": 0.5660284655001593, "learning_rate": 9.951914206643744e-06, "loss": 0.0564, "step": 1259 }, { "epoch": 0.5596269153897402, "grad_norm": 0.5835362013761695, "learning_rate": 9.95164563017085e-06, "loss": 0.0788, "step": 1260 }, { "epoch": 0.5600710637352876, "grad_norm": 0.6769019916334489, "learning_rate": 9.951376309381502e-06, "loss": 0.0594, "step": 1261 }, { "epoch": 0.560515212080835, "grad_norm": 0.5253934078988655, "learning_rate": 9.951106244316184e-06, "loss": 0.053, "step": 1262 }, { "epoch": 0.5609593604263824, "grad_norm": 0.7300022918221285, "learning_rate": 9.950835435015495e-06, "loss": 0.0651, "step": 1263 }, { "epoch": 0.5614035087719298, "grad_norm": 0.5278089608881616, "learning_rate": 9.95056388152014e-06, "loss": 0.0555, "step": 1264 }, { "epoch": 0.5618476571174772, "grad_norm": 0.6301552291199896, "learning_rate": 9.950291583870938e-06, "loss": 0.0862, "step": 1265 }, { "epoch": 0.5622918054630246, "grad_norm": 0.7781648389786646, "learning_rate": 9.950018542108818e-06, "loss": 0.0699, "step": 1266 }, { "epoch": 0.5627359538085721, "grad_norm": 0.6889192633019509, "learning_rate": 9.949744756274828e-06, "loss": 0.059, "step": 1267 }, { "epoch": 0.5631801021541195, "grad_norm": 0.84114983491113, "learning_rate": 9.94947022641012e-06, "loss": 0.0855, "step": 1268 }, { "epoch": 0.5636242504996669, "grad_norm": 0.6070250007384902, "learning_rate": 9.949194952555958e-06, "loss": 0.0635, "step": 1269 }, { "epoch": 0.5640683988452143, "grad_norm": 0.8282717974485503, "learning_rate": 9.948918934753724e-06, "loss": 0.0606, "step": 1270 }, { "epoch": 0.5645125471907617, "grad_norm": 0.8327690010964344, "learning_rate": 9.948642173044906e-06, "loss": 0.0617, "step": 1271 }, { "epoch": 0.5649566955363091, "grad_norm": 0.6842942727397727, "learning_rate": 9.948364667471106e-06, "loss": 0.0795, "step": 1272 }, { "epoch": 0.5654008438818565, "grad_norm": 0.5302443265967214, "learning_rate": 9.94808641807404e-06, "loss": 0.0512, "step": 1273 }, { "epoch": 0.5658449922274039, "grad_norm": 0.6110171632056333, "learning_rate": 9.94780742489553e-06, "loss": 0.0574, "step": 1274 }, { "epoch": 0.5662891405729513, "grad_norm": 0.5491078913412093, "learning_rate": 9.947527687977519e-06, "loss": 0.0512, "step": 1275 }, { "epoch": 0.5667332889184987, "grad_norm": 0.6269674055110014, "learning_rate": 9.94724720736205e-06, "loss": 0.074, "step": 1276 }, { "epoch": 0.5671774372640462, "grad_norm": 0.7276246324646, "learning_rate": 9.946965983091286e-06, "loss": 0.1055, "step": 1277 }, { "epoch": 0.5676215856095936, "grad_norm": 0.7755784175713893, "learning_rate": 9.946684015207501e-06, "loss": 0.0878, "step": 1278 }, { "epoch": 0.568065733955141, "grad_norm": 0.6682673092927641, "learning_rate": 9.94640130375308e-06, "loss": 0.0985, "step": 1279 }, { "epoch": 0.5685098823006884, "grad_norm": 0.71248575096518, "learning_rate": 9.946117848770518e-06, "loss": 0.0546, "step": 1280 }, { "epoch": 0.5689540306462358, "grad_norm": 0.6304803324875314, "learning_rate": 9.945833650302423e-06, "loss": 0.0645, "step": 1281 }, { "epoch": 0.5693981789917832, "grad_norm": 0.6709593851017193, "learning_rate": 9.945548708391517e-06, "loss": 0.0711, "step": 1282 }, { "epoch": 0.5698423273373306, "grad_norm": 0.6036373550762799, "learning_rate": 9.94526302308063e-06, "loss": 0.0766, "step": 1283 }, { "epoch": 0.570286475682878, "grad_norm": 0.6055441845545156, "learning_rate": 9.944976594412702e-06, "loss": 0.0663, "step": 1284 }, { "epoch": 0.5707306240284254, "grad_norm": 1.1523396950548679, "learning_rate": 9.944689422430794e-06, "loss": 0.0876, "step": 1285 }, { "epoch": 0.571174772373973, "grad_norm": 0.8498417793676747, "learning_rate": 9.94440150717807e-06, "loss": 0.0764, "step": 1286 }, { "epoch": 0.5716189207195204, "grad_norm": 0.5983413012513809, "learning_rate": 9.944112848697809e-06, "loss": 0.0564, "step": 1287 }, { "epoch": 0.5720630690650678, "grad_norm": 0.7260849111122668, "learning_rate": 9.9438234470334e-06, "loss": 0.0728, "step": 1288 }, { "epoch": 0.5725072174106152, "grad_norm": 1.0050849305541534, "learning_rate": 9.943533302228346e-06, "loss": 0.0711, "step": 1289 }, { "epoch": 0.5729513657561626, "grad_norm": 0.800396030085453, "learning_rate": 9.943242414326263e-06, "loss": 0.0724, "step": 1290 }, { "epoch": 0.57339551410171, "grad_norm": 0.9224090719942251, "learning_rate": 9.94295078337087e-06, "loss": 0.1002, "step": 1291 }, { "epoch": 0.5738396624472574, "grad_norm": 0.8382142062568964, "learning_rate": 9.942658409406012e-06, "loss": 0.0756, "step": 1292 }, { "epoch": 0.5742838107928048, "grad_norm": 0.5876570577747893, "learning_rate": 9.942365292475632e-06, "loss": 0.0605, "step": 1293 }, { "epoch": 0.5747279591383522, "grad_norm": 0.9775545409759507, "learning_rate": 9.942071432623794e-06, "loss": 0.0786, "step": 1294 }, { "epoch": 0.5751721074838996, "grad_norm": 0.6598504226159471, "learning_rate": 9.941776829894667e-06, "loss": 0.0847, "step": 1295 }, { "epoch": 0.5756162558294471, "grad_norm": 0.7984485905609663, "learning_rate": 9.941481484332537e-06, "loss": 0.0699, "step": 1296 }, { "epoch": 0.5760604041749945, "grad_norm": 0.6762263367814108, "learning_rate": 9.941185395981799e-06, "loss": 0.0786, "step": 1297 }, { "epoch": 0.5765045525205419, "grad_norm": 0.7731297634993985, "learning_rate": 9.940888564886959e-06, "loss": 0.0673, "step": 1298 }, { "epoch": 0.5769487008660893, "grad_norm": 0.919743609676064, "learning_rate": 9.940590991092639e-06, "loss": 0.065, "step": 1299 }, { "epoch": 0.5773928492116367, "grad_norm": 0.6382780908912015, "learning_rate": 9.940292674643564e-06, "loss": 0.0578, "step": 1300 }, { "epoch": 0.5778369975571841, "grad_norm": 0.6620249742612588, "learning_rate": 9.93999361558458e-06, "loss": 0.0635, "step": 1301 }, { "epoch": 0.5782811459027315, "grad_norm": 0.7452135944776405, "learning_rate": 9.93969381396064e-06, "loss": 0.0806, "step": 1302 }, { "epoch": 0.5787252942482789, "grad_norm": 0.6403437438856001, "learning_rate": 9.93939326981681e-06, "loss": 0.0687, "step": 1303 }, { "epoch": 0.5791694425938263, "grad_norm": 1.1014917271306792, "learning_rate": 9.939091983198266e-06, "loss": 0.098, "step": 1304 }, { "epoch": 0.5796135909393737, "grad_norm": 1.0225638964559536, "learning_rate": 9.938789954150296e-06, "loss": 0.0644, "step": 1305 }, { "epoch": 0.5800577392849212, "grad_norm": 0.637383310530645, "learning_rate": 9.9384871827183e-06, "loss": 0.06, "step": 1306 }, { "epoch": 0.5805018876304686, "grad_norm": 0.6546930860338038, "learning_rate": 9.93818366894779e-06, "loss": 0.0681, "step": 1307 }, { "epoch": 0.580946035976016, "grad_norm": 0.5341000978856885, "learning_rate": 9.93787941288439e-06, "loss": 0.0589, "step": 1308 }, { "epoch": 0.5813901843215634, "grad_norm": 0.5540761577564739, "learning_rate": 9.937574414573834e-06, "loss": 0.07, "step": 1309 }, { "epoch": 0.5818343326671108, "grad_norm": 0.6399098162524178, "learning_rate": 9.937268674061968e-06, "loss": 0.0681, "step": 1310 }, { "epoch": 0.5822784810126582, "grad_norm": 0.9788827379867293, "learning_rate": 9.936962191394753e-06, "loss": 0.0775, "step": 1311 }, { "epoch": 0.5827226293582056, "grad_norm": 0.5825969221377589, "learning_rate": 9.936654966618255e-06, "loss": 0.0662, "step": 1312 }, { "epoch": 0.583166777703753, "grad_norm": 0.6990166610409554, "learning_rate": 9.936346999778657e-06, "loss": 0.0748, "step": 1313 }, { "epoch": 0.5836109260493004, "grad_norm": 0.7726036624010885, "learning_rate": 9.93603829092225e-06, "loss": 0.0806, "step": 1314 }, { "epoch": 0.5840550743948478, "grad_norm": 0.5892730393957506, "learning_rate": 9.93572884009544e-06, "loss": 0.0687, "step": 1315 }, { "epoch": 0.5844992227403953, "grad_norm": 0.7324361900970796, "learning_rate": 9.935418647344741e-06, "loss": 0.0722, "step": 1316 }, { "epoch": 0.5849433710859427, "grad_norm": 0.8954678705930711, "learning_rate": 9.935107712716781e-06, "loss": 0.0829, "step": 1317 }, { "epoch": 0.5853875194314901, "grad_norm": 0.5986143926701439, "learning_rate": 9.9347960362583e-06, "loss": 0.068, "step": 1318 }, { "epoch": 0.5858316677770375, "grad_norm": 0.6105531585151114, "learning_rate": 9.934483618016148e-06, "loss": 0.0719, "step": 1319 }, { "epoch": 0.586275816122585, "grad_norm": 0.7814579585476528, "learning_rate": 9.934170458037285e-06, "loss": 0.0899, "step": 1320 }, { "epoch": 0.5867199644681323, "grad_norm": 0.6130620918486331, "learning_rate": 9.933856556368785e-06, "loss": 0.0663, "step": 1321 }, { "epoch": 0.5871641128136798, "grad_norm": 0.7911643262947309, "learning_rate": 9.933541913057833e-06, "loss": 0.0782, "step": 1322 }, { "epoch": 0.5876082611592272, "grad_norm": 0.7091228860294103, "learning_rate": 9.933226528151725e-06, "loss": 0.0637, "step": 1323 }, { "epoch": 0.5880524095047746, "grad_norm": 0.7999422262730185, "learning_rate": 9.93291040169787e-06, "loss": 0.0616, "step": 1324 }, { "epoch": 0.588496557850322, "grad_norm": 0.7189040382281308, "learning_rate": 9.932593533743786e-06, "loss": 0.0602, "step": 1325 }, { "epoch": 0.5889407061958695, "grad_norm": 0.5837515830643734, "learning_rate": 9.932275924337104e-06, "loss": 0.0586, "step": 1326 }, { "epoch": 0.5893848545414169, "grad_norm": 0.8281667273984812, "learning_rate": 9.931957573525566e-06, "loss": 0.0648, "step": 1327 }, { "epoch": 0.5898290028869643, "grad_norm": 0.5286485536715172, "learning_rate": 9.931638481357024e-06, "loss": 0.0531, "step": 1328 }, { "epoch": 0.5902731512325117, "grad_norm": 0.6275875445640038, "learning_rate": 9.931318647879445e-06, "loss": 0.064, "step": 1329 }, { "epoch": 0.5907172995780591, "grad_norm": 0.8183874857273685, "learning_rate": 9.930998073140905e-06, "loss": 0.1023, "step": 1330 }, { "epoch": 0.5911614479236065, "grad_norm": 0.6557543608908961, "learning_rate": 9.93067675718959e-06, "loss": 0.0591, "step": 1331 }, { "epoch": 0.5916055962691539, "grad_norm": 0.7946472702361359, "learning_rate": 9.930354700073803e-06, "loss": 0.0783, "step": 1332 }, { "epoch": 0.5920497446147013, "grad_norm": 0.4755584849969953, "learning_rate": 9.930031901841952e-06, "loss": 0.065, "step": 1333 }, { "epoch": 0.5924938929602487, "grad_norm": 0.6292133058734393, "learning_rate": 9.929708362542559e-06, "loss": 0.065, "step": 1334 }, { "epoch": 0.5929380413057961, "grad_norm": 0.7194753693955848, "learning_rate": 9.929384082224258e-06, "loss": 0.0649, "step": 1335 }, { "epoch": 0.5933821896513436, "grad_norm": 0.8182929882444262, "learning_rate": 9.929059060935795e-06, "loss": 0.0735, "step": 1336 }, { "epoch": 0.593826337996891, "grad_norm": 0.6452166028375979, "learning_rate": 9.928733298726024e-06, "loss": 0.0773, "step": 1337 }, { "epoch": 0.5942704863424384, "grad_norm": 0.6174785647804547, "learning_rate": 9.928406795643913e-06, "loss": 0.088, "step": 1338 }, { "epoch": 0.5947146346879858, "grad_norm": 0.9075257756258466, "learning_rate": 9.928079551738542e-06, "loss": 0.0966, "step": 1339 }, { "epoch": 0.5951587830335332, "grad_norm": 0.7930757535133779, "learning_rate": 9.927751567059103e-06, "loss": 0.0788, "step": 1340 }, { "epoch": 0.5956029313790806, "grad_norm": 0.7918355971015104, "learning_rate": 9.927422841654894e-06, "loss": 0.0732, "step": 1341 }, { "epoch": 0.596047079724628, "grad_norm": 0.5344795042621896, "learning_rate": 9.92709337557533e-06, "loss": 0.0555, "step": 1342 }, { "epoch": 0.5964912280701754, "grad_norm": 0.7151020498309698, "learning_rate": 9.926763168869935e-06, "loss": 0.0699, "step": 1343 }, { "epoch": 0.5969353764157228, "grad_norm": 0.8228281453238034, "learning_rate": 9.926432221588342e-06, "loss": 0.0723, "step": 1344 }, { "epoch": 0.5973795247612703, "grad_norm": 0.6976750613205994, "learning_rate": 9.926100533780304e-06, "loss": 0.0877, "step": 1345 }, { "epoch": 0.5978236731068177, "grad_norm": 0.8796022908023586, "learning_rate": 9.925768105495675e-06, "loss": 0.0867, "step": 1346 }, { "epoch": 0.5982678214523651, "grad_norm": 0.6214669294617633, "learning_rate": 9.925434936784426e-06, "loss": 0.0724, "step": 1347 }, { "epoch": 0.5987119697979125, "grad_norm": 0.5493814047261378, "learning_rate": 9.925101027696636e-06, "loss": 0.0546, "step": 1348 }, { "epoch": 0.5991561181434599, "grad_norm": 0.6738873954295155, "learning_rate": 9.924766378282499e-06, "loss": 0.068, "step": 1349 }, { "epoch": 0.5996002664890073, "grad_norm": 0.662002450028893, "learning_rate": 9.92443098859232e-06, "loss": 0.0618, "step": 1350 }, { "epoch": 0.6000444148345547, "grad_norm": 0.5691844703394149, "learning_rate": 9.92409485867651e-06, "loss": 0.0675, "step": 1351 }, { "epoch": 0.6004885631801021, "grad_norm": 0.5107643154244839, "learning_rate": 9.923757988585599e-06, "loss": 0.0582, "step": 1352 }, { "epoch": 0.6009327115256495, "grad_norm": 0.4892180420795259, "learning_rate": 9.923420378370221e-06, "loss": 0.0707, "step": 1353 }, { "epoch": 0.6013768598711969, "grad_norm": 0.5128501684775312, "learning_rate": 9.923082028081125e-06, "loss": 0.0515, "step": 1354 }, { "epoch": 0.6018210082167444, "grad_norm": 0.6712578771133637, "learning_rate": 9.922742937769172e-06, "loss": 0.0668, "step": 1355 }, { "epoch": 0.6022651565622918, "grad_norm": 0.8489596826538364, "learning_rate": 9.922403107485335e-06, "loss": 0.0852, "step": 1356 }, { "epoch": 0.6027093049078392, "grad_norm": 0.7130581914563381, "learning_rate": 9.922062537280692e-06, "loss": 0.0877, "step": 1357 }, { "epoch": 0.6031534532533867, "grad_norm": 0.5999507766881641, "learning_rate": 9.921721227206438e-06, "loss": 0.0693, "step": 1358 }, { "epoch": 0.603597601598934, "grad_norm": 0.5747773135899322, "learning_rate": 9.92137917731388e-06, "loss": 0.0556, "step": 1359 }, { "epoch": 0.6040417499444815, "grad_norm": 0.7771993716251702, "learning_rate": 9.921036387654429e-06, "loss": 0.0643, "step": 1360 }, { "epoch": 0.6044858982900289, "grad_norm": 1.307956020844314, "learning_rate": 9.920692858279616e-06, "loss": 0.0551, "step": 1361 }, { "epoch": 0.6049300466355763, "grad_norm": 0.6255379323591398, "learning_rate": 9.92034858924108e-06, "loss": 0.0636, "step": 1362 }, { "epoch": 0.6053741949811237, "grad_norm": 0.553962551501975, "learning_rate": 9.92000358059057e-06, "loss": 0.0814, "step": 1363 }, { "epoch": 0.6058183433266711, "grad_norm": 0.7454747633443178, "learning_rate": 9.919657832379943e-06, "loss": 0.0603, "step": 1364 }, { "epoch": 0.6062624916722186, "grad_norm": 0.9733260117481642, "learning_rate": 9.919311344661174e-06, "loss": 0.1153, "step": 1365 }, { "epoch": 0.606706640017766, "grad_norm": 0.613902681985122, "learning_rate": 9.918964117486346e-06, "loss": 0.0578, "step": 1366 }, { "epoch": 0.6071507883633134, "grad_norm": 0.7376648326818326, "learning_rate": 9.918616150907651e-06, "loss": 0.066, "step": 1367 }, { "epoch": 0.6075949367088608, "grad_norm": 0.6476167687641429, "learning_rate": 9.918267444977398e-06, "loss": 0.0467, "step": 1368 }, { "epoch": 0.6080390850544082, "grad_norm": 0.6720043569712078, "learning_rate": 9.917917999747999e-06, "loss": 0.0647, "step": 1369 }, { "epoch": 0.6084832333999556, "grad_norm": 0.6554718376719761, "learning_rate": 9.917567815271986e-06, "loss": 0.0652, "step": 1370 }, { "epoch": 0.608927381745503, "grad_norm": 0.7668531453750762, "learning_rate": 9.917216891601996e-06, "loss": 0.07, "step": 1371 }, { "epoch": 0.6093715300910504, "grad_norm": 0.8088811507082688, "learning_rate": 9.916865228790776e-06, "loss": 0.0771, "step": 1372 }, { "epoch": 0.6098156784365978, "grad_norm": 0.687085496356637, "learning_rate": 9.91651282689119e-06, "loss": 0.0758, "step": 1373 }, { "epoch": 0.6102598267821452, "grad_norm": 0.7118263597209087, "learning_rate": 9.916159685956208e-06, "loss": 0.0704, "step": 1374 }, { "epoch": 0.6107039751276927, "grad_norm": 0.5722664877188374, "learning_rate": 9.915805806038917e-06, "loss": 0.0568, "step": 1375 }, { "epoch": 0.6111481234732401, "grad_norm": 0.5809145124340924, "learning_rate": 9.915451187192507e-06, "loss": 0.0508, "step": 1376 }, { "epoch": 0.6115922718187875, "grad_norm": 0.7936146709815162, "learning_rate": 9.915095829470284e-06, "loss": 0.0685, "step": 1377 }, { "epoch": 0.6120364201643349, "grad_norm": 0.6343789137617302, "learning_rate": 9.914739732925665e-06, "loss": 0.0727, "step": 1378 }, { "epoch": 0.6124805685098823, "grad_norm": 0.6049507318538985, "learning_rate": 9.914382897612178e-06, "loss": 0.0613, "step": 1379 }, { "epoch": 0.6129247168554297, "grad_norm": 0.7820325942346816, "learning_rate": 9.91402532358346e-06, "loss": 0.0631, "step": 1380 }, { "epoch": 0.6133688652009771, "grad_norm": 0.647686549326212, "learning_rate": 9.913667010893261e-06, "loss": 0.0593, "step": 1381 }, { "epoch": 0.6138130135465245, "grad_norm": 0.5699910207599692, "learning_rate": 9.913307959595443e-06, "loss": 0.0617, "step": 1382 }, { "epoch": 0.6142571618920719, "grad_norm": 0.8214987498163691, "learning_rate": 9.912948169743977e-06, "loss": 0.1062, "step": 1383 }, { "epoch": 0.6147013102376193, "grad_norm": 0.7190447874189771, "learning_rate": 9.912587641392943e-06, "loss": 0.0816, "step": 1384 }, { "epoch": 0.6151454585831668, "grad_norm": 0.4801110982287438, "learning_rate": 9.912226374596536e-06, "loss": 0.0531, "step": 1385 }, { "epoch": 0.6155896069287142, "grad_norm": 0.5777279849946639, "learning_rate": 9.911864369409062e-06, "loss": 0.0593, "step": 1386 }, { "epoch": 0.6160337552742616, "grad_norm": 0.6445627055234285, "learning_rate": 9.911501625884934e-06, "loss": 0.0562, "step": 1387 }, { "epoch": 0.616477903619809, "grad_norm": 0.6402760221505619, "learning_rate": 9.911138144078681e-06, "loss": 0.0628, "step": 1388 }, { "epoch": 0.6169220519653564, "grad_norm": 0.5943075838810645, "learning_rate": 9.910773924044937e-06, "loss": 0.0694, "step": 1389 }, { "epoch": 0.6173662003109038, "grad_norm": 0.6887164265976037, "learning_rate": 9.910408965838455e-06, "loss": 0.0662, "step": 1390 }, { "epoch": 0.6178103486564512, "grad_norm": 0.7025269495789147, "learning_rate": 9.91004326951409e-06, "loss": 0.0596, "step": 1391 }, { "epoch": 0.6182544970019986, "grad_norm": 0.6968614560116964, "learning_rate": 9.909676835126819e-06, "loss": 0.08, "step": 1392 }, { "epoch": 0.618698645347546, "grad_norm": 0.9599784608542535, "learning_rate": 9.909309662731713e-06, "loss": 0.0981, "step": 1393 }, { "epoch": 0.6191427936930934, "grad_norm": 0.5936127126118025, "learning_rate": 9.908941752383974e-06, "loss": 0.0502, "step": 1394 }, { "epoch": 0.619586942038641, "grad_norm": 0.6281665293752687, "learning_rate": 9.9085731041389e-06, "loss": 0.0894, "step": 1395 }, { "epoch": 0.6200310903841884, "grad_norm": 0.7270920690197641, "learning_rate": 9.908203718051907e-06, "loss": 0.0772, "step": 1396 }, { "epoch": 0.6204752387297358, "grad_norm": 0.9689326115460888, "learning_rate": 9.90783359417852e-06, "loss": 0.0955, "step": 1397 }, { "epoch": 0.6209193870752832, "grad_norm": 0.5718494750591436, "learning_rate": 9.907462732574373e-06, "loss": 0.0586, "step": 1398 }, { "epoch": 0.6213635354208306, "grad_norm": 0.547133748335618, "learning_rate": 9.907091133295214e-06, "loss": 0.0546, "step": 1399 }, { "epoch": 0.621807683766378, "grad_norm": 0.7492139302396027, "learning_rate": 9.906718796396901e-06, "loss": 0.0726, "step": 1400 }, { "epoch": 0.6222518321119254, "grad_norm": 1.1332966961373652, "learning_rate": 9.906345721935402e-06, "loss": 0.0837, "step": 1401 }, { "epoch": 0.6226959804574728, "grad_norm": 0.6506799230647055, "learning_rate": 9.905971909966798e-06, "loss": 0.0676, "step": 1402 }, { "epoch": 0.6231401288030202, "grad_norm": 0.7078271615105967, "learning_rate": 9.905597360547276e-06, "loss": 0.0763, "step": 1403 }, { "epoch": 0.6235842771485676, "grad_norm": 0.8061592052669874, "learning_rate": 9.90522207373314e-06, "loss": 0.0941, "step": 1404 }, { "epoch": 0.6240284254941151, "grad_norm": 0.7215188703499849, "learning_rate": 9.904846049580804e-06, "loss": 0.0913, "step": 1405 }, { "epoch": 0.6244725738396625, "grad_norm": 0.7648810068346831, "learning_rate": 9.904469288146785e-06, "loss": 0.103, "step": 1406 }, { "epoch": 0.6249167221852099, "grad_norm": 0.7848759400980109, "learning_rate": 9.90409178948772e-06, "loss": 0.0853, "step": 1407 }, { "epoch": 0.6253608705307573, "grad_norm": 0.5249169314876924, "learning_rate": 9.903713553660352e-06, "loss": 0.0539, "step": 1408 }, { "epoch": 0.6258050188763047, "grad_norm": 0.651696006803284, "learning_rate": 9.90333458072154e-06, "loss": 0.0725, "step": 1409 }, { "epoch": 0.6262491672218521, "grad_norm": 0.5581372640238701, "learning_rate": 9.902954870728246e-06, "loss": 0.0516, "step": 1410 }, { "epoch": 0.6266933155673995, "grad_norm": 0.46394395658623727, "learning_rate": 9.902574423737547e-06, "loss": 0.0543, "step": 1411 }, { "epoch": 0.6271374639129469, "grad_norm": 0.6101555748112776, "learning_rate": 9.902193239806634e-06, "loss": 0.0688, "step": 1412 }, { "epoch": 0.6275816122584943, "grad_norm": 0.6266576234326965, "learning_rate": 9.901811318992802e-06, "loss": 0.0619, "step": 1413 }, { "epoch": 0.6280257606040418, "grad_norm": 0.9093069033662159, "learning_rate": 9.901428661353462e-06, "loss": 0.1009, "step": 1414 }, { "epoch": 0.6284699089495892, "grad_norm": 0.796133446241698, "learning_rate": 9.901045266946134e-06, "loss": 0.0919, "step": 1415 }, { "epoch": 0.6289140572951366, "grad_norm": 0.75094830010195, "learning_rate": 9.900661135828448e-06, "loss": 0.0594, "step": 1416 }, { "epoch": 0.629358205640684, "grad_norm": 0.8379496784608408, "learning_rate": 9.900276268058147e-06, "loss": 0.0659, "step": 1417 }, { "epoch": 0.6298023539862314, "grad_norm": 0.7794466397530133, "learning_rate": 9.899890663693078e-06, "loss": 0.086, "step": 1418 }, { "epoch": 0.6302465023317788, "grad_norm": 1.781154839834256, "learning_rate": 9.899504322791212e-06, "loss": 0.1089, "step": 1419 }, { "epoch": 0.6306906506773262, "grad_norm": 0.582797339406938, "learning_rate": 9.899117245410615e-06, "loss": 0.0449, "step": 1420 }, { "epoch": 0.6311347990228736, "grad_norm": 0.7463622614388931, "learning_rate": 9.898729431609477e-06, "loss": 0.0542, "step": 1421 }, { "epoch": 0.631578947368421, "grad_norm": 0.7697052470584549, "learning_rate": 9.89834088144609e-06, "loss": 0.0635, "step": 1422 }, { "epoch": 0.6320230957139684, "grad_norm": 0.900347396335065, "learning_rate": 9.897951594978858e-06, "loss": 0.0778, "step": 1423 }, { "epoch": 0.6324672440595159, "grad_norm": 0.6468712468526486, "learning_rate": 9.897561572266301e-06, "loss": 0.066, "step": 1424 }, { "epoch": 0.6329113924050633, "grad_norm": 0.4986782583243913, "learning_rate": 9.897170813367045e-06, "loss": 0.0662, "step": 1425 }, { "epoch": 0.6333555407506107, "grad_norm": 0.6014710971137512, "learning_rate": 9.896779318339826e-06, "loss": 0.0657, "step": 1426 }, { "epoch": 0.6337996890961581, "grad_norm": 0.8579597362562785, "learning_rate": 9.896387087243496e-06, "loss": 0.08, "step": 1427 }, { "epoch": 0.6342438374417055, "grad_norm": 0.9386752220131082, "learning_rate": 9.89599412013701e-06, "loss": 0.1233, "step": 1428 }, { "epoch": 0.6346879857872529, "grad_norm": 0.7426460359725069, "learning_rate": 9.89560041707944e-06, "loss": 0.0861, "step": 1429 }, { "epoch": 0.6351321341328003, "grad_norm": 0.5143125228492677, "learning_rate": 9.895205978129966e-06, "loss": 0.0446, "step": 1430 }, { "epoch": 0.6355762824783477, "grad_norm": 1.049177103786211, "learning_rate": 9.894810803347878e-06, "loss": 0.0912, "step": 1431 }, { "epoch": 0.6360204308238951, "grad_norm": 0.5972518436510755, "learning_rate": 9.894414892792579e-06, "loss": 0.0657, "step": 1432 }, { "epoch": 0.6364645791694425, "grad_norm": 0.4154671445638412, "learning_rate": 9.894018246523577e-06, "loss": 0.0532, "step": 1433 }, { "epoch": 0.6369087275149901, "grad_norm": 0.65372867549855, "learning_rate": 9.893620864600501e-06, "loss": 0.087, "step": 1434 }, { "epoch": 0.6373528758605375, "grad_norm": 0.7777909981283909, "learning_rate": 9.89322274708308e-06, "loss": 0.0795, "step": 1435 }, { "epoch": 0.6377970242060849, "grad_norm": 0.6281013351353801, "learning_rate": 9.892823894031159e-06, "loss": 0.0525, "step": 1436 }, { "epoch": 0.6382411725516323, "grad_norm": 0.5274870510804406, "learning_rate": 9.89242430550469e-06, "loss": 0.0701, "step": 1437 }, { "epoch": 0.6386853208971797, "grad_norm": 0.9840882810513909, "learning_rate": 9.892023981563744e-06, "loss": 0.0888, "step": 1438 }, { "epoch": 0.6391294692427271, "grad_norm": 0.7186121306665683, "learning_rate": 9.89162292226849e-06, "loss": 0.0666, "step": 1439 }, { "epoch": 0.6395736175882745, "grad_norm": 0.7619824705257715, "learning_rate": 9.891221127679216e-06, "loss": 0.0654, "step": 1440 }, { "epoch": 0.6400177659338219, "grad_norm": 0.6418832068583645, "learning_rate": 9.89081859785632e-06, "loss": 0.0706, "step": 1441 }, { "epoch": 0.6404619142793693, "grad_norm": 0.6934692957378497, "learning_rate": 9.890415332860308e-06, "loss": 0.0684, "step": 1442 }, { "epoch": 0.6409060626249167, "grad_norm": 0.6769661850243969, "learning_rate": 9.8900113327518e-06, "loss": 0.0684, "step": 1443 }, { "epoch": 0.6413502109704642, "grad_norm": 0.6158605594742834, "learning_rate": 9.88960659759152e-06, "loss": 0.0652, "step": 1444 }, { "epoch": 0.6417943593160116, "grad_norm": 0.5847451238116226, "learning_rate": 9.88920112744031e-06, "loss": 0.062, "step": 1445 }, { "epoch": 0.642238507661559, "grad_norm": 0.8117410133012406, "learning_rate": 9.888794922359116e-06, "loss": 0.0611, "step": 1446 }, { "epoch": 0.6426826560071064, "grad_norm": 0.7705515566871968, "learning_rate": 9.888387982408998e-06, "loss": 0.0752, "step": 1447 }, { "epoch": 0.6431268043526538, "grad_norm": 0.777589892103545, "learning_rate": 9.887980307651128e-06, "loss": 0.0764, "step": 1448 }, { "epoch": 0.6435709526982012, "grad_norm": 0.8491119564862617, "learning_rate": 9.887571898146787e-06, "loss": 0.081, "step": 1449 }, { "epoch": 0.6440151010437486, "grad_norm": 0.7918875216411717, "learning_rate": 9.887162753957362e-06, "loss": 0.0626, "step": 1450 }, { "epoch": 0.644459249389296, "grad_norm": 0.5756994501297381, "learning_rate": 9.886752875144358e-06, "loss": 0.0635, "step": 1451 }, { "epoch": 0.6449033977348434, "grad_norm": 0.6492149852575843, "learning_rate": 9.886342261769387e-06, "loss": 0.0632, "step": 1452 }, { "epoch": 0.6453475460803908, "grad_norm": 0.7503617012041023, "learning_rate": 9.885930913894166e-06, "loss": 0.0627, "step": 1453 }, { "epoch": 0.6457916944259383, "grad_norm": 0.6405303313329822, "learning_rate": 9.885518831580533e-06, "loss": 0.0782, "step": 1454 }, { "epoch": 0.6462358427714857, "grad_norm": 0.6250298341424497, "learning_rate": 9.88510601489043e-06, "loss": 0.0807, "step": 1455 }, { "epoch": 0.6466799911170331, "grad_norm": 0.7939161691732389, "learning_rate": 9.88469246388591e-06, "loss": 0.069, "step": 1456 }, { "epoch": 0.6471241394625805, "grad_norm": 0.7492359887972416, "learning_rate": 9.884278178629134e-06, "loss": 0.0854, "step": 1457 }, { "epoch": 0.6475682878081279, "grad_norm": 0.6085287559130933, "learning_rate": 9.883863159182379e-06, "loss": 0.0608, "step": 1458 }, { "epoch": 0.6480124361536753, "grad_norm": 0.6361755513609607, "learning_rate": 9.883447405608032e-06, "loss": 0.0433, "step": 1459 }, { "epoch": 0.6484565844992227, "grad_norm": 0.6431691982633025, "learning_rate": 9.88303091796858e-06, "loss": 0.0419, "step": 1460 }, { "epoch": 0.6489007328447701, "grad_norm": 0.8190085662166918, "learning_rate": 9.882613696326634e-06, "loss": 0.0767, "step": 1461 }, { "epoch": 0.6493448811903175, "grad_norm": 0.755548239297633, "learning_rate": 9.882195740744911e-06, "loss": 0.0923, "step": 1462 }, { "epoch": 0.6497890295358649, "grad_norm": 0.5719039887354458, "learning_rate": 9.881777051286232e-06, "loss": 0.0723, "step": 1463 }, { "epoch": 0.6502331778814124, "grad_norm": 0.7825393714474173, "learning_rate": 9.881357628013535e-06, "loss": 0.0835, "step": 1464 }, { "epoch": 0.6506773262269598, "grad_norm": 0.6610007380508074, "learning_rate": 9.880937470989868e-06, "loss": 0.0656, "step": 1465 }, { "epoch": 0.6511214745725072, "grad_norm": 0.6218987787310035, "learning_rate": 9.880516580278386e-06, "loss": 0.0594, "step": 1466 }, { "epoch": 0.6515656229180546, "grad_norm": 0.6871813560151103, "learning_rate": 9.880094955942357e-06, "loss": 0.0539, "step": 1467 }, { "epoch": 0.652009771263602, "grad_norm": 0.6311698562321532, "learning_rate": 9.879672598045156e-06, "loss": 0.0806, "step": 1468 }, { "epoch": 0.6524539196091494, "grad_norm": 0.6693355850981739, "learning_rate": 9.879249506650275e-06, "loss": 0.0856, "step": 1469 }, { "epoch": 0.6528980679546968, "grad_norm": 0.7127562713682332, "learning_rate": 9.878825681821306e-06, "loss": 0.0685, "step": 1470 }, { "epoch": 0.6533422163002442, "grad_norm": 0.9425811868582009, "learning_rate": 9.878401123621963e-06, "loss": 0.0823, "step": 1471 }, { "epoch": 0.6537863646457917, "grad_norm": 0.6086374723526442, "learning_rate": 9.87797583211606e-06, "loss": 0.0614, "step": 1472 }, { "epoch": 0.654230512991339, "grad_norm": 0.7763257358459474, "learning_rate": 9.877549807367528e-06, "loss": 0.0857, "step": 1473 }, { "epoch": 0.6546746613368866, "grad_norm": 0.539995990500211, "learning_rate": 9.877123049440405e-06, "loss": 0.0531, "step": 1474 }, { "epoch": 0.655118809682434, "grad_norm": 0.5915202116072419, "learning_rate": 9.876695558398838e-06, "loss": 0.0663, "step": 1475 }, { "epoch": 0.6555629580279814, "grad_norm": 0.6814775856965638, "learning_rate": 9.876267334307091e-06, "loss": 0.0536, "step": 1476 }, { "epoch": 0.6560071063735288, "grad_norm": 0.6565534779979346, "learning_rate": 9.875838377229528e-06, "loss": 0.0854, "step": 1477 }, { "epoch": 0.6564512547190762, "grad_norm": 0.6074291178048928, "learning_rate": 9.875408687230633e-06, "loss": 0.0643, "step": 1478 }, { "epoch": 0.6568954030646236, "grad_norm": 0.6683735797478915, "learning_rate": 9.874978264374991e-06, "loss": 0.0657, "step": 1479 }, { "epoch": 0.657339551410171, "grad_norm": 0.6802029779866509, "learning_rate": 9.874547108727306e-06, "loss": 0.0571, "step": 1480 }, { "epoch": 0.6577836997557184, "grad_norm": 0.7709030410876545, "learning_rate": 9.874115220352386e-06, "loss": 0.0515, "step": 1481 }, { "epoch": 0.6582278481012658, "grad_norm": 0.6029748648689206, "learning_rate": 9.873682599315152e-06, "loss": 0.054, "step": 1482 }, { "epoch": 0.6586719964468133, "grad_norm": 0.7218151069926805, "learning_rate": 9.873249245680634e-06, "loss": 0.0842, "step": 1483 }, { "epoch": 0.6591161447923607, "grad_norm": 0.7118725638175186, "learning_rate": 9.872815159513972e-06, "loss": 0.0783, "step": 1484 }, { "epoch": 0.6595602931379081, "grad_norm": 0.5278035850844903, "learning_rate": 9.872380340880416e-06, "loss": 0.0504, "step": 1485 }, { "epoch": 0.6600044414834555, "grad_norm": 0.7090158968533085, "learning_rate": 9.87194478984533e-06, "loss": 0.0518, "step": 1486 }, { "epoch": 0.6604485898290029, "grad_norm": 0.7140940579751807, "learning_rate": 9.87150850647418e-06, "loss": 0.0644, "step": 1487 }, { "epoch": 0.6608927381745503, "grad_norm": 0.5233702996222387, "learning_rate": 9.87107149083255e-06, "loss": 0.0484, "step": 1488 }, { "epoch": 0.6613368865200977, "grad_norm": 0.5255868844598361, "learning_rate": 9.870633742986129e-06, "loss": 0.0486, "step": 1489 }, { "epoch": 0.6617810348656451, "grad_norm": 0.7456554727487221, "learning_rate": 9.870195263000719e-06, "loss": 0.0926, "step": 1490 }, { "epoch": 0.6622251832111925, "grad_norm": 0.5094239000908518, "learning_rate": 9.869756050942231e-06, "loss": 0.0609, "step": 1491 }, { "epoch": 0.6626693315567399, "grad_norm": 0.5934076694430587, "learning_rate": 9.869316106876687e-06, "loss": 0.0815, "step": 1492 }, { "epoch": 0.6631134799022874, "grad_norm": 0.5800872547088418, "learning_rate": 9.868875430870217e-06, "loss": 0.0562, "step": 1493 }, { "epoch": 0.6635576282478348, "grad_norm": 0.6466806398918721, "learning_rate": 9.86843402298906e-06, "loss": 0.0576, "step": 1494 }, { "epoch": 0.6640017765933822, "grad_norm": 0.49149929770034256, "learning_rate": 9.86799188329957e-06, "loss": 0.0552, "step": 1495 }, { "epoch": 0.6644459249389296, "grad_norm": 0.6869307446643815, "learning_rate": 9.867549011868208e-06, "loss": 0.0689, "step": 1496 }, { "epoch": 0.664890073284477, "grad_norm": 1.0182179632443422, "learning_rate": 9.867105408761544e-06, "loss": 0.0658, "step": 1497 }, { "epoch": 0.6653342216300244, "grad_norm": 0.5094314413955369, "learning_rate": 9.866661074046258e-06, "loss": 0.0509, "step": 1498 }, { "epoch": 0.6657783699755718, "grad_norm": 0.6982818166570782, "learning_rate": 9.866216007789145e-06, "loss": 0.0728, "step": 1499 }, { "epoch": 0.6662225183211192, "grad_norm": 0.7524936010719828, "learning_rate": 9.8657702100571e-06, "loss": 0.0911, "step": 1500 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7557138142113028, "learning_rate": 9.86532368091714e-06, "loss": 0.059, "step": 1501 }, { "epoch": 0.667110815012214, "grad_norm": 0.6888932895497213, "learning_rate": 9.864876420436383e-06, "loss": 0.0614, "step": 1502 }, { "epoch": 0.6675549633577615, "grad_norm": 0.5506269374818886, "learning_rate": 9.86442842868206e-06, "loss": 0.0599, "step": 1503 }, { "epoch": 0.667999111703309, "grad_norm": 0.8018485119174299, "learning_rate": 9.86397970572151e-06, "loss": 0.0875, "step": 1504 }, { "epoch": 0.6684432600488563, "grad_norm": 0.5631462958993229, "learning_rate": 9.863530251622189e-06, "loss": 0.0691, "step": 1505 }, { "epoch": 0.6688874083944037, "grad_norm": 0.8176186875229494, "learning_rate": 9.863080066451653e-06, "loss": 0.072, "step": 1506 }, { "epoch": 0.6693315567399512, "grad_norm": 0.7128371312290194, "learning_rate": 9.862629150277574e-06, "loss": 0.0773, "step": 1507 }, { "epoch": 0.6697757050854986, "grad_norm": 0.759379781839776, "learning_rate": 9.86217750316773e-06, "loss": 0.0729, "step": 1508 }, { "epoch": 0.670219853431046, "grad_norm": 0.6634457926439065, "learning_rate": 9.861725125190017e-06, "loss": 0.0595, "step": 1509 }, { "epoch": 0.6706640017765934, "grad_norm": 0.7426433953184679, "learning_rate": 9.861272016412429e-06, "loss": 0.065, "step": 1510 }, { "epoch": 0.6711081501221408, "grad_norm": 0.6600032637907064, "learning_rate": 9.86081817690308e-06, "loss": 0.0625, "step": 1511 }, { "epoch": 0.6715522984676882, "grad_norm": 0.703748253196849, "learning_rate": 9.860363606730185e-06, "loss": 0.073, "step": 1512 }, { "epoch": 0.6719964468132357, "grad_norm": 0.8768454660219879, "learning_rate": 9.85990830596208e-06, "loss": 0.093, "step": 1513 }, { "epoch": 0.6724405951587831, "grad_norm": 0.5810307286903331, "learning_rate": 9.859452274667199e-06, "loss": 0.0664, "step": 1514 }, { "epoch": 0.6728847435043305, "grad_norm": 0.6467795840989637, "learning_rate": 9.858995512914096e-06, "loss": 0.0736, "step": 1515 }, { "epoch": 0.6733288918498779, "grad_norm": 0.8604239627870944, "learning_rate": 9.858538020771424e-06, "loss": 0.0819, "step": 1516 }, { "epoch": 0.6737730401954253, "grad_norm": 0.5072234228135453, "learning_rate": 9.858079798307959e-06, "loss": 0.0764, "step": 1517 }, { "epoch": 0.6742171885409727, "grad_norm": 0.8843203312811015, "learning_rate": 9.857620845592573e-06, "loss": 0.0941, "step": 1518 }, { "epoch": 0.6746613368865201, "grad_norm": 0.7766118997696602, "learning_rate": 9.85716116269426e-06, "loss": 0.0697, "step": 1519 }, { "epoch": 0.6751054852320675, "grad_norm": 0.5371610065691803, "learning_rate": 9.856700749682114e-06, "loss": 0.075, "step": 1520 }, { "epoch": 0.6755496335776149, "grad_norm": 0.5048461380926635, "learning_rate": 9.856239606625345e-06, "loss": 0.0584, "step": 1521 }, { "epoch": 0.6759937819231623, "grad_norm": 0.819913205588501, "learning_rate": 9.855777733593269e-06, "loss": 0.0748, "step": 1522 }, { "epoch": 0.6764379302687098, "grad_norm": 0.6492894969300076, "learning_rate": 9.855315130655315e-06, "loss": 0.0573, "step": 1523 }, { "epoch": 0.6768820786142572, "grad_norm": 1.8883752330861565, "learning_rate": 9.854851797881018e-06, "loss": 0.0586, "step": 1524 }, { "epoch": 0.6773262269598046, "grad_norm": 0.6674563588470771, "learning_rate": 9.854387735340028e-06, "loss": 0.0618, "step": 1525 }, { "epoch": 0.677770375305352, "grad_norm": 0.5745105185416173, "learning_rate": 9.853922943102099e-06, "loss": 0.0486, "step": 1526 }, { "epoch": 0.6782145236508994, "grad_norm": 0.8823764363833241, "learning_rate": 9.853457421237098e-06, "loss": 0.0655, "step": 1527 }, { "epoch": 0.6786586719964468, "grad_norm": 0.7502418552024257, "learning_rate": 9.852991169815002e-06, "loss": 0.0697, "step": 1528 }, { "epoch": 0.6791028203419942, "grad_norm": 0.9371750933741902, "learning_rate": 9.852524188905894e-06, "loss": 0.0637, "step": 1529 }, { "epoch": 0.6795469686875416, "grad_norm": 0.4760980667666431, "learning_rate": 9.85205647857997e-06, "loss": 0.0507, "step": 1530 }, { "epoch": 0.679991117033089, "grad_norm": 0.5957775022770642, "learning_rate": 9.851588038907536e-06, "loss": 0.0725, "step": 1531 }, { "epoch": 0.6804352653786364, "grad_norm": 0.8583868263839582, "learning_rate": 9.851118869959006e-06, "loss": 0.0926, "step": 1532 }, { "epoch": 0.6808794137241839, "grad_norm": 0.5065773305637178, "learning_rate": 9.850648971804903e-06, "loss": 0.0519, "step": 1533 }, { "epoch": 0.6813235620697313, "grad_norm": 0.6463242249375162, "learning_rate": 9.850178344515861e-06, "loss": 0.0495, "step": 1534 }, { "epoch": 0.6817677104152787, "grad_norm": 0.713200223511551, "learning_rate": 9.849706988162626e-06, "loss": 0.0667, "step": 1535 }, { "epoch": 0.6822118587608261, "grad_norm": 0.8106886906091921, "learning_rate": 9.849234902816047e-06, "loss": 0.078, "step": 1536 }, { "epoch": 0.6826560071063735, "grad_norm": 0.618211559505216, "learning_rate": 9.848762088547089e-06, "loss": 0.0499, "step": 1537 }, { "epoch": 0.6831001554519209, "grad_norm": 0.4683137101850568, "learning_rate": 9.848288545426821e-06, "loss": 0.0485, "step": 1538 }, { "epoch": 0.6835443037974683, "grad_norm": 0.6498884933818719, "learning_rate": 9.847814273526428e-06, "loss": 0.0717, "step": 1539 }, { "epoch": 0.6839884521430157, "grad_norm": 0.6112948857459086, "learning_rate": 9.8473392729172e-06, "loss": 0.0556, "step": 1540 }, { "epoch": 0.6844326004885631, "grad_norm": 0.6350627430979878, "learning_rate": 9.846863543670536e-06, "loss": 0.067, "step": 1541 }, { "epoch": 0.6848767488341105, "grad_norm": 1.1432261374175676, "learning_rate": 9.846387085857949e-06, "loss": 0.1195, "step": 1542 }, { "epoch": 0.685320897179658, "grad_norm": 0.5445218211710267, "learning_rate": 9.845909899551056e-06, "loss": 0.0633, "step": 1543 }, { "epoch": 0.6857650455252055, "grad_norm": 0.7666433728201425, "learning_rate": 9.845431984821588e-06, "loss": 0.0652, "step": 1544 }, { "epoch": 0.6862091938707529, "grad_norm": 0.6445193247682808, "learning_rate": 9.844953341741383e-06, "loss": 0.044, "step": 1545 }, { "epoch": 0.6866533422163003, "grad_norm": 0.6165834169963341, "learning_rate": 9.844473970382391e-06, "loss": 0.07, "step": 1546 }, { "epoch": 0.6870974905618477, "grad_norm": 0.785019896367438, "learning_rate": 9.843993870816665e-06, "loss": 0.0692, "step": 1547 }, { "epoch": 0.6875416389073951, "grad_norm": 0.8097989024467772, "learning_rate": 9.843513043116378e-06, "loss": 0.0714, "step": 1548 }, { "epoch": 0.6879857872529425, "grad_norm": 0.7549105996829288, "learning_rate": 9.843031487353803e-06, "loss": 0.0642, "step": 1549 }, { "epoch": 0.6884299355984899, "grad_norm": 0.4886439517505401, "learning_rate": 9.842549203601327e-06, "loss": 0.0562, "step": 1550 }, { "epoch": 0.6888740839440373, "grad_norm": 0.7561494998266006, "learning_rate": 9.842066191931442e-06, "loss": 0.0643, "step": 1551 }, { "epoch": 0.6893182322895848, "grad_norm": 0.854720220061704, "learning_rate": 9.84158245241676e-06, "loss": 0.0522, "step": 1552 }, { "epoch": 0.6897623806351322, "grad_norm": 0.5346332465205076, "learning_rate": 9.84109798512999e-06, "loss": 0.0518, "step": 1553 }, { "epoch": 0.6902065289806796, "grad_norm": 0.508165162429757, "learning_rate": 9.840612790143958e-06, "loss": 0.0538, "step": 1554 }, { "epoch": 0.690650677326227, "grad_norm": 0.4808558159762227, "learning_rate": 9.840126867531594e-06, "loss": 0.0604, "step": 1555 }, { "epoch": 0.6910948256717744, "grad_norm": 0.6036124527734282, "learning_rate": 9.839640217365941e-06, "loss": 0.0636, "step": 1556 }, { "epoch": 0.6915389740173218, "grad_norm": 0.5365369852245644, "learning_rate": 9.839152839720157e-06, "loss": 0.0571, "step": 1557 }, { "epoch": 0.6919831223628692, "grad_norm": 0.9675442829718458, "learning_rate": 9.838664734667496e-06, "loss": 0.0768, "step": 1558 }, { "epoch": 0.6924272707084166, "grad_norm": 0.6682601437620973, "learning_rate": 9.83817590228133e-06, "loss": 0.0682, "step": 1559 }, { "epoch": 0.692871419053964, "grad_norm": 0.915552570084289, "learning_rate": 9.83768634263514e-06, "loss": 0.0689, "step": 1560 }, { "epoch": 0.6933155673995114, "grad_norm": 0.7294790821869619, "learning_rate": 9.837196055802514e-06, "loss": 0.0595, "step": 1561 }, { "epoch": 0.6937597157450589, "grad_norm": 0.6877843622659181, "learning_rate": 9.836705041857153e-06, "loss": 0.0894, "step": 1562 }, { "epoch": 0.6942038640906063, "grad_norm": 0.5609065701691318, "learning_rate": 9.836213300872862e-06, "loss": 0.0523, "step": 1563 }, { "epoch": 0.6946480124361537, "grad_norm": 0.6750341373878816, "learning_rate": 9.83572083292356e-06, "loss": 0.0723, "step": 1564 }, { "epoch": 0.6950921607817011, "grad_norm": 0.8179982672102986, "learning_rate": 9.835227638083271e-06, "loss": 0.0741, "step": 1565 }, { "epoch": 0.6955363091272485, "grad_norm": 0.514148745778486, "learning_rate": 9.834733716426133e-06, "loss": 0.048, "step": 1566 }, { "epoch": 0.6959804574727959, "grad_norm": 0.5288948126890644, "learning_rate": 9.834239068026388e-06, "loss": 0.0665, "step": 1567 }, { "epoch": 0.6964246058183433, "grad_norm": 0.5107938326869891, "learning_rate": 9.833743692958392e-06, "loss": 0.0482, "step": 1568 }, { "epoch": 0.6968687541638907, "grad_norm": 1.053299477751095, "learning_rate": 9.83324759129661e-06, "loss": 0.0726, "step": 1569 }, { "epoch": 0.6973129025094381, "grad_norm": 0.6266804348595715, "learning_rate": 9.832750763115611e-06, "loss": 0.0738, "step": 1570 }, { "epoch": 0.6977570508549855, "grad_norm": 0.7607794431765756, "learning_rate": 9.83225320849008e-06, "loss": 0.0739, "step": 1571 }, { "epoch": 0.698201199200533, "grad_norm": 0.7778393457616919, "learning_rate": 9.831754927494803e-06, "loss": 0.0948, "step": 1572 }, { "epoch": 0.6986453475460804, "grad_norm": 0.7195786654461274, "learning_rate": 9.831255920204685e-06, "loss": 0.0646, "step": 1573 }, { "epoch": 0.6990894958916278, "grad_norm": 0.6366879939595091, "learning_rate": 9.830756186694734e-06, "loss": 0.0808, "step": 1574 }, { "epoch": 0.6995336442371752, "grad_norm": 0.478436966515515, "learning_rate": 9.830255727040066e-06, "loss": 0.059, "step": 1575 }, { "epoch": 0.6999777925827226, "grad_norm": 0.5378537626686002, "learning_rate": 9.829754541315912e-06, "loss": 0.0624, "step": 1576 }, { "epoch": 0.70042194092827, "grad_norm": 0.5458661735869976, "learning_rate": 9.829252629597607e-06, "loss": 0.056, "step": 1577 }, { "epoch": 0.7008660892738174, "grad_norm": 0.7598746622748797, "learning_rate": 9.828749991960598e-06, "loss": 0.0742, "step": 1578 }, { "epoch": 0.7013102376193648, "grad_norm": 0.7874301906671837, "learning_rate": 9.828246628480438e-06, "loss": 0.1126, "step": 1579 }, { "epoch": 0.7017543859649122, "grad_norm": 0.5696760012747649, "learning_rate": 9.827742539232791e-06, "loss": 0.0622, "step": 1580 }, { "epoch": 0.7021985343104596, "grad_norm": 0.49299605250258527, "learning_rate": 9.827237724293434e-06, "loss": 0.0542, "step": 1581 }, { "epoch": 0.7026426826560072, "grad_norm": 0.7950299040696133, "learning_rate": 9.826732183738246e-06, "loss": 0.0736, "step": 1582 }, { "epoch": 0.7030868310015546, "grad_norm": 0.6701858553269311, "learning_rate": 9.826225917643217e-06, "loss": 0.0769, "step": 1583 }, { "epoch": 0.703530979347102, "grad_norm": 0.7198204203508045, "learning_rate": 9.825718926084451e-06, "loss": 0.0607, "step": 1584 }, { "epoch": 0.7039751276926494, "grad_norm": 0.7221114160407107, "learning_rate": 9.825211209138154e-06, "loss": 0.0662, "step": 1585 }, { "epoch": 0.7044192760381968, "grad_norm": 0.5718930945068039, "learning_rate": 9.82470276688065e-06, "loss": 0.063, "step": 1586 }, { "epoch": 0.7048634243837442, "grad_norm": 0.7156464008159005, "learning_rate": 9.824193599388358e-06, "loss": 0.061, "step": 1587 }, { "epoch": 0.7053075727292916, "grad_norm": 0.4730100411783297, "learning_rate": 9.823683706737824e-06, "loss": 0.0538, "step": 1588 }, { "epoch": 0.705751721074839, "grad_norm": 0.8151288747975431, "learning_rate": 9.823173089005686e-06, "loss": 0.0586, "step": 1589 }, { "epoch": 0.7061958694203864, "grad_norm": 0.6500001880192491, "learning_rate": 9.822661746268702e-06, "loss": 0.0558, "step": 1590 }, { "epoch": 0.7066400177659338, "grad_norm": 0.47387928526278555, "learning_rate": 9.822149678603733e-06, "loss": 0.0537, "step": 1591 }, { "epoch": 0.7070841661114813, "grad_norm": 0.5533123786666888, "learning_rate": 9.821636886087755e-06, "loss": 0.049, "step": 1592 }, { "epoch": 0.7075283144570287, "grad_norm": 0.5418212216604064, "learning_rate": 9.82112336879785e-06, "loss": 0.0443, "step": 1593 }, { "epoch": 0.7079724628025761, "grad_norm": 0.595384134286072, "learning_rate": 9.820609126811202e-06, "loss": 0.0701, "step": 1594 }, { "epoch": 0.7084166111481235, "grad_norm": 0.5686391716299167, "learning_rate": 9.820094160205118e-06, "loss": 0.0612, "step": 1595 }, { "epoch": 0.7088607594936709, "grad_norm": 0.6501450869897375, "learning_rate": 9.819578469057e-06, "loss": 0.0571, "step": 1596 }, { "epoch": 0.7093049078392183, "grad_norm": 0.8251798873678969, "learning_rate": 9.819062053444369e-06, "loss": 0.0903, "step": 1597 }, { "epoch": 0.7097490561847657, "grad_norm": 0.659869482663948, "learning_rate": 9.81854491344485e-06, "loss": 0.077, "step": 1598 }, { "epoch": 0.7101932045303131, "grad_norm": 0.5233735880329885, "learning_rate": 9.818027049136177e-06, "loss": 0.0756, "step": 1599 }, { "epoch": 0.7106373528758605, "grad_norm": 0.5435283663423869, "learning_rate": 9.817508460596195e-06, "loss": 0.0517, "step": 1600 }, { "epoch": 0.7110815012214079, "grad_norm": 0.8462175785185146, "learning_rate": 9.816989147902855e-06, "loss": 0.0776, "step": 1601 }, { "epoch": 0.7115256495669554, "grad_norm": 0.5250146868792045, "learning_rate": 9.816469111134221e-06, "loss": 0.0636, "step": 1602 }, { "epoch": 0.7119697979125028, "grad_norm": 0.4561566850818785, "learning_rate": 9.81594835036846e-06, "loss": 0.0549, "step": 1603 }, { "epoch": 0.7124139462580502, "grad_norm": 0.639031123731633, "learning_rate": 9.815426865683858e-06, "loss": 0.0739, "step": 1604 }, { "epoch": 0.7128580946035976, "grad_norm": 0.4546814856440393, "learning_rate": 9.814904657158793e-06, "loss": 0.0536, "step": 1605 }, { "epoch": 0.713302242949145, "grad_norm": 0.49725993065397267, "learning_rate": 9.81438172487177e-06, "loss": 0.0538, "step": 1606 }, { "epoch": 0.7137463912946924, "grad_norm": 0.7681224223760409, "learning_rate": 9.813858068901391e-06, "loss": 0.0738, "step": 1607 }, { "epoch": 0.7141905396402398, "grad_norm": 0.5475444111433433, "learning_rate": 9.813333689326371e-06, "loss": 0.0532, "step": 1608 }, { "epoch": 0.7146346879857872, "grad_norm": 0.5001622305576355, "learning_rate": 9.812808586225533e-06, "loss": 0.0504, "step": 1609 }, { "epoch": 0.7150788363313346, "grad_norm": 0.5966203326606518, "learning_rate": 9.812282759677811e-06, "loss": 0.0632, "step": 1610 }, { "epoch": 0.715522984676882, "grad_norm": 0.5024269719688089, "learning_rate": 9.811756209762242e-06, "loss": 0.054, "step": 1611 }, { "epoch": 0.7159671330224295, "grad_norm": 0.5953895951996429, "learning_rate": 9.811228936557977e-06, "loss": 0.0687, "step": 1612 }, { "epoch": 0.7164112813679769, "grad_norm": 0.9253768611459666, "learning_rate": 9.810700940144275e-06, "loss": 0.0936, "step": 1613 }, { "epoch": 0.7168554297135243, "grad_norm": 0.40706001374840456, "learning_rate": 9.810172220600503e-06, "loss": 0.0501, "step": 1614 }, { "epoch": 0.7172995780590717, "grad_norm": 0.6702001680907567, "learning_rate": 9.809642778006135e-06, "loss": 0.0721, "step": 1615 }, { "epoch": 0.7177437264046191, "grad_norm": 0.5621892036366335, "learning_rate": 9.809112612440757e-06, "loss": 0.0624, "step": 1616 }, { "epoch": 0.7181878747501665, "grad_norm": 0.6550556626236272, "learning_rate": 9.808581723984059e-06, "loss": 0.064, "step": 1617 }, { "epoch": 0.718632023095714, "grad_norm": 1.171906822226206, "learning_rate": 9.808050112715845e-06, "loss": 0.1172, "step": 1618 }, { "epoch": 0.7190761714412613, "grad_norm": 0.4455710256139167, "learning_rate": 9.807517778716025e-06, "loss": 0.045, "step": 1619 }, { "epoch": 0.7195203197868087, "grad_norm": 0.6254516596662227, "learning_rate": 9.806984722064616e-06, "loss": 0.0705, "step": 1620 }, { "epoch": 0.7199644681323563, "grad_norm": 0.5571332804687471, "learning_rate": 9.806450942841747e-06, "loss": 0.0502, "step": 1621 }, { "epoch": 0.7204086164779037, "grad_norm": 1.0191625308451748, "learning_rate": 9.805916441127657e-06, "loss": 0.0589, "step": 1622 }, { "epoch": 0.7208527648234511, "grad_norm": 0.7859502245676739, "learning_rate": 9.805381217002684e-06, "loss": 0.0431, "step": 1623 }, { "epoch": 0.7212969131689985, "grad_norm": 0.8065166441211357, "learning_rate": 9.804845270547288e-06, "loss": 0.0728, "step": 1624 }, { "epoch": 0.7217410615145459, "grad_norm": 0.7073921214139905, "learning_rate": 9.804308601842026e-06, "loss": 0.0753, "step": 1625 }, { "epoch": 0.7221852098600933, "grad_norm": 0.7787625206064858, "learning_rate": 9.80377121096757e-06, "loss": 0.0854, "step": 1626 }, { "epoch": 0.7226293582056407, "grad_norm": 1.148628838473968, "learning_rate": 9.8032330980047e-06, "loss": 0.092, "step": 1627 }, { "epoch": 0.7230735065511881, "grad_norm": 0.5694973855182914, "learning_rate": 9.802694263034302e-06, "loss": 0.0661, "step": 1628 }, { "epoch": 0.7235176548967355, "grad_norm": 0.6928354319078996, "learning_rate": 9.802154706137372e-06, "loss": 0.0588, "step": 1629 }, { "epoch": 0.7239618032422829, "grad_norm": 0.5747608874251084, "learning_rate": 9.801614427395018e-06, "loss": 0.073, "step": 1630 }, { "epoch": 0.7244059515878304, "grad_norm": 0.8245000270563847, "learning_rate": 9.801073426888447e-06, "loss": 0.0602, "step": 1631 }, { "epoch": 0.7248500999333778, "grad_norm": 0.6063953480963461, "learning_rate": 9.800531704698986e-06, "loss": 0.0658, "step": 1632 }, { "epoch": 0.7252942482789252, "grad_norm": 0.5240737295642546, "learning_rate": 9.799989260908063e-06, "loss": 0.0564, "step": 1633 }, { "epoch": 0.7257383966244726, "grad_norm": 0.7016224236011144, "learning_rate": 9.799446095597216e-06, "loss": 0.0646, "step": 1634 }, { "epoch": 0.72618254497002, "grad_norm": 0.6626808866197511, "learning_rate": 9.798902208848093e-06, "loss": 0.0818, "step": 1635 }, { "epoch": 0.7266266933155674, "grad_norm": 0.6141836548174839, "learning_rate": 9.79835760074245e-06, "loss": 0.0508, "step": 1636 }, { "epoch": 0.7270708416611148, "grad_norm": 0.6868383771343689, "learning_rate": 9.797812271362149e-06, "loss": 0.0552, "step": 1637 }, { "epoch": 0.7275149900066622, "grad_norm": 0.4532209654763099, "learning_rate": 9.79726622078916e-06, "loss": 0.0451, "step": 1638 }, { "epoch": 0.7279591383522096, "grad_norm": 0.6808218136734143, "learning_rate": 9.79671944910557e-06, "loss": 0.0571, "step": 1639 }, { "epoch": 0.728403286697757, "grad_norm": 0.7510893535708428, "learning_rate": 9.796171956393566e-06, "loss": 0.0669, "step": 1640 }, { "epoch": 0.7288474350433045, "grad_norm": 0.41663397739980335, "learning_rate": 9.79562374273544e-06, "loss": 0.0441, "step": 1641 }, { "epoch": 0.7292915833888519, "grad_norm": 0.5592036648884667, "learning_rate": 9.795074808213604e-06, "loss": 0.0562, "step": 1642 }, { "epoch": 0.7297357317343993, "grad_norm": 0.495293158677455, "learning_rate": 9.794525152910573e-06, "loss": 0.0621, "step": 1643 }, { "epoch": 0.7301798800799467, "grad_norm": 0.5602886889611556, "learning_rate": 9.793974776908963e-06, "loss": 0.0531, "step": 1644 }, { "epoch": 0.7306240284254941, "grad_norm": 0.5944252332564274, "learning_rate": 9.79342368029151e-06, "loss": 0.0563, "step": 1645 }, { "epoch": 0.7310681767710415, "grad_norm": 0.482240455293522, "learning_rate": 9.792871863141052e-06, "loss": 0.0576, "step": 1646 }, { "epoch": 0.7315123251165889, "grad_norm": 0.5733741929298166, "learning_rate": 9.792319325540537e-06, "loss": 0.0684, "step": 1647 }, { "epoch": 0.7319564734621363, "grad_norm": 0.4985634447799434, "learning_rate": 9.79176606757302e-06, "loss": 0.0597, "step": 1648 }, { "epoch": 0.7324006218076837, "grad_norm": 0.685246073677029, "learning_rate": 9.791212089321662e-06, "loss": 0.0721, "step": 1649 }, { "epoch": 0.7328447701532311, "grad_norm": 0.7566932790234222, "learning_rate": 9.790657390869742e-06, "loss": 0.0665, "step": 1650 }, { "epoch": 0.7332889184987786, "grad_norm": 0.47289806502397586, "learning_rate": 9.790101972300635e-06, "loss": 0.0536, "step": 1651 }, { "epoch": 0.733733066844326, "grad_norm": 0.9971658246699995, "learning_rate": 9.789545833697833e-06, "loss": 0.0959, "step": 1652 }, { "epoch": 0.7341772151898734, "grad_norm": 0.588355861783506, "learning_rate": 9.788988975144933e-06, "loss": 0.0556, "step": 1653 }, { "epoch": 0.7346213635354208, "grad_norm": 0.5671908470000354, "learning_rate": 9.788431396725637e-06, "loss": 0.0669, "step": 1654 }, { "epoch": 0.7350655118809682, "grad_norm": 0.758993148904425, "learning_rate": 9.787873098523763e-06, "loss": 0.0652, "step": 1655 }, { "epoch": 0.7355096602265156, "grad_norm": 0.8048604855196553, "learning_rate": 9.787314080623229e-06, "loss": 0.0698, "step": 1656 }, { "epoch": 0.735953808572063, "grad_norm": 0.8623667194773064, "learning_rate": 9.786754343108066e-06, "loss": 0.0779, "step": 1657 }, { "epoch": 0.7363979569176105, "grad_norm": 0.61076619104796, "learning_rate": 9.786193886062415e-06, "loss": 0.0769, "step": 1658 }, { "epoch": 0.7368421052631579, "grad_norm": 0.36227469684290603, "learning_rate": 9.785632709570519e-06, "loss": 0.0414, "step": 1659 }, { "epoch": 0.7372862536087053, "grad_norm": 4.048112931451507, "learning_rate": 9.785070813716733e-06, "loss": 0.0454, "step": 1660 }, { "epoch": 0.7377304019542528, "grad_norm": 0.6035767336706519, "learning_rate": 9.784508198585519e-06, "loss": 0.0633, "step": 1661 }, { "epoch": 0.7381745502998002, "grad_norm": 0.469714457030227, "learning_rate": 9.783944864261448e-06, "loss": 0.0472, "step": 1662 }, { "epoch": 0.7386186986453476, "grad_norm": 0.8129946899965498, "learning_rate": 9.783380810829198e-06, "loss": 0.0613, "step": 1663 }, { "epoch": 0.739062846990895, "grad_norm": 0.6753504015110883, "learning_rate": 9.782816038373556e-06, "loss": 0.0902, "step": 1664 }, { "epoch": 0.7395069953364424, "grad_norm": 0.6384325521859789, "learning_rate": 9.782250546979421e-06, "loss": 0.074, "step": 1665 }, { "epoch": 0.7399511436819898, "grad_norm": 0.6956365636896437, "learning_rate": 9.781684336731791e-06, "loss": 0.0567, "step": 1666 }, { "epoch": 0.7403952920275372, "grad_norm": 0.8256720537662366, "learning_rate": 9.781117407715779e-06, "loss": 0.0651, "step": 1667 }, { "epoch": 0.7408394403730846, "grad_norm": 0.5355855493602046, "learning_rate": 9.780549760016602e-06, "loss": 0.0619, "step": 1668 }, { "epoch": 0.741283588718632, "grad_norm": 0.5253416844707828, "learning_rate": 9.77998139371959e-06, "loss": 0.0592, "step": 1669 }, { "epoch": 0.7417277370641794, "grad_norm": 0.6458381116052228, "learning_rate": 9.779412308910176e-06, "loss": 0.0631, "step": 1670 }, { "epoch": 0.7421718854097269, "grad_norm": 0.6458810882415751, "learning_rate": 9.778842505673906e-06, "loss": 0.0721, "step": 1671 }, { "epoch": 0.7426160337552743, "grad_norm": 0.578667702719237, "learning_rate": 9.778271984096427e-06, "loss": 0.0622, "step": 1672 }, { "epoch": 0.7430601821008217, "grad_norm": 0.7294244004095107, "learning_rate": 9.777700744263502e-06, "loss": 0.0763, "step": 1673 }, { "epoch": 0.7435043304463691, "grad_norm": 0.4949092508929545, "learning_rate": 9.777128786260995e-06, "loss": 0.0491, "step": 1674 }, { "epoch": 0.7439484787919165, "grad_norm": 0.49019523675297216, "learning_rate": 9.776556110174882e-06, "loss": 0.0487, "step": 1675 }, { "epoch": 0.7443926271374639, "grad_norm": 0.6147792916360124, "learning_rate": 9.775982716091245e-06, "loss": 0.0468, "step": 1676 }, { "epoch": 0.7448367754830113, "grad_norm": 0.7728423947263149, "learning_rate": 9.775408604096276e-06, "loss": 0.0843, "step": 1677 }, { "epoch": 0.7452809238285587, "grad_norm": 0.46460994551018925, "learning_rate": 9.774833774276278e-06, "loss": 0.0475, "step": 1678 }, { "epoch": 0.7457250721741061, "grad_norm": 0.6020989880539805, "learning_rate": 9.77425822671765e-06, "loss": 0.0548, "step": 1679 }, { "epoch": 0.7461692205196535, "grad_norm": 0.4185492378453488, "learning_rate": 9.77368196150691e-06, "loss": 0.0697, "step": 1680 }, { "epoch": 0.746613368865201, "grad_norm": 0.47026038790260477, "learning_rate": 9.77310497873068e-06, "loss": 0.0577, "step": 1681 }, { "epoch": 0.7470575172107484, "grad_norm": 0.7242331312035356, "learning_rate": 9.772527278475694e-06, "loss": 0.0646, "step": 1682 }, { "epoch": 0.7475016655562958, "grad_norm": 0.5498639479583922, "learning_rate": 9.771948860828783e-06, "loss": 0.0768, "step": 1683 }, { "epoch": 0.7479458139018432, "grad_norm": 0.5132952000374971, "learning_rate": 9.7713697258769e-06, "loss": 0.0749, "step": 1684 }, { "epoch": 0.7483899622473906, "grad_norm": 0.567848025082148, "learning_rate": 9.770789873707095e-06, "loss": 0.0853, "step": 1685 }, { "epoch": 0.748834110592938, "grad_norm": 0.5686689909367685, "learning_rate": 9.770209304406531e-06, "loss": 0.0628, "step": 1686 }, { "epoch": 0.7492782589384854, "grad_norm": 0.6034469126916269, "learning_rate": 9.769628018062477e-06, "loss": 0.0479, "step": 1687 }, { "epoch": 0.7497224072840328, "grad_norm": 0.43723219237091576, "learning_rate": 9.769046014762307e-06, "loss": 0.0654, "step": 1688 }, { "epoch": 0.7501665556295802, "grad_norm": 0.6855260472494596, "learning_rate": 9.76846329459351e-06, "loss": 0.0651, "step": 1689 }, { "epoch": 0.7506107039751277, "grad_norm": 0.722060394293234, "learning_rate": 9.767879857643681e-06, "loss": 0.0552, "step": 1690 }, { "epoch": 0.7510548523206751, "grad_norm": 0.463106704754472, "learning_rate": 9.767295704000514e-06, "loss": 0.0534, "step": 1691 }, { "epoch": 0.7514990006662225, "grad_norm": 0.6415358905679467, "learning_rate": 9.766710833751823e-06, "loss": 0.0806, "step": 1692 }, { "epoch": 0.75194314901177, "grad_norm": 0.5352784756952503, "learning_rate": 9.76612524698552e-06, "loss": 0.0558, "step": 1693 }, { "epoch": 0.7523872973573174, "grad_norm": 1.1814652854922993, "learning_rate": 9.76553894378963e-06, "loss": 0.0847, "step": 1694 }, { "epoch": 0.7528314457028648, "grad_norm": 0.6986298733118028, "learning_rate": 9.764951924252284e-06, "loss": 0.05, "step": 1695 }, { "epoch": 0.7532755940484122, "grad_norm": 0.5447554709895742, "learning_rate": 9.764364188461723e-06, "loss": 0.0485, "step": 1696 }, { "epoch": 0.7537197423939596, "grad_norm": 0.47846902684606657, "learning_rate": 9.76377573650629e-06, "loss": 0.0551, "step": 1697 }, { "epoch": 0.754163890739507, "grad_norm": 0.8115375903439791, "learning_rate": 9.763186568474443e-06, "loss": 0.0701, "step": 1698 }, { "epoch": 0.7546080390850544, "grad_norm": 0.5954397107201211, "learning_rate": 9.762596684454742e-06, "loss": 0.0474, "step": 1699 }, { "epoch": 0.7550521874306019, "grad_norm": 0.9043348007985654, "learning_rate": 9.762006084535857e-06, "loss": 0.0752, "step": 1700 }, { "epoch": 0.7554963357761493, "grad_norm": 0.5876955603814114, "learning_rate": 9.761414768806566e-06, "loss": 0.058, "step": 1701 }, { "epoch": 0.7559404841216967, "grad_norm": 0.6215494346347864, "learning_rate": 9.76082273735575e-06, "loss": 0.074, "step": 1702 }, { "epoch": 0.7563846324672441, "grad_norm": 0.6691646984262034, "learning_rate": 9.760229990272407e-06, "loss": 0.0752, "step": 1703 }, { "epoch": 0.7568287808127915, "grad_norm": 0.6530103461059469, "learning_rate": 9.759636527645633e-06, "loss": 0.0512, "step": 1704 }, { "epoch": 0.7572729291583389, "grad_norm": 0.5915232768461202, "learning_rate": 9.759042349564638e-06, "loss": 0.0505, "step": 1705 }, { "epoch": 0.7577170775038863, "grad_norm": 0.6211496225849954, "learning_rate": 9.758447456118734e-06, "loss": 0.0527, "step": 1706 }, { "epoch": 0.7581612258494337, "grad_norm": 0.9060279971096812, "learning_rate": 9.757851847397349e-06, "loss": 0.0687, "step": 1707 }, { "epoch": 0.7586053741949811, "grad_norm": 1.0250865769852973, "learning_rate": 9.757255523490006e-06, "loss": 0.0935, "step": 1708 }, { "epoch": 0.7590495225405285, "grad_norm": 0.546436812313742, "learning_rate": 9.756658484486348e-06, "loss": 0.0667, "step": 1709 }, { "epoch": 0.759493670886076, "grad_norm": 0.5515990500251661, "learning_rate": 9.756060730476117e-06, "loss": 0.0529, "step": 1710 }, { "epoch": 0.7599378192316234, "grad_norm": 0.618132276932801, "learning_rate": 9.755462261549167e-06, "loss": 0.0506, "step": 1711 }, { "epoch": 0.7603819675771708, "grad_norm": 0.6625277428767956, "learning_rate": 9.754863077795459e-06, "loss": 0.0718, "step": 1712 }, { "epoch": 0.7608261159227182, "grad_norm": 0.45483688763929425, "learning_rate": 9.754263179305058e-06, "loss": 0.0446, "step": 1713 }, { "epoch": 0.7612702642682656, "grad_norm": 0.6438763676440094, "learning_rate": 9.753662566168142e-06, "loss": 0.0856, "step": 1714 }, { "epoch": 0.761714412613813, "grad_norm": 0.6595899132325235, "learning_rate": 9.75306123847499e-06, "loss": 0.0725, "step": 1715 }, { "epoch": 0.7621585609593604, "grad_norm": 0.6533315196440966, "learning_rate": 9.752459196315996e-06, "loss": 0.0607, "step": 1716 }, { "epoch": 0.7626027093049078, "grad_norm": 0.5435323069050279, "learning_rate": 9.751856439781653e-06, "loss": 0.0511, "step": 1717 }, { "epoch": 0.7630468576504552, "grad_norm": 0.5454956836386448, "learning_rate": 9.751252968962567e-06, "loss": 0.0828, "step": 1718 }, { "epoch": 0.7634910059960026, "grad_norm": 0.5715320834454507, "learning_rate": 9.75064878394945e-06, "loss": 0.0518, "step": 1719 }, { "epoch": 0.7639351543415501, "grad_norm": 0.6744811382532548, "learning_rate": 9.750043884833121e-06, "loss": 0.0508, "step": 1720 }, { "epoch": 0.7643793026870975, "grad_norm": 0.6108688890165651, "learning_rate": 9.749438271704508e-06, "loss": 0.0615, "step": 1721 }, { "epoch": 0.7648234510326449, "grad_norm": 0.5674222482544424, "learning_rate": 9.748831944654643e-06, "loss": 0.0644, "step": 1722 }, { "epoch": 0.7652675993781923, "grad_norm": 0.7356158997613679, "learning_rate": 9.74822490377467e-06, "loss": 0.0733, "step": 1723 }, { "epoch": 0.7657117477237397, "grad_norm": 0.4849462211495647, "learning_rate": 9.747617149155834e-06, "loss": 0.0443, "step": 1724 }, { "epoch": 0.7661558960692871, "grad_norm": 0.831191855315457, "learning_rate": 9.747008680889493e-06, "loss": 0.0659, "step": 1725 }, { "epoch": 0.7666000444148345, "grad_norm": 0.6526473429491929, "learning_rate": 9.746399499067109e-06, "loss": 0.064, "step": 1726 }, { "epoch": 0.7670441927603819, "grad_norm": 0.5803840610292479, "learning_rate": 9.745789603780254e-06, "loss": 0.053, "step": 1727 }, { "epoch": 0.7674883411059293, "grad_norm": 0.6610162442549882, "learning_rate": 9.745178995120604e-06, "loss": 0.0626, "step": 1728 }, { "epoch": 0.7679324894514767, "grad_norm": 0.4876204778498726, "learning_rate": 9.744567673179946e-06, "loss": 0.0522, "step": 1729 }, { "epoch": 0.7683766377970243, "grad_norm": 0.5622038449381063, "learning_rate": 9.743955638050169e-06, "loss": 0.0524, "step": 1730 }, { "epoch": 0.7688207861425717, "grad_norm": 0.577658138102478, "learning_rate": 9.743342889823273e-06, "loss": 0.0559, "step": 1731 }, { "epoch": 0.7692649344881191, "grad_norm": 0.6597870381577758, "learning_rate": 9.742729428591368e-06, "loss": 0.0603, "step": 1732 }, { "epoch": 0.7697090828336665, "grad_norm": 0.8344370622440579, "learning_rate": 9.742115254446665e-06, "loss": 0.0836, "step": 1733 }, { "epoch": 0.7701532311792139, "grad_norm": 0.5622150768145531, "learning_rate": 9.741500367481481e-06, "loss": 0.0826, "step": 1734 }, { "epoch": 0.7705973795247613, "grad_norm": 0.6330801870134145, "learning_rate": 9.740884767788253e-06, "loss": 0.0569, "step": 1735 }, { "epoch": 0.7710415278703087, "grad_norm": 0.5484349976426807, "learning_rate": 9.740268455459507e-06, "loss": 0.0759, "step": 1736 }, { "epoch": 0.7714856762158561, "grad_norm": 0.6003363970239456, "learning_rate": 9.739651430587891e-06, "loss": 0.0559, "step": 1737 }, { "epoch": 0.7719298245614035, "grad_norm": 0.6514870858876586, "learning_rate": 9.739033693266152e-06, "loss": 0.0583, "step": 1738 }, { "epoch": 0.7723739729069509, "grad_norm": 0.7128176043400777, "learning_rate": 9.738415243587146e-06, "loss": 0.0615, "step": 1739 }, { "epoch": 0.7728181212524984, "grad_norm": 0.4997932884849887, "learning_rate": 9.737796081643838e-06, "loss": 0.0572, "step": 1740 }, { "epoch": 0.7732622695980458, "grad_norm": 0.6809941857723402, "learning_rate": 9.737176207529296e-06, "loss": 0.0579, "step": 1741 }, { "epoch": 0.7737064179435932, "grad_norm": 0.579421035714199, "learning_rate": 9.736555621336701e-06, "loss": 0.0553, "step": 1742 }, { "epoch": 0.7741505662891406, "grad_norm": 0.49198429179195113, "learning_rate": 9.735934323159337e-06, "loss": 0.046, "step": 1743 }, { "epoch": 0.774594714634688, "grad_norm": 0.6373312482523877, "learning_rate": 9.735312313090593e-06, "loss": 0.0645, "step": 1744 }, { "epoch": 0.7750388629802354, "grad_norm": 0.49553655680906583, "learning_rate": 9.734689591223971e-06, "loss": 0.0523, "step": 1745 }, { "epoch": 0.7754830113257828, "grad_norm": 0.7575640777298455, "learning_rate": 9.734066157653075e-06, "loss": 0.1155, "step": 1746 }, { "epoch": 0.7759271596713302, "grad_norm": 0.3941642367752884, "learning_rate": 9.733442012471617e-06, "loss": 0.0494, "step": 1747 }, { "epoch": 0.7763713080168776, "grad_norm": 0.6007166723222428, "learning_rate": 9.732817155773417e-06, "loss": 0.0809, "step": 1748 }, { "epoch": 0.7768154563624251, "grad_norm": 0.6048168057227016, "learning_rate": 9.732191587652402e-06, "loss": 0.0827, "step": 1749 }, { "epoch": 0.7772596047079725, "grad_norm": 0.5258636663852201, "learning_rate": 9.731565308202607e-06, "loss": 0.0532, "step": 1750 }, { "epoch": 0.7777037530535199, "grad_norm": 0.5183503036370198, "learning_rate": 9.73093831751817e-06, "loss": 0.0597, "step": 1751 }, { "epoch": 0.7781479013990673, "grad_norm": 0.4539057898000694, "learning_rate": 9.73031061569334e-06, "loss": 0.0449, "step": 1752 }, { "epoch": 0.7785920497446147, "grad_norm": 0.43734440001807745, "learning_rate": 9.72968220282247e-06, "loss": 0.0496, "step": 1753 }, { "epoch": 0.7790361980901621, "grad_norm": 0.4621698388749695, "learning_rate": 9.729053079000021e-06, "loss": 0.0446, "step": 1754 }, { "epoch": 0.7794803464357095, "grad_norm": 0.8795540562598588, "learning_rate": 9.728423244320561e-06, "loss": 0.0756, "step": 1755 }, { "epoch": 0.7799244947812569, "grad_norm": 0.517878545048104, "learning_rate": 9.727792698878767e-06, "loss": 0.0577, "step": 1756 }, { "epoch": 0.7803686431268043, "grad_norm": 0.5755116631603886, "learning_rate": 9.72716144276942e-06, "loss": 0.0507, "step": 1757 }, { "epoch": 0.7808127914723517, "grad_norm": 0.501557195914397, "learning_rate": 9.726529476087406e-06, "loss": 0.0695, "step": 1758 }, { "epoch": 0.7812569398178992, "grad_norm": 0.6162038871090194, "learning_rate": 9.725896798927724e-06, "loss": 0.073, "step": 1759 }, { "epoch": 0.7817010881634466, "grad_norm": 0.44599102883855124, "learning_rate": 9.725263411385471e-06, "loss": 0.046, "step": 1760 }, { "epoch": 0.782145236508994, "grad_norm": 0.5089144227251173, "learning_rate": 9.724629313555862e-06, "loss": 0.0566, "step": 1761 }, { "epoch": 0.7825893848545414, "grad_norm": 0.5159558058324487, "learning_rate": 9.723994505534209e-06, "loss": 0.062, "step": 1762 }, { "epoch": 0.7830335332000888, "grad_norm": 0.7445098154181274, "learning_rate": 9.723358987415933e-06, "loss": 0.0774, "step": 1763 }, { "epoch": 0.7834776815456362, "grad_norm": 0.5480997952522682, "learning_rate": 9.722722759296568e-06, "loss": 0.0446, "step": 1764 }, { "epoch": 0.7839218298911836, "grad_norm": 0.6171588111915909, "learning_rate": 9.722085821271747e-06, "loss": 0.0695, "step": 1765 }, { "epoch": 0.784365978236731, "grad_norm": 0.5440136789733718, "learning_rate": 9.721448173437212e-06, "loss": 0.0509, "step": 1766 }, { "epoch": 0.7848101265822784, "grad_norm": 0.5213635915956231, "learning_rate": 9.720809815888814e-06, "loss": 0.0543, "step": 1767 }, { "epoch": 0.7852542749278258, "grad_norm": 0.5190117014644481, "learning_rate": 9.720170748722507e-06, "loss": 0.0625, "step": 1768 }, { "epoch": 0.7856984232733734, "grad_norm": 0.4927607718599878, "learning_rate": 9.719530972034356e-06, "loss": 0.0609, "step": 1769 }, { "epoch": 0.7861425716189208, "grad_norm": 0.48217603174270474, "learning_rate": 9.718890485920529e-06, "loss": 0.0496, "step": 1770 }, { "epoch": 0.7865867199644682, "grad_norm": 0.5424722566403071, "learning_rate": 9.7182492904773e-06, "loss": 0.0553, "step": 1771 }, { "epoch": 0.7870308683100156, "grad_norm": 0.549948589003628, "learning_rate": 9.717607385801055e-06, "loss": 0.0673, "step": 1772 }, { "epoch": 0.787475016655563, "grad_norm": 0.5315442944470616, "learning_rate": 9.716964771988281e-06, "loss": 0.0696, "step": 1773 }, { "epoch": 0.7879191650011104, "grad_norm": 0.6429809896985884, "learning_rate": 9.716321449135578e-06, "loss": 0.0789, "step": 1774 }, { "epoch": 0.7883633133466578, "grad_norm": 0.7545678487097474, "learning_rate": 9.715677417339641e-06, "loss": 0.0791, "step": 1775 }, { "epoch": 0.7888074616922052, "grad_norm": 0.5539973797183765, "learning_rate": 9.715032676697285e-06, "loss": 0.059, "step": 1776 }, { "epoch": 0.7892516100377526, "grad_norm": 0.5317979991080277, "learning_rate": 9.714387227305422e-06, "loss": 0.0622, "step": 1777 }, { "epoch": 0.7896957583833, "grad_norm": 0.5252592441778533, "learning_rate": 9.713741069261076e-06, "loss": 0.0463, "step": 1778 }, { "epoch": 0.7901399067288475, "grad_norm": 0.4965411657812668, "learning_rate": 9.713094202661374e-06, "loss": 0.0498, "step": 1779 }, { "epoch": 0.7905840550743949, "grad_norm": 0.7856085130172293, "learning_rate": 9.712446627603553e-06, "loss": 0.0732, "step": 1780 }, { "epoch": 0.7910282034199423, "grad_norm": 0.6433472150837076, "learning_rate": 9.711798344184952e-06, "loss": 0.0536, "step": 1781 }, { "epoch": 0.7914723517654897, "grad_norm": 0.5926660288471911, "learning_rate": 9.711149352503022e-06, "loss": 0.0611, "step": 1782 }, { "epoch": 0.7919165001110371, "grad_norm": 0.5407209189318974, "learning_rate": 9.710499652655313e-06, "loss": 0.0464, "step": 1783 }, { "epoch": 0.7923606484565845, "grad_norm": 1.2981488377904062, "learning_rate": 9.709849244739493e-06, "loss": 0.111, "step": 1784 }, { "epoch": 0.7928047968021319, "grad_norm": 0.6752401202019049, "learning_rate": 9.709198128853323e-06, "loss": 0.0662, "step": 1785 }, { "epoch": 0.7932489451476793, "grad_norm": 0.5128430129414299, "learning_rate": 9.708546305094679e-06, "loss": 0.0892, "step": 1786 }, { "epoch": 0.7936930934932267, "grad_norm": 0.5031597058072875, "learning_rate": 9.707893773561541e-06, "loss": 0.0552, "step": 1787 }, { "epoch": 0.7941372418387741, "grad_norm": 0.7529404051762868, "learning_rate": 9.707240534351995e-06, "loss": 0.0829, "step": 1788 }, { "epoch": 0.7945813901843216, "grad_norm": 0.549999308663066, "learning_rate": 9.706586587564236e-06, "loss": 0.0456, "step": 1789 }, { "epoch": 0.795025538529869, "grad_norm": 0.5507988655050372, "learning_rate": 9.705931933296563e-06, "loss": 0.0565, "step": 1790 }, { "epoch": 0.7954696868754164, "grad_norm": 0.5120352810106473, "learning_rate": 9.705276571647377e-06, "loss": 0.0529, "step": 1791 }, { "epoch": 0.7959138352209638, "grad_norm": 0.6321538206444037, "learning_rate": 9.704620502715196e-06, "loss": 0.0604, "step": 1792 }, { "epoch": 0.7963579835665112, "grad_norm": 0.5978551293263742, "learning_rate": 9.703963726598636e-06, "loss": 0.0615, "step": 1793 }, { "epoch": 0.7968021319120586, "grad_norm": 0.6845514683107613, "learning_rate": 9.70330624339642e-06, "loss": 0.0759, "step": 1794 }, { "epoch": 0.797246280257606, "grad_norm": 0.5162860917319786, "learning_rate": 9.702648053207381e-06, "loss": 0.0606, "step": 1795 }, { "epoch": 0.7976904286031534, "grad_norm": 0.5757276725457067, "learning_rate": 9.701989156130459e-06, "loss": 0.0494, "step": 1796 }, { "epoch": 0.7981345769487008, "grad_norm": 0.7240460732805346, "learning_rate": 9.70132955226469e-06, "loss": 0.0681, "step": 1797 }, { "epoch": 0.7985787252942482, "grad_norm": 0.6852456458554386, "learning_rate": 9.700669241709229e-06, "loss": 0.0696, "step": 1798 }, { "epoch": 0.7990228736397957, "grad_norm": 0.7728747951665925, "learning_rate": 9.70000822456333e-06, "loss": 0.075, "step": 1799 }, { "epoch": 0.7994670219853431, "grad_norm": 0.44864271076401085, "learning_rate": 9.699346500926357e-06, "loss": 0.0396, "step": 1800 }, { "epoch": 0.7999111703308905, "grad_norm": 0.45736945189860095, "learning_rate": 9.698684070897774e-06, "loss": 0.0528, "step": 1801 }, { "epoch": 0.8003553186764379, "grad_norm": 0.7755479737213115, "learning_rate": 9.69802093457716e-06, "loss": 0.0604, "step": 1802 }, { "epoch": 0.8007994670219853, "grad_norm": 0.5300657753961209, "learning_rate": 9.697357092064196e-06, "loss": 0.0675, "step": 1803 }, { "epoch": 0.8012436153675327, "grad_norm": 0.5613880914239765, "learning_rate": 9.696692543458666e-06, "loss": 0.0565, "step": 1804 }, { "epoch": 0.8016877637130801, "grad_norm": 1.5622355700759611, "learning_rate": 9.696027288860463e-06, "loss": 0.0748, "step": 1805 }, { "epoch": 0.8021319120586275, "grad_norm": 0.6809476301245401, "learning_rate": 9.695361328369588e-06, "loss": 0.077, "step": 1806 }, { "epoch": 0.802576060404175, "grad_norm": 0.573004020361602, "learning_rate": 9.694694662086143e-06, "loss": 0.0688, "step": 1807 }, { "epoch": 0.8030202087497224, "grad_norm": 0.48241041676332075, "learning_rate": 9.694027290110344e-06, "loss": 0.046, "step": 1808 }, { "epoch": 0.8034643570952699, "grad_norm": 0.726848223856282, "learning_rate": 9.693359212542504e-06, "loss": 0.0554, "step": 1809 }, { "epoch": 0.8039085054408173, "grad_norm": 0.5939220725933183, "learning_rate": 9.692690429483049e-06, "loss": 0.0599, "step": 1810 }, { "epoch": 0.8043526537863647, "grad_norm": 0.8703575105729704, "learning_rate": 9.692020941032508e-06, "loss": 0.0697, "step": 1811 }, { "epoch": 0.8047968021319121, "grad_norm": 0.6745036941332984, "learning_rate": 9.691350747291514e-06, "loss": 0.0622, "step": 1812 }, { "epoch": 0.8052409504774595, "grad_norm": 0.5924055978232626, "learning_rate": 9.690679848360811e-06, "loss": 0.068, "step": 1813 }, { "epoch": 0.8056850988230069, "grad_norm": 0.6624345407427882, "learning_rate": 9.690008244341247e-06, "loss": 0.0671, "step": 1814 }, { "epoch": 0.8061292471685543, "grad_norm": 0.760605230819851, "learning_rate": 9.689335935333775e-06, "loss": 0.0703, "step": 1815 }, { "epoch": 0.8065733955141017, "grad_norm": 0.6543633249820602, "learning_rate": 9.688662921439454e-06, "loss": 0.0537, "step": 1816 }, { "epoch": 0.8070175438596491, "grad_norm": 0.6888300517583433, "learning_rate": 9.687989202759448e-06, "loss": 0.0495, "step": 1817 }, { "epoch": 0.8074616922051966, "grad_norm": 0.686773842932207, "learning_rate": 9.68731477939503e-06, "loss": 0.0659, "step": 1818 }, { "epoch": 0.807905840550744, "grad_norm": 0.5300847493623394, "learning_rate": 9.686639651447578e-06, "loss": 0.0518, "step": 1819 }, { "epoch": 0.8083499888962914, "grad_norm": 0.5159319768487426, "learning_rate": 9.685963819018575e-06, "loss": 0.0515, "step": 1820 }, { "epoch": 0.8087941372418388, "grad_norm": 0.7950101365777946, "learning_rate": 9.685287282209607e-06, "loss": 0.0728, "step": 1821 }, { "epoch": 0.8092382855873862, "grad_norm": 1.1412965984185115, "learning_rate": 9.684610041122375e-06, "loss": 0.0802, "step": 1822 }, { "epoch": 0.8096824339329336, "grad_norm": 0.45513195688472363, "learning_rate": 9.683932095858673e-06, "loss": 0.0615, "step": 1823 }, { "epoch": 0.810126582278481, "grad_norm": 0.7127474010386736, "learning_rate": 9.683253446520412e-06, "loss": 0.0617, "step": 1824 }, { "epoch": 0.8105707306240284, "grad_norm": 0.5146350672182237, "learning_rate": 9.682574093209603e-06, "loss": 0.0821, "step": 1825 }, { "epoch": 0.8110148789695758, "grad_norm": 0.5927766784074162, "learning_rate": 9.681894036028365e-06, "loss": 0.0616, "step": 1826 }, { "epoch": 0.8114590273151232, "grad_norm": 0.6070773128654839, "learning_rate": 9.681213275078922e-06, "loss": 0.0615, "step": 1827 }, { "epoch": 0.8119031756606707, "grad_norm": 0.532740364018843, "learning_rate": 9.680531810463606e-06, "loss": 0.0572, "step": 1828 }, { "epoch": 0.8123473240062181, "grad_norm": 0.5406785460306237, "learning_rate": 9.679849642284846e-06, "loss": 0.0553, "step": 1829 }, { "epoch": 0.8127914723517655, "grad_norm": 0.7113722607782379, "learning_rate": 9.679166770645193e-06, "loss": 0.0588, "step": 1830 }, { "epoch": 0.8132356206973129, "grad_norm": 0.5228564923020365, "learning_rate": 9.678483195647286e-06, "loss": 0.0762, "step": 1831 }, { "epoch": 0.8136797690428603, "grad_norm": 0.3998194117652634, "learning_rate": 9.67779891739388e-06, "loss": 0.045, "step": 1832 }, { "epoch": 0.8141239173884077, "grad_norm": 0.46502028820157165, "learning_rate": 9.677113935987839e-06, "loss": 0.0508, "step": 1833 }, { "epoch": 0.8145680657339551, "grad_norm": 0.6755084418924148, "learning_rate": 9.67642825153212e-06, "loss": 0.0935, "step": 1834 }, { "epoch": 0.8150122140795025, "grad_norm": 0.6820496339271076, "learning_rate": 9.675741864129797e-06, "loss": 0.0648, "step": 1835 }, { "epoch": 0.8154563624250499, "grad_norm": 0.51959592407304, "learning_rate": 9.675054773884045e-06, "loss": 0.0804, "step": 1836 }, { "epoch": 0.8159005107705973, "grad_norm": 0.4703704489029568, "learning_rate": 9.674366980898145e-06, "loss": 0.0639, "step": 1837 }, { "epoch": 0.8163446591161448, "grad_norm": 0.4874780271160304, "learning_rate": 9.673678485275484e-06, "loss": 0.0508, "step": 1838 }, { "epoch": 0.8167888074616922, "grad_norm": 0.44148176077100115, "learning_rate": 9.672989287119555e-06, "loss": 0.0588, "step": 1839 }, { "epoch": 0.8172329558072396, "grad_norm": 0.7005816504280763, "learning_rate": 9.672299386533956e-06, "loss": 0.074, "step": 1840 }, { "epoch": 0.817677104152787, "grad_norm": 0.6435780803327446, "learning_rate": 9.67160878362239e-06, "loss": 0.064, "step": 1841 }, { "epoch": 0.8181212524983345, "grad_norm": 0.6068596716742378, "learning_rate": 9.670917478488669e-06, "loss": 0.0626, "step": 1842 }, { "epoch": 0.8185654008438819, "grad_norm": 0.48495659068419156, "learning_rate": 9.670225471236703e-06, "loss": 0.0566, "step": 1843 }, { "epoch": 0.8190095491894293, "grad_norm": 0.5732730166801319, "learning_rate": 9.669532761970518e-06, "loss": 0.0594, "step": 1844 }, { "epoch": 0.8194536975349767, "grad_norm": 0.6406816449651614, "learning_rate": 9.668839350794236e-06, "loss": 0.0602, "step": 1845 }, { "epoch": 0.8198978458805241, "grad_norm": 0.7660699462244367, "learning_rate": 9.66814523781209e-06, "loss": 0.1004, "step": 1846 }, { "epoch": 0.8203419942260715, "grad_norm": 0.6805404383061566, "learning_rate": 9.667450423128417e-06, "loss": 0.0727, "step": 1847 }, { "epoch": 0.820786142571619, "grad_norm": 1.8549554381012934, "learning_rate": 9.666754906847659e-06, "loss": 0.0612, "step": 1848 }, { "epoch": 0.8212302909171664, "grad_norm": 0.9016082075933314, "learning_rate": 9.666058689074364e-06, "loss": 0.0792, "step": 1849 }, { "epoch": 0.8216744392627138, "grad_norm": 0.5403087387935102, "learning_rate": 9.665361769913187e-06, "loss": 0.045, "step": 1850 }, { "epoch": 0.8221185876082612, "grad_norm": 0.45933630900068073, "learning_rate": 9.664664149468885e-06, "loss": 0.046, "step": 1851 }, { "epoch": 0.8225627359538086, "grad_norm": 0.6057928280686538, "learning_rate": 9.663965827846321e-06, "loss": 0.053, "step": 1852 }, { "epoch": 0.823006884299356, "grad_norm": 0.7862585104580586, "learning_rate": 9.663266805150468e-06, "loss": 0.0706, "step": 1853 }, { "epoch": 0.8234510326449034, "grad_norm": 0.59770705811337, "learning_rate": 9.662567081486398e-06, "loss": 0.0568, "step": 1854 }, { "epoch": 0.8238951809904508, "grad_norm": 0.5457681124884074, "learning_rate": 9.661866656959293e-06, "loss": 0.0534, "step": 1855 }, { "epoch": 0.8243393293359982, "grad_norm": 0.7281386032100584, "learning_rate": 9.661165531674438e-06, "loss": 0.081, "step": 1856 }, { "epoch": 0.8247834776815456, "grad_norm": 0.5330397841966769, "learning_rate": 9.660463705737224e-06, "loss": 0.0657, "step": 1857 }, { "epoch": 0.8252276260270931, "grad_norm": 0.5773185928330131, "learning_rate": 9.65976117925315e-06, "loss": 0.0657, "step": 1858 }, { "epoch": 0.8256717743726405, "grad_norm": 0.6769338412517576, "learning_rate": 9.659057952327812e-06, "loss": 0.0713, "step": 1859 }, { "epoch": 0.8261159227181879, "grad_norm": 0.5999087119449273, "learning_rate": 9.65835402506692e-06, "loss": 0.0776, "step": 1860 }, { "epoch": 0.8265600710637353, "grad_norm": 0.46186334132300766, "learning_rate": 9.657649397576289e-06, "loss": 0.0435, "step": 1861 }, { "epoch": 0.8270042194092827, "grad_norm": 0.6132380100477, "learning_rate": 9.656944069961832e-06, "loss": 0.0503, "step": 1862 }, { "epoch": 0.8274483677548301, "grad_norm": 0.511934760130446, "learning_rate": 9.656238042329575e-06, "loss": 0.047, "step": 1863 }, { "epoch": 0.8278925161003775, "grad_norm": 0.9622008544679613, "learning_rate": 9.655531314785643e-06, "loss": 0.0727, "step": 1864 }, { "epoch": 0.8283366644459249, "grad_norm": 0.41625404724105025, "learning_rate": 9.654823887436272e-06, "loss": 0.0452, "step": 1865 }, { "epoch": 0.8287808127914723, "grad_norm": 0.6534224883169892, "learning_rate": 9.6541157603878e-06, "loss": 0.0812, "step": 1866 }, { "epoch": 0.8292249611370197, "grad_norm": 0.3831024637275559, "learning_rate": 9.653406933746667e-06, "loss": 0.0406, "step": 1867 }, { "epoch": 0.8296691094825672, "grad_norm": 0.5750420337273457, "learning_rate": 9.652697407619425e-06, "loss": 0.0655, "step": 1868 }, { "epoch": 0.8301132578281146, "grad_norm": 0.5340996381251997, "learning_rate": 9.651987182112727e-06, "loss": 0.0684, "step": 1869 }, { "epoch": 0.830557406173662, "grad_norm": 0.5216822693020573, "learning_rate": 9.651276257333334e-06, "loss": 0.0506, "step": 1870 }, { "epoch": 0.8310015545192094, "grad_norm": 0.6549627952655241, "learning_rate": 9.650564633388106e-06, "loss": 0.0746, "step": 1871 }, { "epoch": 0.8314457028647568, "grad_norm": 0.41543419779685914, "learning_rate": 9.649852310384017e-06, "loss": 0.0418, "step": 1872 }, { "epoch": 0.8318898512103042, "grad_norm": 0.8710457342050398, "learning_rate": 9.649139288428136e-06, "loss": 0.0725, "step": 1873 }, { "epoch": 0.8323339995558516, "grad_norm": 0.7280421909189481, "learning_rate": 9.648425567627646e-06, "loss": 0.0834, "step": 1874 }, { "epoch": 0.832778147901399, "grad_norm": 0.48421338065941566, "learning_rate": 9.647711148089829e-06, "loss": 0.051, "step": 1875 }, { "epoch": 0.8332222962469464, "grad_norm": 0.8228707840085564, "learning_rate": 9.646996029922078e-06, "loss": 0.0899, "step": 1876 }, { "epoch": 0.8336664445924938, "grad_norm": 0.44548368481732253, "learning_rate": 9.646280213231882e-06, "loss": 0.0459, "step": 1877 }, { "epoch": 0.8341105929380414, "grad_norm": 0.77874935532614, "learning_rate": 9.645563698126846e-06, "loss": 0.0874, "step": 1878 }, { "epoch": 0.8345547412835888, "grad_norm": 0.6153818855236423, "learning_rate": 9.64484648471467e-06, "loss": 0.0538, "step": 1879 }, { "epoch": 0.8349988896291362, "grad_norm": 0.6341254163576385, "learning_rate": 9.644128573103166e-06, "loss": 0.0794, "step": 1880 }, { "epoch": 0.8354430379746836, "grad_norm": 1.0916751108937903, "learning_rate": 9.643409963400247e-06, "loss": 0.0775, "step": 1881 }, { "epoch": 0.835887186320231, "grad_norm": 0.5090370659080116, "learning_rate": 9.642690655713935e-06, "loss": 0.0515, "step": 1882 }, { "epoch": 0.8363313346657784, "grad_norm": 0.8125756753691618, "learning_rate": 9.641970650152351e-06, "loss": 0.0856, "step": 1883 }, { "epoch": 0.8367754830113258, "grad_norm": 0.7917848803419996, "learning_rate": 9.641249946823722e-06, "loss": 0.0789, "step": 1884 }, { "epoch": 0.8372196313568732, "grad_norm": 0.6341635191576055, "learning_rate": 9.640528545836388e-06, "loss": 0.0689, "step": 1885 }, { "epoch": 0.8376637797024206, "grad_norm": 0.45285744978069686, "learning_rate": 9.639806447298786e-06, "loss": 0.0502, "step": 1886 }, { "epoch": 0.8381079280479681, "grad_norm": 0.5918006498840634, "learning_rate": 9.639083651319455e-06, "loss": 0.077, "step": 1887 }, { "epoch": 0.8385520763935155, "grad_norm": 0.5566719058184162, "learning_rate": 9.638360158007049e-06, "loss": 0.0518, "step": 1888 }, { "epoch": 0.8389962247390629, "grad_norm": 0.5775755465954959, "learning_rate": 9.637635967470317e-06, "loss": 0.0583, "step": 1889 }, { "epoch": 0.8394403730846103, "grad_norm": 0.5105748469342627, "learning_rate": 9.636911079818121e-06, "loss": 0.0547, "step": 1890 }, { "epoch": 0.8398845214301577, "grad_norm": 0.7153133794510262, "learning_rate": 9.636185495159423e-06, "loss": 0.0598, "step": 1891 }, { "epoch": 0.8403286697757051, "grad_norm": 0.580672148124565, "learning_rate": 9.63545921360329e-06, "loss": 0.0601, "step": 1892 }, { "epoch": 0.8407728181212525, "grad_norm": 0.43212755100892075, "learning_rate": 9.634732235258895e-06, "loss": 0.0501, "step": 1893 }, { "epoch": 0.8412169664667999, "grad_norm": 0.6336655954017367, "learning_rate": 9.634004560235513e-06, "loss": 0.0742, "step": 1894 }, { "epoch": 0.8416611148123473, "grad_norm": 0.6609701387334967, "learning_rate": 9.633276188642529e-06, "loss": 0.0579, "step": 1895 }, { "epoch": 0.8421052631578947, "grad_norm": 0.6932382794686757, "learning_rate": 9.632547120589426e-06, "loss": 0.0581, "step": 1896 }, { "epoch": 0.8425494115034422, "grad_norm": 0.5148690171452629, "learning_rate": 9.631817356185799e-06, "loss": 0.0507, "step": 1897 }, { "epoch": 0.8429935598489896, "grad_norm": 0.5764411558916347, "learning_rate": 9.631086895541343e-06, "loss": 0.0485, "step": 1898 }, { "epoch": 0.843437708194537, "grad_norm": 0.713479181836388, "learning_rate": 9.630355738765859e-06, "loss": 0.0811, "step": 1899 }, { "epoch": 0.8438818565400844, "grad_norm": 0.6095181273242816, "learning_rate": 9.62962388596925e-06, "loss": 0.051, "step": 1900 }, { "epoch": 0.8443260048856318, "grad_norm": 0.6746333568364197, "learning_rate": 9.628891337261527e-06, "loss": 0.051, "step": 1901 }, { "epoch": 0.8447701532311792, "grad_norm": 0.5956567675202917, "learning_rate": 9.628158092752807e-06, "loss": 0.0544, "step": 1902 }, { "epoch": 0.8452143015767266, "grad_norm": 0.9239154351205555, "learning_rate": 9.627424152553305e-06, "loss": 0.0801, "step": 1903 }, { "epoch": 0.845658449922274, "grad_norm": 0.8434016325954864, "learning_rate": 9.626689516773348e-06, "loss": 0.0597, "step": 1904 }, { "epoch": 0.8461025982678214, "grad_norm": 0.6102688127848747, "learning_rate": 9.625954185523361e-06, "loss": 0.0751, "step": 1905 }, { "epoch": 0.8465467466133688, "grad_norm": 0.7017970233762413, "learning_rate": 9.62521815891388e-06, "loss": 0.0665, "step": 1906 }, { "epoch": 0.8469908949589163, "grad_norm": 0.6370603868487351, "learning_rate": 9.624481437055542e-06, "loss": 0.0584, "step": 1907 }, { "epoch": 0.8474350433044637, "grad_norm": 0.8073786270061732, "learning_rate": 9.623744020059086e-06, "loss": 0.0739, "step": 1908 }, { "epoch": 0.8478791916500111, "grad_norm": 0.6765473850641361, "learning_rate": 9.623005908035362e-06, "loss": 0.0578, "step": 1909 }, { "epoch": 0.8483233399955585, "grad_norm": 0.7723618661951789, "learning_rate": 9.622267101095318e-06, "loss": 0.0477, "step": 1910 }, { "epoch": 0.8487674883411059, "grad_norm": 0.5605923323935563, "learning_rate": 9.621527599350008e-06, "loss": 0.0603, "step": 1911 }, { "epoch": 0.8492116366866533, "grad_norm": 0.7755772246757074, "learning_rate": 9.620787402910597e-06, "loss": 0.0983, "step": 1912 }, { "epoch": 0.8496557850322007, "grad_norm": 0.7813757821925909, "learning_rate": 9.620046511888343e-06, "loss": 0.0499, "step": 1913 }, { "epoch": 0.8500999333777481, "grad_norm": 0.8027788261683163, "learning_rate": 9.619304926394619e-06, "loss": 0.0588, "step": 1914 }, { "epoch": 0.8505440817232955, "grad_norm": 0.8601044017181758, "learning_rate": 9.618562646540897e-06, "loss": 0.0638, "step": 1915 }, { "epoch": 0.8509882300688429, "grad_norm": 0.5716901663276188, "learning_rate": 9.617819672438754e-06, "loss": 0.056, "step": 1916 }, { "epoch": 0.8514323784143905, "grad_norm": 0.7888611302795855, "learning_rate": 9.617076004199868e-06, "loss": 0.08, "step": 1917 }, { "epoch": 0.8518765267599379, "grad_norm": 0.5937752892253898, "learning_rate": 9.616331641936031e-06, "loss": 0.0545, "step": 1918 }, { "epoch": 0.8523206751054853, "grad_norm": 0.6861875565077891, "learning_rate": 9.61558658575913e-06, "loss": 0.055, "step": 1919 }, { "epoch": 0.8527648234510327, "grad_norm": 0.8458011501127822, "learning_rate": 9.614840835781159e-06, "loss": 0.1023, "step": 1920 }, { "epoch": 0.8532089717965801, "grad_norm": 0.5027378978279742, "learning_rate": 9.614094392114218e-06, "loss": 0.0518, "step": 1921 }, { "epoch": 0.8536531201421275, "grad_norm": 0.5758297033096836, "learning_rate": 9.613347254870511e-06, "loss": 0.0476, "step": 1922 }, { "epoch": 0.8540972684876749, "grad_norm": 0.8773520125115069, "learning_rate": 9.612599424162344e-06, "loss": 0.0858, "step": 1923 }, { "epoch": 0.8545414168332223, "grad_norm": 0.5342112118627843, "learning_rate": 9.61185090010213e-06, "loss": 0.0532, "step": 1924 }, { "epoch": 0.8549855651787697, "grad_norm": 0.617753551302806, "learning_rate": 9.611101682802383e-06, "loss": 0.0664, "step": 1925 }, { "epoch": 0.8554297135243171, "grad_norm": 0.6816206410848374, "learning_rate": 9.610351772375724e-06, "loss": 0.0919, "step": 1926 }, { "epoch": 0.8558738618698646, "grad_norm": 0.636349806575952, "learning_rate": 9.609601168934878e-06, "loss": 0.0544, "step": 1927 }, { "epoch": 0.856318010215412, "grad_norm": 0.5095901638429818, "learning_rate": 9.608849872592674e-06, "loss": 0.0584, "step": 1928 }, { "epoch": 0.8567621585609594, "grad_norm": 0.5240679834795968, "learning_rate": 9.608097883462043e-06, "loss": 0.0553, "step": 1929 }, { "epoch": 0.8572063069065068, "grad_norm": 0.6099517865434585, "learning_rate": 9.60734520165602e-06, "loss": 0.0657, "step": 1930 }, { "epoch": 0.8576504552520542, "grad_norm": 0.555945227169772, "learning_rate": 9.60659182728775e-06, "loss": 0.0737, "step": 1931 }, { "epoch": 0.8580946035976016, "grad_norm": 0.5341529189989662, "learning_rate": 9.605837760470476e-06, "loss": 0.0494, "step": 1932 }, { "epoch": 0.858538751943149, "grad_norm": 0.5439610693418829, "learning_rate": 9.605083001317547e-06, "loss": 0.0566, "step": 1933 }, { "epoch": 0.8589829002886964, "grad_norm": 0.6077814464368465, "learning_rate": 9.604327549942415e-06, "loss": 0.0615, "step": 1934 }, { "epoch": 0.8594270486342438, "grad_norm": 0.7793330895236799, "learning_rate": 9.603571406458641e-06, "loss": 0.0669, "step": 1935 }, { "epoch": 0.8598711969797912, "grad_norm": 0.5859164596399333, "learning_rate": 9.60281457097988e-06, "loss": 0.0554, "step": 1936 }, { "epoch": 0.8603153453253387, "grad_norm": 0.5913578923102047, "learning_rate": 9.602057043619903e-06, "loss": 0.0819, "step": 1937 }, { "epoch": 0.8607594936708861, "grad_norm": 0.3937188804578706, "learning_rate": 9.601298824492577e-06, "loss": 0.0455, "step": 1938 }, { "epoch": 0.8612036420164335, "grad_norm": 0.5687600158251599, "learning_rate": 9.600539913711876e-06, "loss": 0.0703, "step": 1939 }, { "epoch": 0.8616477903619809, "grad_norm": 0.7038982027083199, "learning_rate": 9.599780311391876e-06, "loss": 0.0559, "step": 1940 }, { "epoch": 0.8620919387075283, "grad_norm": 0.5434923734263241, "learning_rate": 9.599020017646758e-06, "loss": 0.059, "step": 1941 }, { "epoch": 0.8625360870530757, "grad_norm": 0.6288012353236909, "learning_rate": 9.59825903259081e-06, "loss": 0.0664, "step": 1942 }, { "epoch": 0.8629802353986231, "grad_norm": 0.7037145137062264, "learning_rate": 9.597497356338415e-06, "loss": 0.069, "step": 1943 }, { "epoch": 0.8634243837441705, "grad_norm": 0.5095647439957363, "learning_rate": 9.59673498900407e-06, "loss": 0.0618, "step": 1944 }, { "epoch": 0.8638685320897179, "grad_norm": 0.6494185255511562, "learning_rate": 9.595971930702372e-06, "loss": 0.0658, "step": 1945 }, { "epoch": 0.8643126804352653, "grad_norm": 0.5454132982093647, "learning_rate": 9.595208181548022e-06, "loss": 0.0591, "step": 1946 }, { "epoch": 0.8647568287808128, "grad_norm": 0.7893076144618603, "learning_rate": 9.594443741655823e-06, "loss": 0.0582, "step": 1947 }, { "epoch": 0.8652009771263602, "grad_norm": 0.7430564668455467, "learning_rate": 9.593678611140683e-06, "loss": 0.0836, "step": 1948 }, { "epoch": 0.8656451254719076, "grad_norm": 0.6526222088648067, "learning_rate": 9.592912790117614e-06, "loss": 0.0612, "step": 1949 }, { "epoch": 0.866089273817455, "grad_norm": 0.5060704004253027, "learning_rate": 9.592146278701734e-06, "loss": 0.0528, "step": 1950 }, { "epoch": 0.8665334221630024, "grad_norm": 0.7447569328611817, "learning_rate": 9.591379077008263e-06, "loss": 0.0657, "step": 1951 }, { "epoch": 0.8669775705085498, "grad_norm": 0.6783080197900007, "learning_rate": 9.590611185152521e-06, "loss": 0.0748, "step": 1952 }, { "epoch": 0.8674217188540972, "grad_norm": 0.7292228470690477, "learning_rate": 9.589842603249935e-06, "loss": 0.0626, "step": 1953 }, { "epoch": 0.8678658671996446, "grad_norm": 0.5752723662054593, "learning_rate": 9.58907333141604e-06, "loss": 0.0562, "step": 1954 }, { "epoch": 0.868310015545192, "grad_norm": 0.5789129306578911, "learning_rate": 9.588303369766469e-06, "loss": 0.0523, "step": 1955 }, { "epoch": 0.8687541638907396, "grad_norm": 0.5972967849185621, "learning_rate": 9.58753271841696e-06, "loss": 0.0638, "step": 1956 }, { "epoch": 0.869198312236287, "grad_norm": 0.9160241760382546, "learning_rate": 9.586761377483355e-06, "loss": 0.083, "step": 1957 }, { "epoch": 0.8696424605818344, "grad_norm": 0.6452629891186096, "learning_rate": 9.585989347081599e-06, "loss": 0.0765, "step": 1958 }, { "epoch": 0.8700866089273818, "grad_norm": 0.5181341158162395, "learning_rate": 9.58521662732774e-06, "loss": 0.063, "step": 1959 }, { "epoch": 0.8705307572729292, "grad_norm": 0.5208504155693663, "learning_rate": 9.584443218337935e-06, "loss": 0.0609, "step": 1960 }, { "epoch": 0.8709749056184766, "grad_norm": 0.5883552043198282, "learning_rate": 9.583669120228439e-06, "loss": 0.0644, "step": 1961 }, { "epoch": 0.871419053964024, "grad_norm": 0.5457449002927042, "learning_rate": 9.582894333115608e-06, "loss": 0.0567, "step": 1962 }, { "epoch": 0.8718632023095714, "grad_norm": 0.6679359172890208, "learning_rate": 9.58211885711591e-06, "loss": 0.0652, "step": 1963 }, { "epoch": 0.8723073506551188, "grad_norm": 0.5085227180546535, "learning_rate": 9.581342692345913e-06, "loss": 0.0734, "step": 1964 }, { "epoch": 0.8727514990006662, "grad_norm": 0.63235227685218, "learning_rate": 9.580565838922285e-06, "loss": 0.0501, "step": 1965 }, { "epoch": 0.8731956473462137, "grad_norm": 0.5297922964410929, "learning_rate": 9.579788296961801e-06, "loss": 0.0525, "step": 1966 }, { "epoch": 0.8736397956917611, "grad_norm": 0.5775438947377388, "learning_rate": 9.57901006658134e-06, "loss": 0.0544, "step": 1967 }, { "epoch": 0.8740839440373085, "grad_norm": 0.708123483636742, "learning_rate": 9.57823114789788e-06, "loss": 0.0617, "step": 1968 }, { "epoch": 0.8745280923828559, "grad_norm": 0.5577152090781191, "learning_rate": 9.577451541028509e-06, "loss": 0.0583, "step": 1969 }, { "epoch": 0.8749722407284033, "grad_norm": 0.6520169618466134, "learning_rate": 9.576671246090415e-06, "loss": 0.0566, "step": 1970 }, { "epoch": 0.8754163890739507, "grad_norm": 0.49841809419135175, "learning_rate": 9.575890263200887e-06, "loss": 0.0497, "step": 1971 }, { "epoch": 0.8758605374194981, "grad_norm": 0.5197540662913069, "learning_rate": 9.575108592477322e-06, "loss": 0.0555, "step": 1972 }, { "epoch": 0.8763046857650455, "grad_norm": 0.5124464300264364, "learning_rate": 9.57432623403722e-06, "loss": 0.0477, "step": 1973 }, { "epoch": 0.8767488341105929, "grad_norm": 0.5779510703301827, "learning_rate": 9.57354318799818e-06, "loss": 0.0685, "step": 1974 }, { "epoch": 0.8771929824561403, "grad_norm": 0.5097058321667625, "learning_rate": 9.572759454477907e-06, "loss": 0.0488, "step": 1975 }, { "epoch": 0.8776371308016878, "grad_norm": 0.8063636142109883, "learning_rate": 9.57197503359421e-06, "loss": 0.0544, "step": 1976 }, { "epoch": 0.8780812791472352, "grad_norm": 0.4428063296481966, "learning_rate": 9.571189925465002e-06, "loss": 0.0533, "step": 1977 }, { "epoch": 0.8785254274927826, "grad_norm": 0.4839122950764424, "learning_rate": 9.570404130208297e-06, "loss": 0.0479, "step": 1978 }, { "epoch": 0.87896957583833, "grad_norm": 0.7120182407032511, "learning_rate": 9.569617647942214e-06, "loss": 0.0611, "step": 1979 }, { "epoch": 0.8794137241838774, "grad_norm": 0.792336377588857, "learning_rate": 9.568830478784975e-06, "loss": 0.0618, "step": 1980 }, { "epoch": 0.8798578725294248, "grad_norm": 0.7252084523924553, "learning_rate": 9.568042622854902e-06, "loss": 0.075, "step": 1981 }, { "epoch": 0.8803020208749722, "grad_norm": 0.5594674149617587, "learning_rate": 9.567254080270427e-06, "loss": 0.0619, "step": 1982 }, { "epoch": 0.8807461692205196, "grad_norm": 0.7969483073047114, "learning_rate": 9.566464851150078e-06, "loss": 0.0626, "step": 1983 }, { "epoch": 0.881190317566067, "grad_norm": 0.760520710752294, "learning_rate": 9.565674935612495e-06, "loss": 0.0584, "step": 1984 }, { "epoch": 0.8816344659116144, "grad_norm": 0.626615448212967, "learning_rate": 9.564884333776408e-06, "loss": 0.0596, "step": 1985 }, { "epoch": 0.8820786142571619, "grad_norm": 0.6020936472164464, "learning_rate": 9.564093045760663e-06, "loss": 0.0623, "step": 1986 }, { "epoch": 0.8825227626027093, "grad_norm": 0.7034061959408842, "learning_rate": 9.563301071684203e-06, "loss": 0.0549, "step": 1987 }, { "epoch": 0.8829669109482567, "grad_norm": 0.5695284311311956, "learning_rate": 9.562508411666077e-06, "loss": 0.0633, "step": 1988 }, { "epoch": 0.8834110592938041, "grad_norm": 0.9573516672168828, "learning_rate": 9.56171506582543e-06, "loss": 0.0653, "step": 1989 }, { "epoch": 0.8838552076393515, "grad_norm": 0.5805029234381518, "learning_rate": 9.56092103428152e-06, "loss": 0.0607, "step": 1990 }, { "epoch": 0.884299355984899, "grad_norm": 0.9854040215753008, "learning_rate": 9.560126317153702e-06, "loss": 0.0792, "step": 1991 }, { "epoch": 0.8847435043304464, "grad_norm": 0.7570653217798166, "learning_rate": 9.559330914561435e-06, "loss": 0.0574, "step": 1992 }, { "epoch": 0.8851876526759938, "grad_norm": 0.6760651246150844, "learning_rate": 9.558534826624281e-06, "loss": 0.0586, "step": 1993 }, { "epoch": 0.8856318010215412, "grad_norm": 0.7446222818408107, "learning_rate": 9.55773805346191e-06, "loss": 0.0903, "step": 1994 }, { "epoch": 0.8860759493670886, "grad_norm": 0.4971467452542264, "learning_rate": 9.556940595194085e-06, "loss": 0.0495, "step": 1995 }, { "epoch": 0.8865200977126361, "grad_norm": 0.48127766478143535, "learning_rate": 9.55614245194068e-06, "loss": 0.0476, "step": 1996 }, { "epoch": 0.8869642460581835, "grad_norm": 0.5264335771790009, "learning_rate": 9.555343623821669e-06, "loss": 0.0656, "step": 1997 }, { "epoch": 0.8874083944037309, "grad_norm": 1.0687153694098346, "learning_rate": 9.554544110957128e-06, "loss": 0.1082, "step": 1998 }, { "epoch": 0.8878525427492783, "grad_norm": 0.6068726497280238, "learning_rate": 9.553743913467241e-06, "loss": 0.0613, "step": 1999 }, { "epoch": 0.8882966910948257, "grad_norm": 0.7498237482418018, "learning_rate": 9.552943031472289e-06, "loss": 0.0771, "step": 2000 }, { "epoch": 0.8887408394403731, "grad_norm": 0.6023014910820245, "learning_rate": 9.552141465092659e-06, "loss": 0.0602, "step": 2001 }, { "epoch": 0.8891849877859205, "grad_norm": 0.6380409827625847, "learning_rate": 9.551339214448838e-06, "loss": 0.0619, "step": 2002 }, { "epoch": 0.8896291361314679, "grad_norm": 0.9507144923151234, "learning_rate": 9.55053627966142e-06, "loss": 0.0869, "step": 2003 }, { "epoch": 0.8900732844770153, "grad_norm": 0.5426047915412493, "learning_rate": 9.5497326608511e-06, "loss": 0.0571, "step": 2004 }, { "epoch": 0.8905174328225627, "grad_norm": 0.44148111117293815, "learning_rate": 9.548928358138672e-06, "loss": 0.0508, "step": 2005 }, { "epoch": 0.8909615811681102, "grad_norm": 0.6815953588544817, "learning_rate": 9.548123371645042e-06, "loss": 0.0648, "step": 2006 }, { "epoch": 0.8914057295136576, "grad_norm": 0.7228672896480205, "learning_rate": 9.547317701491207e-06, "loss": 0.0751, "step": 2007 }, { "epoch": 0.891849877859205, "grad_norm": 0.5288810936079249, "learning_rate": 9.546511347798278e-06, "loss": 0.0718, "step": 2008 }, { "epoch": 0.8922940262047524, "grad_norm": 0.6326908645656705, "learning_rate": 9.545704310687462e-06, "loss": 0.0643, "step": 2009 }, { "epoch": 0.8927381745502998, "grad_norm": 0.715169955221305, "learning_rate": 9.54489659028007e-06, "loss": 0.0551, "step": 2010 }, { "epoch": 0.8931823228958472, "grad_norm": 0.5887866106129909, "learning_rate": 9.544088186697515e-06, "loss": 0.0537, "step": 2011 }, { "epoch": 0.8936264712413946, "grad_norm": 0.9378252587331583, "learning_rate": 9.543279100061316e-06, "loss": 0.0823, "step": 2012 }, { "epoch": 0.894070619586942, "grad_norm": 0.5493178853844625, "learning_rate": 9.542469330493092e-06, "loss": 0.0518, "step": 2013 }, { "epoch": 0.8945147679324894, "grad_norm": 1.0489392256223762, "learning_rate": 9.541658878114564e-06, "loss": 0.0951, "step": 2014 }, { "epoch": 0.8949589162780368, "grad_norm": 0.9003525371835955, "learning_rate": 9.540847743047556e-06, "loss": 0.0823, "step": 2015 }, { "epoch": 0.8954030646235843, "grad_norm": 0.6349100893476417, "learning_rate": 9.540035925413997e-06, "loss": 0.0687, "step": 2016 }, { "epoch": 0.8958472129691317, "grad_norm": 0.6722685205867144, "learning_rate": 9.539223425335919e-06, "loss": 0.0584, "step": 2017 }, { "epoch": 0.8962913613146791, "grad_norm": 0.5752288899218686, "learning_rate": 9.53841024293545e-06, "loss": 0.0615, "step": 2018 }, { "epoch": 0.8967355096602265, "grad_norm": 0.6641589129330268, "learning_rate": 9.537596378334827e-06, "loss": 0.0599, "step": 2019 }, { "epoch": 0.8971796580057739, "grad_norm": 0.6171698716517034, "learning_rate": 9.53678183165639e-06, "loss": 0.0614, "step": 2020 }, { "epoch": 0.8976238063513213, "grad_norm": 0.46805323991408604, "learning_rate": 9.535966603022578e-06, "loss": 0.0497, "step": 2021 }, { "epoch": 0.8980679546968687, "grad_norm": 0.5561153083603769, "learning_rate": 9.53515069255593e-06, "loss": 0.0549, "step": 2022 }, { "epoch": 0.8985121030424161, "grad_norm": 0.8567619540393755, "learning_rate": 9.534334100379095e-06, "loss": 0.087, "step": 2023 }, { "epoch": 0.8989562513879635, "grad_norm": 0.7529977371586785, "learning_rate": 9.533516826614822e-06, "loss": 0.071, "step": 2024 }, { "epoch": 0.899400399733511, "grad_norm": 0.47166342806955874, "learning_rate": 9.532698871385957e-06, "loss": 0.0598, "step": 2025 }, { "epoch": 0.8998445480790584, "grad_norm": 0.5143678735436241, "learning_rate": 9.531880234815454e-06, "loss": 0.0564, "step": 2026 }, { "epoch": 0.9002886964246058, "grad_norm": 0.7879319949686112, "learning_rate": 9.53106091702637e-06, "loss": 0.0721, "step": 2027 }, { "epoch": 0.9007328447701533, "grad_norm": 0.9434208137805631, "learning_rate": 9.53024091814186e-06, "loss": 0.0777, "step": 2028 }, { "epoch": 0.9011769931157007, "grad_norm": 0.5393835319630947, "learning_rate": 9.529420238285185e-06, "loss": 0.0729, "step": 2029 }, { "epoch": 0.901621141461248, "grad_norm": 1.1442231822471376, "learning_rate": 9.528598877579707e-06, "loss": 0.0577, "step": 2030 }, { "epoch": 0.9020652898067955, "grad_norm": 0.6771073887283316, "learning_rate": 9.52777683614889e-06, "loss": 0.0524, "step": 2031 }, { "epoch": 0.9025094381523429, "grad_norm": 0.4601735486943438, "learning_rate": 9.5269541141163e-06, "loss": 0.049, "step": 2032 }, { "epoch": 0.9029535864978903, "grad_norm": 0.857950337442236, "learning_rate": 9.526130711605609e-06, "loss": 0.0729, "step": 2033 }, { "epoch": 0.9033977348434377, "grad_norm": 0.6334716311616548, "learning_rate": 9.525306628740585e-06, "loss": 0.0557, "step": 2034 }, { "epoch": 0.9038418831889852, "grad_norm": 0.5606743700857357, "learning_rate": 9.524481865645105e-06, "loss": 0.0529, "step": 2035 }, { "epoch": 0.9042860315345326, "grad_norm": 0.7009413752452551, "learning_rate": 9.523656422443142e-06, "loss": 0.0705, "step": 2036 }, { "epoch": 0.90473017988008, "grad_norm": 0.5676919166524141, "learning_rate": 9.522830299258773e-06, "loss": 0.0642, "step": 2037 }, { "epoch": 0.9051743282256274, "grad_norm": 0.5909379090037243, "learning_rate": 9.522003496216184e-06, "loss": 0.0611, "step": 2038 }, { "epoch": 0.9056184765711748, "grad_norm": 0.6412971030112977, "learning_rate": 9.521176013439652e-06, "loss": 0.0668, "step": 2039 }, { "epoch": 0.9060626249167222, "grad_norm": 0.6697892166854362, "learning_rate": 9.520347851053567e-06, "loss": 0.0589, "step": 2040 }, { "epoch": 0.9065067732622696, "grad_norm": 0.4472682187482963, "learning_rate": 9.51951900918241e-06, "loss": 0.0468, "step": 2041 }, { "epoch": 0.906950921607817, "grad_norm": 0.6790472985515921, "learning_rate": 9.518689487950772e-06, "loss": 0.064, "step": 2042 }, { "epoch": 0.9073950699533644, "grad_norm": 0.4982007942752425, "learning_rate": 9.517859287483347e-06, "loss": 0.0555, "step": 2043 }, { "epoch": 0.9078392182989118, "grad_norm": 0.4637933782350014, "learning_rate": 9.517028407904925e-06, "loss": 0.0565, "step": 2044 }, { "epoch": 0.9082833666444593, "grad_norm": 0.5832414481867781, "learning_rate": 9.516196849340402e-06, "loss": 0.0742, "step": 2045 }, { "epoch": 0.9087275149900067, "grad_norm": 0.5576423922621532, "learning_rate": 9.515364611914777e-06, "loss": 0.0628, "step": 2046 }, { "epoch": 0.9091716633355541, "grad_norm": 0.5229540500090215, "learning_rate": 9.514531695753146e-06, "loss": 0.0517, "step": 2047 }, { "epoch": 0.9096158116811015, "grad_norm": 0.6260810656444592, "learning_rate": 9.513698100980715e-06, "loss": 0.0828, "step": 2048 }, { "epoch": 0.9100599600266489, "grad_norm": 0.4707579422417085, "learning_rate": 9.512863827722785e-06, "loss": 0.0593, "step": 2049 }, { "epoch": 0.9105041083721963, "grad_norm": 0.5601794044808186, "learning_rate": 9.51202887610476e-06, "loss": 0.063, "step": 2050 }, { "epoch": 0.9109482567177437, "grad_norm": 0.6023400251451128, "learning_rate": 9.51119324625215e-06, "loss": 0.0686, "step": 2051 }, { "epoch": 0.9113924050632911, "grad_norm": 0.5886804844960863, "learning_rate": 9.510356938290562e-06, "loss": 0.0618, "step": 2052 }, { "epoch": 0.9118365534088385, "grad_norm": 0.5729230838893967, "learning_rate": 9.509519952345709e-06, "loss": 0.0594, "step": 2053 }, { "epoch": 0.9122807017543859, "grad_norm": 0.5913444178469247, "learning_rate": 9.508682288543405e-06, "loss": 0.0544, "step": 2054 }, { "epoch": 0.9127248500999334, "grad_norm": 0.5581180816842339, "learning_rate": 9.507843947009562e-06, "loss": 0.0489, "step": 2055 }, { "epoch": 0.9131689984454808, "grad_norm": 0.9592080607831761, "learning_rate": 9.507004927870202e-06, "loss": 0.0674, "step": 2056 }, { "epoch": 0.9136131467910282, "grad_norm": 0.48343586169312186, "learning_rate": 9.506165231251438e-06, "loss": 0.0536, "step": 2057 }, { "epoch": 0.9140572951365756, "grad_norm": 0.5238333120956294, "learning_rate": 9.505324857279494e-06, "loss": 0.0521, "step": 2058 }, { "epoch": 0.914501443482123, "grad_norm": 0.4166519007433999, "learning_rate": 9.504483806080694e-06, "loss": 0.0586, "step": 2059 }, { "epoch": 0.9149455918276704, "grad_norm": 0.6437419685785489, "learning_rate": 9.503642077781457e-06, "loss": 0.0767, "step": 2060 }, { "epoch": 0.9153897401732178, "grad_norm": 0.6570037258452504, "learning_rate": 9.502799672508314e-06, "loss": 0.0587, "step": 2061 }, { "epoch": 0.9158338885187652, "grad_norm": 0.6138821258144063, "learning_rate": 9.501956590387891e-06, "loss": 0.0736, "step": 2062 }, { "epoch": 0.9162780368643126, "grad_norm": 0.6641641647032757, "learning_rate": 9.501112831546917e-06, "loss": 0.0606, "step": 2063 }, { "epoch": 0.91672218520986, "grad_norm": 0.4693758498407545, "learning_rate": 9.500268396112224e-06, "loss": 0.056, "step": 2064 }, { "epoch": 0.9171663335554076, "grad_norm": 0.7149305856179645, "learning_rate": 9.499423284210745e-06, "loss": 0.0627, "step": 2065 }, { "epoch": 0.917610481900955, "grad_norm": 0.522841296737736, "learning_rate": 9.498577495969515e-06, "loss": 0.0722, "step": 2066 }, { "epoch": 0.9180546302465024, "grad_norm": 0.568329647769687, "learning_rate": 9.497731031515669e-06, "loss": 0.0546, "step": 2067 }, { "epoch": 0.9184987785920498, "grad_norm": 0.7376883712368818, "learning_rate": 9.496883890976445e-06, "loss": 0.0743, "step": 2068 }, { "epoch": 0.9189429269375972, "grad_norm": 0.8322448418666403, "learning_rate": 9.496036074479184e-06, "loss": 0.0505, "step": 2069 }, { "epoch": 0.9193870752831446, "grad_norm": 0.47540090764910864, "learning_rate": 9.495187582151328e-06, "loss": 0.0503, "step": 2070 }, { "epoch": 0.919831223628692, "grad_norm": 0.5694137714424392, "learning_rate": 9.494338414120419e-06, "loss": 0.0552, "step": 2071 }, { "epoch": 0.9202753719742394, "grad_norm": 0.4737570623495173, "learning_rate": 9.493488570514099e-06, "loss": 0.0546, "step": 2072 }, { "epoch": 0.9207195203197868, "grad_norm": 0.5080926402077233, "learning_rate": 9.492638051460116e-06, "loss": 0.0634, "step": 2073 }, { "epoch": 0.9211636686653342, "grad_norm": 0.5930821340624887, "learning_rate": 9.491786857086318e-06, "loss": 0.0558, "step": 2074 }, { "epoch": 0.9216078170108817, "grad_norm": 0.6489494784499551, "learning_rate": 9.490934987520653e-06, "loss": 0.0574, "step": 2075 }, { "epoch": 0.9220519653564291, "grad_norm": 0.4274889745604907, "learning_rate": 9.490082442891171e-06, "loss": 0.0484, "step": 2076 }, { "epoch": 0.9224961137019765, "grad_norm": 0.7132092664683398, "learning_rate": 9.489229223326027e-06, "loss": 0.0506, "step": 2077 }, { "epoch": 0.9229402620475239, "grad_norm": 0.7098655395611968, "learning_rate": 9.48837532895347e-06, "loss": 0.0693, "step": 2078 }, { "epoch": 0.9233844103930713, "grad_norm": 0.6125346051926611, "learning_rate": 9.487520759901858e-06, "loss": 0.0665, "step": 2079 }, { "epoch": 0.9238285587386187, "grad_norm": 0.5007229228967727, "learning_rate": 9.486665516299646e-06, "loss": 0.0763, "step": 2080 }, { "epoch": 0.9242727070841661, "grad_norm": 0.6228392832149312, "learning_rate": 9.485809598275391e-06, "loss": 0.0602, "step": 2081 }, { "epoch": 0.9247168554297135, "grad_norm": 0.5392941744359442, "learning_rate": 9.484953005957753e-06, "loss": 0.0621, "step": 2082 }, { "epoch": 0.9251610037752609, "grad_norm": 0.6769648255970269, "learning_rate": 9.484095739475492e-06, "loss": 0.0829, "step": 2083 }, { "epoch": 0.9256051521208084, "grad_norm": 1.0001370312264637, "learning_rate": 9.48323779895747e-06, "loss": 0.0616, "step": 2084 }, { "epoch": 0.9260493004663558, "grad_norm": 0.6960955983895617, "learning_rate": 9.482379184532652e-06, "loss": 0.0701, "step": 2085 }, { "epoch": 0.9264934488119032, "grad_norm": 0.5402499843217174, "learning_rate": 9.481519896330098e-06, "loss": 0.0557, "step": 2086 }, { "epoch": 0.9269375971574506, "grad_norm": 0.5511208018996919, "learning_rate": 9.480659934478975e-06, "loss": 0.0613, "step": 2087 }, { "epoch": 0.927381745502998, "grad_norm": 0.5938957138905275, "learning_rate": 9.479799299108553e-06, "loss": 0.0808, "step": 2088 }, { "epoch": 0.9278258938485454, "grad_norm": 0.7161437453790136, "learning_rate": 9.478937990348196e-06, "loss": 0.0606, "step": 2089 }, { "epoch": 0.9282700421940928, "grad_norm": 0.5468536840129006, "learning_rate": 9.478076008327377e-06, "loss": 0.0513, "step": 2090 }, { "epoch": 0.9287141905396402, "grad_norm": 0.9707762113595154, "learning_rate": 9.477213353175663e-06, "loss": 0.0869, "step": 2091 }, { "epoch": 0.9291583388851876, "grad_norm": 0.7985679050710678, "learning_rate": 9.476350025022728e-06, "loss": 0.0794, "step": 2092 }, { "epoch": 0.929602487230735, "grad_norm": 0.6127936826464601, "learning_rate": 9.475486023998345e-06, "loss": 0.055, "step": 2093 }, { "epoch": 0.9300466355762825, "grad_norm": 0.6334292089301367, "learning_rate": 9.474621350232387e-06, "loss": 0.0801, "step": 2094 }, { "epoch": 0.9304907839218299, "grad_norm": 0.5293672478661852, "learning_rate": 9.47375600385483e-06, "loss": 0.0463, "step": 2095 }, { "epoch": 0.9309349322673773, "grad_norm": 0.6942808305366424, "learning_rate": 9.47288998499575e-06, "loss": 0.0772, "step": 2096 }, { "epoch": 0.9313790806129247, "grad_norm": 0.801229643208364, "learning_rate": 9.472023293785322e-06, "loss": 0.0838, "step": 2097 }, { "epoch": 0.9318232289584721, "grad_norm": 0.5539233738084255, "learning_rate": 9.471155930353829e-06, "loss": 0.0477, "step": 2098 }, { "epoch": 0.9322673773040195, "grad_norm": 0.5083333963598577, "learning_rate": 9.470287894831648e-06, "loss": 0.0493, "step": 2099 }, { "epoch": 0.9327115256495669, "grad_norm": 0.5714592576450407, "learning_rate": 9.469419187349258e-06, "loss": 0.0518, "step": 2100 }, { "epoch": 0.9331556739951143, "grad_norm": 0.5821855293618007, "learning_rate": 9.468549808037241e-06, "loss": 0.0584, "step": 2101 }, { "epoch": 0.9335998223406617, "grad_norm": 0.5808549016034106, "learning_rate": 9.467679757026283e-06, "loss": 0.0641, "step": 2102 }, { "epoch": 0.9340439706862091, "grad_norm": 0.5185841811549224, "learning_rate": 9.466809034447165e-06, "loss": 0.0484, "step": 2103 }, { "epoch": 0.9344881190317567, "grad_norm": 0.46447107395363035, "learning_rate": 9.46593764043077e-06, "loss": 0.0538, "step": 2104 }, { "epoch": 0.9349322673773041, "grad_norm": 0.5957468835643678, "learning_rate": 9.465065575108084e-06, "loss": 0.0687, "step": 2105 }, { "epoch": 0.9353764157228515, "grad_norm": 0.6722403187627196, "learning_rate": 9.464192838610195e-06, "loss": 0.0633, "step": 2106 }, { "epoch": 0.9358205640683989, "grad_norm": 0.7367314225600079, "learning_rate": 9.463319431068289e-06, "loss": 0.0688, "step": 2107 }, { "epoch": 0.9362647124139463, "grad_norm": 0.5272343499456363, "learning_rate": 9.462445352613654e-06, "loss": 0.0598, "step": 2108 }, { "epoch": 0.9367088607594937, "grad_norm": 0.5251390084394981, "learning_rate": 9.461570603377678e-06, "loss": 0.0461, "step": 2109 }, { "epoch": 0.9371530091050411, "grad_norm": 0.45228924863036196, "learning_rate": 9.460695183491852e-06, "loss": 0.0551, "step": 2110 }, { "epoch": 0.9375971574505885, "grad_norm": 0.5609307631212084, "learning_rate": 9.459819093087765e-06, "loss": 0.0582, "step": 2111 }, { "epoch": 0.9380413057961359, "grad_norm": 0.6083990162855326, "learning_rate": 9.45894233229711e-06, "loss": 0.0727, "step": 2112 }, { "epoch": 0.9384854541416833, "grad_norm": 0.6031459810485845, "learning_rate": 9.458064901251679e-06, "loss": 0.055, "step": 2113 }, { "epoch": 0.9389296024872308, "grad_norm": 0.5698111663947624, "learning_rate": 9.457186800083363e-06, "loss": 0.0542, "step": 2114 }, { "epoch": 0.9393737508327782, "grad_norm": 0.5620094721834581, "learning_rate": 9.456308028924157e-06, "loss": 0.0647, "step": 2115 }, { "epoch": 0.9398178991783256, "grad_norm": 0.47201186226610237, "learning_rate": 9.455428587906154e-06, "loss": 0.0587, "step": 2116 }, { "epoch": 0.940262047523873, "grad_norm": 0.5090071386100868, "learning_rate": 9.45454847716155e-06, "loss": 0.0485, "step": 2117 }, { "epoch": 0.9407061958694204, "grad_norm": 0.6100904698718481, "learning_rate": 9.453667696822644e-06, "loss": 0.055, "step": 2118 }, { "epoch": 0.9411503442149678, "grad_norm": 0.5793242183604661, "learning_rate": 9.452786247021825e-06, "loss": 0.0633, "step": 2119 }, { "epoch": 0.9415944925605152, "grad_norm": 0.6603367019430398, "learning_rate": 9.451904127891593e-06, "loss": 0.062, "step": 2120 }, { "epoch": 0.9420386409060626, "grad_norm": 0.48144950864097125, "learning_rate": 9.451021339564549e-06, "loss": 0.0541, "step": 2121 }, { "epoch": 0.94248278925161, "grad_norm": 0.8619120091253565, "learning_rate": 9.450137882173385e-06, "loss": 0.0622, "step": 2122 }, { "epoch": 0.9429269375971574, "grad_norm": 0.574402948879313, "learning_rate": 9.449253755850902e-06, "loss": 0.0579, "step": 2123 }, { "epoch": 0.9433710859427049, "grad_norm": 0.5040572133776431, "learning_rate": 9.448368960730002e-06, "loss": 0.0471, "step": 2124 }, { "epoch": 0.9438152342882523, "grad_norm": 0.8130523767663201, "learning_rate": 9.447483496943682e-06, "loss": 0.0607, "step": 2125 }, { "epoch": 0.9442593826337997, "grad_norm": 0.6488075902611455, "learning_rate": 9.446597364625043e-06, "loss": 0.0495, "step": 2126 }, { "epoch": 0.9447035309793471, "grad_norm": 0.516081567488049, "learning_rate": 9.445710563907286e-06, "loss": 0.0597, "step": 2127 }, { "epoch": 0.9451476793248945, "grad_norm": 0.6477844164341606, "learning_rate": 9.444823094923712e-06, "loss": 0.0581, "step": 2128 }, { "epoch": 0.9455918276704419, "grad_norm": 0.5960225920848716, "learning_rate": 9.44393495780772e-06, "loss": 0.0693, "step": 2129 }, { "epoch": 0.9460359760159893, "grad_norm": 0.6610698243004944, "learning_rate": 9.443046152692818e-06, "loss": 0.0602, "step": 2130 }, { "epoch": 0.9464801243615367, "grad_norm": 0.501365231071439, "learning_rate": 9.442156679712604e-06, "loss": 0.0507, "step": 2131 }, { "epoch": 0.9469242727070841, "grad_norm": 0.5333891976903705, "learning_rate": 9.441266539000782e-06, "loss": 0.0551, "step": 2132 }, { "epoch": 0.9473684210526315, "grad_norm": 0.7140797441741807, "learning_rate": 9.440375730691154e-06, "loss": 0.0559, "step": 2133 }, { "epoch": 0.947812569398179, "grad_norm": 1.0212073230522318, "learning_rate": 9.439484254917626e-06, "loss": 0.0601, "step": 2134 }, { "epoch": 0.9482567177437264, "grad_norm": 0.6547086006006103, "learning_rate": 9.4385921118142e-06, "loss": 0.0678, "step": 2135 }, { "epoch": 0.9487008660892738, "grad_norm": 0.5668847306270909, "learning_rate": 9.437699301514983e-06, "loss": 0.0562, "step": 2136 }, { "epoch": 0.9491450144348212, "grad_norm": 0.535851688994489, "learning_rate": 9.436805824154175e-06, "loss": 0.0536, "step": 2137 }, { "epoch": 0.9495891627803686, "grad_norm": 0.6725155861850945, "learning_rate": 9.435911679866085e-06, "loss": 0.0636, "step": 2138 }, { "epoch": 0.950033311125916, "grad_norm": 0.472128745022765, "learning_rate": 9.435016868785117e-06, "loss": 0.0514, "step": 2139 }, { "epoch": 0.9504774594714634, "grad_norm": 0.5611219746408826, "learning_rate": 9.434121391045775e-06, "loss": 0.057, "step": 2140 }, { "epoch": 0.9509216078170108, "grad_norm": 0.5197846459934121, "learning_rate": 9.433225246782664e-06, "loss": 0.0497, "step": 2141 }, { "epoch": 0.9513657561625583, "grad_norm": 0.6901510307382841, "learning_rate": 9.432328436130493e-06, "loss": 0.0588, "step": 2142 }, { "epoch": 0.9518099045081057, "grad_norm": 0.6595671077813987, "learning_rate": 9.431430959224067e-06, "loss": 0.0852, "step": 2143 }, { "epoch": 0.9522540528536532, "grad_norm": 0.5676059741434758, "learning_rate": 9.43053281619829e-06, "loss": 0.0507, "step": 2144 }, { "epoch": 0.9526982011992006, "grad_norm": 0.4855036199663289, "learning_rate": 9.429634007188169e-06, "loss": 0.0474, "step": 2145 }, { "epoch": 0.953142349544748, "grad_norm": 0.732709155194895, "learning_rate": 9.42873453232881e-06, "loss": 0.0619, "step": 2146 }, { "epoch": 0.9535864978902954, "grad_norm": 0.6306866348324036, "learning_rate": 9.42783439175542e-06, "loss": 0.0772, "step": 2147 }, { "epoch": 0.9540306462358428, "grad_norm": 0.6444304070837995, "learning_rate": 9.426933585603304e-06, "loss": 0.0565, "step": 2148 }, { "epoch": 0.9544747945813902, "grad_norm": 0.5421189880885259, "learning_rate": 9.42603211400787e-06, "loss": 0.0655, "step": 2149 }, { "epoch": 0.9549189429269376, "grad_norm": 0.5350689023537523, "learning_rate": 9.425129977104626e-06, "loss": 0.0598, "step": 2150 }, { "epoch": 0.955363091272485, "grad_norm": 0.3832349941062708, "learning_rate": 9.424227175029175e-06, "loss": 0.0474, "step": 2151 }, { "epoch": 0.9558072396180324, "grad_norm": 0.5764500588860708, "learning_rate": 9.423323707917226e-06, "loss": 0.0673, "step": 2152 }, { "epoch": 0.9562513879635799, "grad_norm": 0.41134986778955357, "learning_rate": 9.422419575904584e-06, "loss": 0.0449, "step": 2153 }, { "epoch": 0.9566955363091273, "grad_norm": 0.40411844058020524, "learning_rate": 9.421514779127156e-06, "loss": 0.0457, "step": 2154 }, { "epoch": 0.9571396846546747, "grad_norm": 0.5724570461924485, "learning_rate": 9.420609317720948e-06, "loss": 0.064, "step": 2155 }, { "epoch": 0.9575838330002221, "grad_norm": 0.5481138081401853, "learning_rate": 9.419703191822067e-06, "loss": 0.0512, "step": 2156 }, { "epoch": 0.9580279813457695, "grad_norm": 0.5977006719684248, "learning_rate": 9.418796401566719e-06, "loss": 0.0452, "step": 2157 }, { "epoch": 0.9584721296913169, "grad_norm": 0.5349085172455206, "learning_rate": 9.417888947091208e-06, "loss": 0.065, "step": 2158 }, { "epoch": 0.9589162780368643, "grad_norm": 0.6787928669434861, "learning_rate": 9.416980828531944e-06, "loss": 0.0677, "step": 2159 }, { "epoch": 0.9593604263824117, "grad_norm": 0.709282953075048, "learning_rate": 9.416072046025429e-06, "loss": 0.0729, "step": 2160 }, { "epoch": 0.9598045747279591, "grad_norm": 0.6790718166354139, "learning_rate": 9.415162599708268e-06, "loss": 0.0699, "step": 2161 }, { "epoch": 0.9602487230735065, "grad_norm": 0.545039186094067, "learning_rate": 9.414252489717168e-06, "loss": 0.0594, "step": 2162 }, { "epoch": 0.960692871419054, "grad_norm": 0.7438799552703795, "learning_rate": 9.413341716188934e-06, "loss": 0.0653, "step": 2163 }, { "epoch": 0.9611370197646014, "grad_norm": 0.5419774566213018, "learning_rate": 9.412430279260473e-06, "loss": 0.0452, "step": 2164 }, { "epoch": 0.9615811681101488, "grad_norm": 0.5417611334038369, "learning_rate": 9.411518179068785e-06, "loss": 0.0695, "step": 2165 }, { "epoch": 0.9620253164556962, "grad_norm": 0.556851360606536, "learning_rate": 9.410605415750977e-06, "loss": 0.0612, "step": 2166 }, { "epoch": 0.9624694648012436, "grad_norm": 0.8789606927293152, "learning_rate": 9.40969198944425e-06, "loss": 0.0604, "step": 2167 }, { "epoch": 0.962913613146791, "grad_norm": 0.5933413334023052, "learning_rate": 9.40877790028591e-06, "loss": 0.0623, "step": 2168 }, { "epoch": 0.9633577614923384, "grad_norm": 0.5946398594923591, "learning_rate": 9.407863148413361e-06, "loss": 0.0419, "step": 2169 }, { "epoch": 0.9638019098378858, "grad_norm": 0.48221233179240436, "learning_rate": 9.406947733964103e-06, "loss": 0.0546, "step": 2170 }, { "epoch": 0.9642460581834332, "grad_norm": 0.6525276948819981, "learning_rate": 9.40603165707574e-06, "loss": 0.0526, "step": 2171 }, { "epoch": 0.9646902065289806, "grad_norm": 0.7205786693472772, "learning_rate": 9.405114917885973e-06, "loss": 0.0684, "step": 2172 }, { "epoch": 0.9651343548745281, "grad_norm": 0.5044988796095775, "learning_rate": 9.404197516532605e-06, "loss": 0.0478, "step": 2173 }, { "epoch": 0.9655785032200755, "grad_norm": 0.44346462918671525, "learning_rate": 9.403279453153536e-06, "loss": 0.0446, "step": 2174 }, { "epoch": 0.966022651565623, "grad_norm": 0.5173823021640699, "learning_rate": 9.402360727886766e-06, "loss": 0.0623, "step": 2175 }, { "epoch": 0.9664667999111703, "grad_norm": 0.5687155650294274, "learning_rate": 9.401441340870397e-06, "loss": 0.0611, "step": 2176 }, { "epoch": 0.9669109482567178, "grad_norm": 0.6467987726672418, "learning_rate": 9.400521292242626e-06, "loss": 0.0529, "step": 2177 }, { "epoch": 0.9673550966022652, "grad_norm": 0.5130913991837206, "learning_rate": 9.399600582141752e-06, "loss": 0.0599, "step": 2178 }, { "epoch": 0.9677992449478126, "grad_norm": 0.6670998717556929, "learning_rate": 9.398679210706176e-06, "loss": 0.0682, "step": 2179 }, { "epoch": 0.96824339329336, "grad_norm": 0.7694218770583813, "learning_rate": 9.397757178074392e-06, "loss": 0.0686, "step": 2180 }, { "epoch": 0.9686875416389074, "grad_norm": 0.7755428807155127, "learning_rate": 9.396834484385e-06, "loss": 0.0574, "step": 2181 }, { "epoch": 0.9691316899844548, "grad_norm": 0.44478529742795087, "learning_rate": 9.395911129776699e-06, "loss": 0.0429, "step": 2182 }, { "epoch": 0.9695758383300023, "grad_norm": 0.7140201406983687, "learning_rate": 9.394987114388278e-06, "loss": 0.0711, "step": 2183 }, { "epoch": 0.9700199866755497, "grad_norm": 0.8530149060800487, "learning_rate": 9.394062438358637e-06, "loss": 0.0529, "step": 2184 }, { "epoch": 0.9704641350210971, "grad_norm": 0.6524072835443322, "learning_rate": 9.39313710182677e-06, "loss": 0.0624, "step": 2185 }, { "epoch": 0.9709082833666445, "grad_norm": 0.4290712533966452, "learning_rate": 9.39221110493177e-06, "loss": 0.041, "step": 2186 }, { "epoch": 0.9713524317121919, "grad_norm": 0.5324670553025179, "learning_rate": 9.39128444781283e-06, "loss": 0.0477, "step": 2187 }, { "epoch": 0.9717965800577393, "grad_norm": 0.5901470773440277, "learning_rate": 9.390357130609243e-06, "loss": 0.0568, "step": 2188 }, { "epoch": 0.9722407284032867, "grad_norm": 0.7720115526671916, "learning_rate": 9.3894291534604e-06, "loss": 0.0559, "step": 2189 }, { "epoch": 0.9726848767488341, "grad_norm": 0.6618879002055548, "learning_rate": 9.38850051650579e-06, "loss": 0.0789, "step": 2190 }, { "epoch": 0.9731290250943815, "grad_norm": 0.6242983609779144, "learning_rate": 9.387571219885008e-06, "loss": 0.0709, "step": 2191 }, { "epoch": 0.9735731734399289, "grad_norm": 0.5488182994392276, "learning_rate": 9.386641263737736e-06, "loss": 0.0483, "step": 2192 }, { "epoch": 0.9740173217854764, "grad_norm": 0.5678755050400883, "learning_rate": 9.38571064820377e-06, "loss": 0.0639, "step": 2193 }, { "epoch": 0.9744614701310238, "grad_norm": 0.7972177120971958, "learning_rate": 9.384779373422992e-06, "loss": 0.0688, "step": 2194 }, { "epoch": 0.9749056184765712, "grad_norm": 0.4181144019615441, "learning_rate": 9.38384743953539e-06, "loss": 0.0434, "step": 2195 }, { "epoch": 0.9753497668221186, "grad_norm": 1.0045462377103442, "learning_rate": 9.382914846681049e-06, "loss": 0.0626, "step": 2196 }, { "epoch": 0.975793915167666, "grad_norm": 0.5663797823359984, "learning_rate": 9.381981595000153e-06, "loss": 0.0536, "step": 2197 }, { "epoch": 0.9762380635132134, "grad_norm": 0.5230033029520286, "learning_rate": 9.381047684632986e-06, "loss": 0.0431, "step": 2198 }, { "epoch": 0.9766822118587608, "grad_norm": 0.5941306490458375, "learning_rate": 9.380113115719933e-06, "loss": 0.0514, "step": 2199 }, { "epoch": 0.9771263602043082, "grad_norm": 0.7074167144721983, "learning_rate": 9.379177888401473e-06, "loss": 0.0516, "step": 2200 }, { "epoch": 0.9775705085498556, "grad_norm": 0.8936942623677783, "learning_rate": 9.378242002818186e-06, "loss": 0.0634, "step": 2201 }, { "epoch": 0.978014656895403, "grad_norm": 0.8015956656924698, "learning_rate": 9.377305459110754e-06, "loss": 0.0701, "step": 2202 }, { "epoch": 0.9784588052409505, "grad_norm": 0.5888281751564788, "learning_rate": 9.376368257419955e-06, "loss": 0.0735, "step": 2203 }, { "epoch": 0.9789029535864979, "grad_norm": 0.6871960760484549, "learning_rate": 9.375430397886661e-06, "loss": 0.0528, "step": 2204 }, { "epoch": 0.9793471019320453, "grad_norm": 0.5915896411171675, "learning_rate": 9.374491880651856e-06, "loss": 0.0577, "step": 2205 }, { "epoch": 0.9797912502775927, "grad_norm": 0.6268683607318559, "learning_rate": 9.373552705856612e-06, "loss": 0.0511, "step": 2206 }, { "epoch": 0.9802353986231401, "grad_norm": 0.5959527967947893, "learning_rate": 9.372612873642101e-06, "loss": 0.0577, "step": 2207 }, { "epoch": 0.9806795469686875, "grad_norm": 0.6376605781281672, "learning_rate": 9.3716723841496e-06, "loss": 0.056, "step": 2208 }, { "epoch": 0.9811236953142349, "grad_norm": 0.6655725615122635, "learning_rate": 9.370731237520476e-06, "loss": 0.0495, "step": 2209 }, { "epoch": 0.9815678436597823, "grad_norm": 0.6178758449973094, "learning_rate": 9.369789433896201e-06, "loss": 0.0785, "step": 2210 }, { "epoch": 0.9820119920053297, "grad_norm": 0.6203954478751535, "learning_rate": 9.368846973418347e-06, "loss": 0.0541, "step": 2211 }, { "epoch": 0.9824561403508771, "grad_norm": 0.5844524216400101, "learning_rate": 9.367903856228575e-06, "loss": 0.0484, "step": 2212 }, { "epoch": 0.9829002886964247, "grad_norm": 1.3376432627550732, "learning_rate": 9.366960082468658e-06, "loss": 0.0523, "step": 2213 }, { "epoch": 0.983344437041972, "grad_norm": 0.7650415999284177, "learning_rate": 9.36601565228046e-06, "loss": 0.0758, "step": 2214 }, { "epoch": 0.9837885853875195, "grad_norm": 0.6031770466123566, "learning_rate": 9.365070565805941e-06, "loss": 0.0552, "step": 2215 }, { "epoch": 0.9842327337330669, "grad_norm": 0.5317856643301976, "learning_rate": 9.364124823187169e-06, "loss": 0.0495, "step": 2216 }, { "epoch": 0.9846768820786143, "grad_norm": 0.697207175537678, "learning_rate": 9.363178424566302e-06, "loss": 0.0575, "step": 2217 }, { "epoch": 0.9851210304241617, "grad_norm": 0.6901112062939687, "learning_rate": 9.3622313700856e-06, "loss": 0.0558, "step": 2218 }, { "epoch": 0.9855651787697091, "grad_norm": 0.7725648574354754, "learning_rate": 9.361283659887421e-06, "loss": 0.0526, "step": 2219 }, { "epoch": 0.9860093271152565, "grad_norm": 0.4919891771117441, "learning_rate": 9.360335294114222e-06, "loss": 0.0517, "step": 2220 }, { "epoch": 0.9864534754608039, "grad_norm": 0.604042614778997, "learning_rate": 9.359386272908561e-06, "loss": 0.0654, "step": 2221 }, { "epoch": 0.9868976238063514, "grad_norm": 0.533434108631547, "learning_rate": 9.35843659641309e-06, "loss": 0.0587, "step": 2222 }, { "epoch": 0.9873417721518988, "grad_norm": 0.627831211955756, "learning_rate": 9.35748626477056e-06, "loss": 0.0577, "step": 2223 }, { "epoch": 0.9877859204974462, "grad_norm": 0.48609069405336236, "learning_rate": 9.356535278123826e-06, "loss": 0.0625, "step": 2224 }, { "epoch": 0.9882300688429936, "grad_norm": 0.5441221898111536, "learning_rate": 9.355583636615832e-06, "loss": 0.0671, "step": 2225 }, { "epoch": 0.988674217188541, "grad_norm": 0.6653694627323021, "learning_rate": 9.354631340389633e-06, "loss": 0.0678, "step": 2226 }, { "epoch": 0.9891183655340884, "grad_norm": 0.49687550533635927, "learning_rate": 9.353678389588367e-06, "loss": 0.048, "step": 2227 }, { "epoch": 0.9895625138796358, "grad_norm": 0.5456376905073873, "learning_rate": 9.352724784355286e-06, "loss": 0.0763, "step": 2228 }, { "epoch": 0.9900066622251832, "grad_norm": 0.5234550844694897, "learning_rate": 9.35177052483373e-06, "loss": 0.0573, "step": 2229 }, { "epoch": 0.9904508105707306, "grad_norm": 0.3992910106511535, "learning_rate": 9.35081561116714e-06, "loss": 0.0422, "step": 2230 }, { "epoch": 0.990894958916278, "grad_norm": 0.4091987827719479, "learning_rate": 9.349860043499056e-06, "loss": 0.0474, "step": 2231 }, { "epoch": 0.9913391072618255, "grad_norm": 0.5342639072522524, "learning_rate": 9.348903821973114e-06, "loss": 0.0663, "step": 2232 }, { "epoch": 0.9917832556073729, "grad_norm": 0.4467060129332395, "learning_rate": 9.347946946733055e-06, "loss": 0.0443, "step": 2233 }, { "epoch": 0.9922274039529203, "grad_norm": 0.4523315857358909, "learning_rate": 9.346989417922712e-06, "loss": 0.0415, "step": 2234 }, { "epoch": 0.9926715522984677, "grad_norm": 0.525064951465678, "learning_rate": 9.346031235686014e-06, "loss": 0.0663, "step": 2235 }, { "epoch": 0.9931157006440151, "grad_norm": 0.5615837318533068, "learning_rate": 9.345072400166999e-06, "loss": 0.0558, "step": 2236 }, { "epoch": 0.9935598489895625, "grad_norm": 0.5550096636418779, "learning_rate": 9.34411291150979e-06, "loss": 0.0699, "step": 2237 }, { "epoch": 0.9940039973351099, "grad_norm": 0.9242044346261947, "learning_rate": 9.343152769858616e-06, "loss": 0.0623, "step": 2238 }, { "epoch": 0.9944481456806573, "grad_norm": 0.6358712068894604, "learning_rate": 9.342191975357806e-06, "loss": 0.0675, "step": 2239 }, { "epoch": 0.9948922940262047, "grad_norm": 0.5189758212655892, "learning_rate": 9.34123052815178e-06, "loss": 0.0446, "step": 2240 }, { "epoch": 0.9953364423717521, "grad_norm": 0.4493321450297416, "learning_rate": 9.340268428385062e-06, "loss": 0.0416, "step": 2241 }, { "epoch": 0.9957805907172996, "grad_norm": 0.4052216095743849, "learning_rate": 9.339305676202268e-06, "loss": 0.0524, "step": 2242 }, { "epoch": 0.996224739062847, "grad_norm": 0.4257914645053657, "learning_rate": 9.338342271748122e-06, "loss": 0.0439, "step": 2243 }, { "epoch": 0.9966688874083944, "grad_norm": 0.5443368549896095, "learning_rate": 9.337378215167436e-06, "loss": 0.0494, "step": 2244 }, { "epoch": 0.9971130357539418, "grad_norm": 0.48744281644400594, "learning_rate": 9.336413506605123e-06, "loss": 0.0692, "step": 2245 }, { "epoch": 0.9975571840994892, "grad_norm": 0.730043582195284, "learning_rate": 9.335448146206201e-06, "loss": 0.0613, "step": 2246 }, { "epoch": 0.9980013324450366, "grad_norm": 0.6666928317513323, "learning_rate": 9.334482134115774e-06, "loss": 0.0626, "step": 2247 }, { "epoch": 0.998445480790584, "grad_norm": 1.1582138795192063, "learning_rate": 9.333515470479052e-06, "loss": 0.0706, "step": 2248 }, { "epoch": 0.9988896291361314, "grad_norm": 0.41845565011499725, "learning_rate": 9.332548155441341e-06, "loss": 0.0428, "step": 2249 }, { "epoch": 0.9993337774816788, "grad_norm": 0.49853102857677845, "learning_rate": 9.331580189148047e-06, "loss": 0.0498, "step": 2250 }, { "epoch": 0.9997779258272262, "grad_norm": 0.469460122533607, "learning_rate": 9.330611571744668e-06, "loss": 0.0584, "step": 2251 }, { "epoch": 0.9997779258272262, "eval_loss": 0.06305240094661713, "eval_runtime": 420.7122, "eval_samples_per_second": 36.048, "eval_steps_per_second": 1.127, "step": 2251 }, { "epoch": 1.0002220741727736, "grad_norm": 0.8383200378207877, "learning_rate": 9.329642303376806e-06, "loss": 0.0866, "step": 2252 }, { "epoch": 1.0006662225183212, "grad_norm": 0.4162547630354714, "learning_rate": 9.328672384190158e-06, "loss": 0.044, "step": 2253 }, { "epoch": 1.0011103708638684, "grad_norm": 0.5811219466400265, "learning_rate": 9.327701814330521e-06, "loss": 0.0489, "step": 2254 }, { "epoch": 1.001554519209416, "grad_norm": 0.4943074740960733, "learning_rate": 9.326730593943784e-06, "loss": 0.0532, "step": 2255 }, { "epoch": 1.0019986675549633, "grad_norm": 0.5854940362247197, "learning_rate": 9.325758723175942e-06, "loss": 0.0447, "step": 2256 }, { "epoch": 1.0024428159005108, "grad_norm": 0.6734583854523404, "learning_rate": 9.324786202173082e-06, "loss": 0.0565, "step": 2257 }, { "epoch": 1.0028869642460583, "grad_norm": 0.626997176192022, "learning_rate": 9.32381303108139e-06, "loss": 0.0554, "step": 2258 }, { "epoch": 1.0033311125916056, "grad_norm": 0.7899011567544635, "learning_rate": 9.322839210047152e-06, "loss": 0.0541, "step": 2259 }, { "epoch": 1.003775260937153, "grad_norm": 0.44777931003199795, "learning_rate": 9.321864739216747e-06, "loss": 0.0475, "step": 2260 }, { "epoch": 1.0042194092827004, "grad_norm": 0.7176524264984108, "learning_rate": 9.320889618736657e-06, "loss": 0.0657, "step": 2261 }, { "epoch": 1.0046635576282479, "grad_norm": 0.6297899790013282, "learning_rate": 9.319913848753457e-06, "loss": 0.0548, "step": 2262 }, { "epoch": 1.0051077059737952, "grad_norm": 0.6829050036731806, "learning_rate": 9.318937429413823e-06, "loss": 0.0677, "step": 2263 }, { "epoch": 1.0055518543193427, "grad_norm": 0.6099066772979853, "learning_rate": 9.31796036086453e-06, "loss": 0.0674, "step": 2264 }, { "epoch": 1.00599600266489, "grad_norm": 0.5220221509404842, "learning_rate": 9.316982643252444e-06, "loss": 0.0489, "step": 2265 }, { "epoch": 1.0064401510104375, "grad_norm": 0.6319552346310404, "learning_rate": 9.316004276724533e-06, "loss": 0.0566, "step": 2266 }, { "epoch": 1.006884299355985, "grad_norm": 0.75308545956684, "learning_rate": 9.315025261427864e-06, "loss": 0.0611, "step": 2267 }, { "epoch": 1.0073284477015323, "grad_norm": 0.5854190329106237, "learning_rate": 9.314045597509598e-06, "loss": 0.0589, "step": 2268 }, { "epoch": 1.0077725960470798, "grad_norm": 0.4592027406322817, "learning_rate": 9.313065285116997e-06, "loss": 0.0363, "step": 2269 }, { "epoch": 1.008216744392627, "grad_norm": 0.7355227136248891, "learning_rate": 9.312084324397416e-06, "loss": 0.0666, "step": 2270 }, { "epoch": 1.0086608927381746, "grad_norm": 0.6811364248406817, "learning_rate": 9.311102715498312e-06, "loss": 0.0664, "step": 2271 }, { "epoch": 1.009105041083722, "grad_norm": 0.4859582181876178, "learning_rate": 9.310120458567238e-06, "loss": 0.0455, "step": 2272 }, { "epoch": 1.0095491894292694, "grad_norm": 0.6126182761710498, "learning_rate": 9.309137553751843e-06, "loss": 0.0568, "step": 2273 }, { "epoch": 1.0099933377748167, "grad_norm": 0.6903438432741064, "learning_rate": 9.308154001199874e-06, "loss": 0.0546, "step": 2274 }, { "epoch": 1.0104374861203642, "grad_norm": 0.5254671203092518, "learning_rate": 9.307169801059175e-06, "loss": 0.0518, "step": 2275 }, { "epoch": 1.0108816344659115, "grad_norm": 0.502322384644144, "learning_rate": 9.30618495347769e-06, "loss": 0.0528, "step": 2276 }, { "epoch": 1.011325782811459, "grad_norm": 0.4570165148396037, "learning_rate": 9.305199458603456e-06, "loss": 0.0423, "step": 2277 }, { "epoch": 1.0117699311570065, "grad_norm": 0.6475436488464652, "learning_rate": 9.304213316584612e-06, "loss": 0.0539, "step": 2278 }, { "epoch": 1.0122140795025538, "grad_norm": 0.5830527566752789, "learning_rate": 9.30322652756939e-06, "loss": 0.0533, "step": 2279 }, { "epoch": 1.0126582278481013, "grad_norm": 0.5218861392382439, "learning_rate": 9.302239091706121e-06, "loss": 0.0478, "step": 2280 }, { "epoch": 1.0131023761936486, "grad_norm": 0.7938697599205536, "learning_rate": 9.301251009143236e-06, "loss": 0.0913, "step": 2281 }, { "epoch": 1.0135465245391961, "grad_norm": 0.5550504645736817, "learning_rate": 9.300262280029257e-06, "loss": 0.0644, "step": 2282 }, { "epoch": 1.0139906728847434, "grad_norm": 0.594954378218846, "learning_rate": 9.29927290451281e-06, "loss": 0.0541, "step": 2283 }, { "epoch": 1.014434821230291, "grad_norm": 0.9298600097346652, "learning_rate": 9.298282882742612e-06, "loss": 0.0518, "step": 2284 }, { "epoch": 1.0148789695758382, "grad_norm": 0.46811932472048295, "learning_rate": 9.297292214867484e-06, "loss": 0.0455, "step": 2285 }, { "epoch": 1.0153231179213857, "grad_norm": 0.5871427629727887, "learning_rate": 9.296300901036337e-06, "loss": 0.0572, "step": 2286 }, { "epoch": 1.0157672662669333, "grad_norm": 0.6945899846686857, "learning_rate": 9.295308941398183e-06, "loss": 0.061, "step": 2287 }, { "epoch": 1.0162114146124805, "grad_norm": 0.640279460698154, "learning_rate": 9.294316336102132e-06, "loss": 0.0599, "step": 2288 }, { "epoch": 1.016655562958028, "grad_norm": 0.31136909660088496, "learning_rate": 9.293323085297386e-06, "loss": 0.026, "step": 2289 }, { "epoch": 1.0170997113035753, "grad_norm": 0.4114152449104369, "learning_rate": 9.29232918913325e-06, "loss": 0.0379, "step": 2290 }, { "epoch": 1.0175438596491229, "grad_norm": 0.41732022602175256, "learning_rate": 9.291334647759122e-06, "loss": 0.0422, "step": 2291 }, { "epoch": 1.0179880079946702, "grad_norm": 0.762539369842014, "learning_rate": 9.2903394613245e-06, "loss": 0.0597, "step": 2292 }, { "epoch": 1.0184321563402177, "grad_norm": 0.9415453759336739, "learning_rate": 9.289343629978978e-06, "loss": 0.0739, "step": 2293 }, { "epoch": 1.018876304685765, "grad_norm": 0.5123579743591409, "learning_rate": 9.288347153872245e-06, "loss": 0.0571, "step": 2294 }, { "epoch": 1.0193204530313125, "grad_norm": 0.458862217726016, "learning_rate": 9.287350033154088e-06, "loss": 0.0468, "step": 2295 }, { "epoch": 1.01976460137686, "grad_norm": 1.3873099747217557, "learning_rate": 9.28635226797439e-06, "loss": 0.0745, "step": 2296 }, { "epoch": 1.0202087497224073, "grad_norm": 0.5997981500021724, "learning_rate": 9.285353858483138e-06, "loss": 0.0506, "step": 2297 }, { "epoch": 1.0206528980679548, "grad_norm": 0.6396330710132598, "learning_rate": 9.284354804830403e-06, "loss": 0.0518, "step": 2298 }, { "epoch": 1.021097046413502, "grad_norm": 0.8119329100233045, "learning_rate": 9.283355107166361e-06, "loss": 0.088, "step": 2299 }, { "epoch": 1.0215411947590496, "grad_norm": 0.6001947150070646, "learning_rate": 9.282354765641286e-06, "loss": 0.0503, "step": 2300 }, { "epoch": 1.0219853431045969, "grad_norm": 0.5257172181736586, "learning_rate": 9.281353780405546e-06, "loss": 0.0632, "step": 2301 }, { "epoch": 1.0224294914501444, "grad_norm": 0.4904406999367498, "learning_rate": 9.280352151609604e-06, "loss": 0.0412, "step": 2302 }, { "epoch": 1.0228736397956917, "grad_norm": 0.6023825108536248, "learning_rate": 9.279349879404024e-06, "loss": 0.0501, "step": 2303 }, { "epoch": 1.0233177881412392, "grad_norm": 0.5713607373831051, "learning_rate": 9.278346963939464e-06, "loss": 0.0648, "step": 2304 }, { "epoch": 1.0237619364867865, "grad_norm": 0.5117889653709818, "learning_rate": 9.27734340536668e-06, "loss": 0.0595, "step": 2305 }, { "epoch": 1.024206084832334, "grad_norm": 0.5957895920503053, "learning_rate": 9.27633920383652e-06, "loss": 0.0588, "step": 2306 }, { "epoch": 1.0246502331778815, "grad_norm": 0.6119278145978992, "learning_rate": 9.275334359499936e-06, "loss": 0.0615, "step": 2307 }, { "epoch": 1.0250943815234288, "grad_norm": 0.5747304102651929, "learning_rate": 9.274328872507973e-06, "loss": 0.0609, "step": 2308 }, { "epoch": 1.0255385298689763, "grad_norm": 0.5890004950126706, "learning_rate": 9.273322743011775e-06, "loss": 0.054, "step": 2309 }, { "epoch": 1.0259826782145236, "grad_norm": 0.46856707487309196, "learning_rate": 9.272315971162573e-06, "loss": 0.0438, "step": 2310 }, { "epoch": 1.0264268265600711, "grad_norm": 0.5705380710124494, "learning_rate": 9.27130855711171e-06, "loss": 0.0477, "step": 2311 }, { "epoch": 1.0268709749056184, "grad_norm": 0.7064165326935903, "learning_rate": 9.270300501010612e-06, "loss": 0.0568, "step": 2312 }, { "epoch": 1.027315123251166, "grad_norm": 0.5416989623481593, "learning_rate": 9.26929180301081e-06, "loss": 0.0682, "step": 2313 }, { "epoch": 1.0277592715967132, "grad_norm": 0.49073246688028643, "learning_rate": 9.268282463263928e-06, "loss": 0.0629, "step": 2314 }, { "epoch": 1.0282034199422607, "grad_norm": 0.47098174668190423, "learning_rate": 9.267272481921686e-06, "loss": 0.0492, "step": 2315 }, { "epoch": 1.0286475682878082, "grad_norm": 0.3813955943581627, "learning_rate": 9.266261859135901e-06, "loss": 0.0308, "step": 2316 }, { "epoch": 1.0290917166333555, "grad_norm": 0.6582567910792767, "learning_rate": 9.265250595058486e-06, "loss": 0.0573, "step": 2317 }, { "epoch": 1.029535864978903, "grad_norm": 0.5953836977171739, "learning_rate": 9.264238689841456e-06, "loss": 0.0631, "step": 2318 }, { "epoch": 1.0299800133244503, "grad_norm": 0.6292012725822909, "learning_rate": 9.263226143636912e-06, "loss": 0.0691, "step": 2319 }, { "epoch": 1.0304241616699978, "grad_norm": 0.47225699541890964, "learning_rate": 9.262212956597059e-06, "loss": 0.0563, "step": 2320 }, { "epoch": 1.0308683100155451, "grad_norm": 0.48390119987242786, "learning_rate": 9.261199128874197e-06, "loss": 0.054, "step": 2321 }, { "epoch": 1.0313124583610926, "grad_norm": 0.5985733647635482, "learning_rate": 9.26018466062072e-06, "loss": 0.0582, "step": 2322 }, { "epoch": 1.03175660670664, "grad_norm": 0.5869595699898266, "learning_rate": 9.259169551989121e-06, "loss": 0.0562, "step": 2323 }, { "epoch": 1.0322007550521874, "grad_norm": 0.5529638354373985, "learning_rate": 9.258153803131989e-06, "loss": 0.055, "step": 2324 }, { "epoch": 1.0326449033977347, "grad_norm": 0.39012786867402377, "learning_rate": 9.257137414202006e-06, "loss": 0.0389, "step": 2325 }, { "epoch": 1.0330890517432822, "grad_norm": 0.5320314663484899, "learning_rate": 9.256120385351953e-06, "loss": 0.0593, "step": 2326 }, { "epoch": 1.0335332000888298, "grad_norm": 0.47550720714560507, "learning_rate": 9.255102716734709e-06, "loss": 0.0383, "step": 2327 }, { "epoch": 1.033977348434377, "grad_norm": 0.6123732206520373, "learning_rate": 9.254084408503243e-06, "loss": 0.0733, "step": 2328 }, { "epoch": 1.0344214967799246, "grad_norm": 0.6340880346930579, "learning_rate": 9.253065460810627e-06, "loss": 0.0703, "step": 2329 }, { "epoch": 1.0348656451254719, "grad_norm": 0.4558646564237513, "learning_rate": 9.252045873810026e-06, "loss": 0.0389, "step": 2330 }, { "epoch": 1.0353097934710194, "grad_norm": 0.573613972583976, "learning_rate": 9.251025647654698e-06, "loss": 0.0578, "step": 2331 }, { "epoch": 1.0357539418165667, "grad_norm": 0.45869687127989117, "learning_rate": 9.250004782498006e-06, "loss": 0.0486, "step": 2332 }, { "epoch": 1.0361980901621142, "grad_norm": 0.700597406034261, "learning_rate": 9.248983278493399e-06, "loss": 0.0486, "step": 2333 }, { "epoch": 1.0366422385076615, "grad_norm": 0.5940754007816903, "learning_rate": 9.247961135794428e-06, "loss": 0.0487, "step": 2334 }, { "epoch": 1.037086386853209, "grad_norm": 0.7212210545701735, "learning_rate": 9.246938354554737e-06, "loss": 0.051, "step": 2335 }, { "epoch": 1.0375305351987565, "grad_norm": 0.41449723705228086, "learning_rate": 9.245914934928068e-06, "loss": 0.0441, "step": 2336 }, { "epoch": 1.0379746835443038, "grad_norm": 0.9446091827668867, "learning_rate": 9.24489087706826e-06, "loss": 0.0941, "step": 2337 }, { "epoch": 1.0384188318898513, "grad_norm": 0.43472127189546134, "learning_rate": 9.243866181129246e-06, "loss": 0.0444, "step": 2338 }, { "epoch": 1.0388629802353986, "grad_norm": 0.8234990673910981, "learning_rate": 9.242840847265053e-06, "loss": 0.0449, "step": 2339 }, { "epoch": 1.039307128580946, "grad_norm": 0.5137093258500119, "learning_rate": 9.241814875629806e-06, "loss": 0.047, "step": 2340 }, { "epoch": 1.0397512769264934, "grad_norm": 0.46164431011916773, "learning_rate": 9.24078826637773e-06, "loss": 0.0567, "step": 2341 }, { "epoch": 1.040195425272041, "grad_norm": 0.5917045565013841, "learning_rate": 9.239761019663139e-06, "loss": 0.0529, "step": 2342 }, { "epoch": 1.0406395736175882, "grad_norm": 0.49372320773080497, "learning_rate": 9.238733135640445e-06, "loss": 0.0503, "step": 2343 }, { "epoch": 1.0410837219631357, "grad_norm": 0.47687223129773526, "learning_rate": 9.237704614464157e-06, "loss": 0.0538, "step": 2344 }, { "epoch": 1.0415278703086832, "grad_norm": 0.5267358173749689, "learning_rate": 9.236675456288879e-06, "loss": 0.0533, "step": 2345 }, { "epoch": 1.0419720186542305, "grad_norm": 0.5562866454702318, "learning_rate": 9.235645661269313e-06, "loss": 0.0617, "step": 2346 }, { "epoch": 1.042416166999778, "grad_norm": 0.48761824525349823, "learning_rate": 9.234615229560251e-06, "loss": 0.0646, "step": 2347 }, { "epoch": 1.0428603153453253, "grad_norm": 0.5413755626896984, "learning_rate": 9.233584161316588e-06, "loss": 0.0485, "step": 2348 }, { "epoch": 1.0433044636908728, "grad_norm": 0.48592846139974233, "learning_rate": 9.232552456693308e-06, "loss": 0.0466, "step": 2349 }, { "epoch": 1.04374861203642, "grad_norm": 0.3666523977412306, "learning_rate": 9.231520115845495e-06, "loss": 0.0415, "step": 2350 }, { "epoch": 1.0441927603819676, "grad_norm": 0.5041685160042538, "learning_rate": 9.23048713892833e-06, "loss": 0.0461, "step": 2351 }, { "epoch": 1.044636908727515, "grad_norm": 0.4557622908913518, "learning_rate": 9.229453526097085e-06, "loss": 0.0444, "step": 2352 }, { "epoch": 1.0450810570730624, "grad_norm": 0.7840502309472853, "learning_rate": 9.228419277507126e-06, "loss": 0.0628, "step": 2353 }, { "epoch": 1.0455252054186097, "grad_norm": 0.5291018953826038, "learning_rate": 9.227384393313924e-06, "loss": 0.0496, "step": 2354 }, { "epoch": 1.0459693537641572, "grad_norm": 0.49003402431309595, "learning_rate": 9.226348873673036e-06, "loss": 0.0549, "step": 2355 }, { "epoch": 1.0464135021097047, "grad_norm": 0.5208623353433574, "learning_rate": 9.22531271874012e-06, "loss": 0.0473, "step": 2356 }, { "epoch": 1.046857650455252, "grad_norm": 0.6890800835128176, "learning_rate": 9.224275928670925e-06, "loss": 0.0554, "step": 2357 }, { "epoch": 1.0473017988007995, "grad_norm": 0.6619670076578327, "learning_rate": 9.223238503621302e-06, "loss": 0.0577, "step": 2358 }, { "epoch": 1.0477459471463468, "grad_norm": 0.5128511282981787, "learning_rate": 9.22220044374719e-06, "loss": 0.0406, "step": 2359 }, { "epoch": 1.0481900954918943, "grad_norm": 0.521752374455369, "learning_rate": 9.221161749204629e-06, "loss": 0.05, "step": 2360 }, { "epoch": 1.0486342438374416, "grad_norm": 0.6524832106347492, "learning_rate": 9.220122420149753e-06, "loss": 0.0548, "step": 2361 }, { "epoch": 1.0490783921829892, "grad_norm": 0.49101446033388607, "learning_rate": 9.219082456738788e-06, "loss": 0.0392, "step": 2362 }, { "epoch": 1.0495225405285364, "grad_norm": 0.40291059778432986, "learning_rate": 9.218041859128062e-06, "loss": 0.0403, "step": 2363 }, { "epoch": 1.049966688874084, "grad_norm": 0.52475244536887, "learning_rate": 9.217000627473993e-06, "loss": 0.0518, "step": 2364 }, { "epoch": 1.0504108372196312, "grad_norm": 0.6932318744101922, "learning_rate": 9.215958761933093e-06, "loss": 0.0586, "step": 2365 }, { "epoch": 1.0508549855651788, "grad_norm": 0.4220631830774893, "learning_rate": 9.214916262661977e-06, "loss": 0.045, "step": 2366 }, { "epoch": 1.0512991339107263, "grad_norm": 0.4512747724107325, "learning_rate": 9.213873129817346e-06, "loss": 0.0543, "step": 2367 }, { "epoch": 1.0517432822562736, "grad_norm": 0.4570731167347105, "learning_rate": 9.212829363556003e-06, "loss": 0.0594, "step": 2368 }, { "epoch": 1.052187430601821, "grad_norm": 0.508942846260648, "learning_rate": 9.211784964034842e-06, "loss": 0.0685, "step": 2369 }, { "epoch": 1.0526315789473684, "grad_norm": 0.5865147734769778, "learning_rate": 9.210739931410857e-06, "loss": 0.064, "step": 2370 }, { "epoch": 1.0530757272929159, "grad_norm": 0.6404273704289678, "learning_rate": 9.209694265841132e-06, "loss": 0.0449, "step": 2371 }, { "epoch": 1.0535198756384632, "grad_norm": 0.5024824890374885, "learning_rate": 9.208647967482849e-06, "loss": 0.053, "step": 2372 }, { "epoch": 1.0539640239840107, "grad_norm": 0.5079871293280772, "learning_rate": 9.207601036493284e-06, "loss": 0.0504, "step": 2373 }, { "epoch": 1.054408172329558, "grad_norm": 0.5005289716599005, "learning_rate": 9.206553473029807e-06, "loss": 0.0469, "step": 2374 }, { "epoch": 1.0548523206751055, "grad_norm": 0.6348636511274667, "learning_rate": 9.205505277249888e-06, "loss": 0.0664, "step": 2375 }, { "epoch": 1.055296469020653, "grad_norm": 0.7124285353701557, "learning_rate": 9.204456449311086e-06, "loss": 0.0811, "step": 2376 }, { "epoch": 1.0557406173662003, "grad_norm": 0.7155216157670515, "learning_rate": 9.203406989371058e-06, "loss": 0.068, "step": 2377 }, { "epoch": 1.0561847657117478, "grad_norm": 0.6903207059209383, "learning_rate": 9.202356897587556e-06, "loss": 0.0625, "step": 2378 }, { "epoch": 1.056628914057295, "grad_norm": 0.4483522509553745, "learning_rate": 9.201306174118428e-06, "loss": 0.0461, "step": 2379 }, { "epoch": 1.0570730624028426, "grad_norm": 0.6352875132111704, "learning_rate": 9.200254819121612e-06, "loss": 0.0838, "step": 2380 }, { "epoch": 1.05751721074839, "grad_norm": 0.600462738349843, "learning_rate": 9.19920283275515e-06, "loss": 0.0498, "step": 2381 }, { "epoch": 1.0579613590939374, "grad_norm": 0.513630635347009, "learning_rate": 9.198150215177168e-06, "loss": 0.0557, "step": 2382 }, { "epoch": 1.0584055074394847, "grad_norm": 0.674636182160706, "learning_rate": 9.197096966545896e-06, "loss": 0.0608, "step": 2383 }, { "epoch": 1.0588496557850322, "grad_norm": 0.7211986590126924, "learning_rate": 9.196043087019651e-06, "loss": 0.0675, "step": 2384 }, { "epoch": 1.0592938041305797, "grad_norm": 0.48469937945694536, "learning_rate": 9.194988576756855e-06, "loss": 0.0533, "step": 2385 }, { "epoch": 1.059737952476127, "grad_norm": 0.6223548159808464, "learning_rate": 9.193933435916013e-06, "loss": 0.0609, "step": 2386 }, { "epoch": 1.0601821008216745, "grad_norm": 0.4702625375198161, "learning_rate": 9.192877664655736e-06, "loss": 0.0629, "step": 2387 }, { "epoch": 1.0606262491672218, "grad_norm": 0.6156516266579465, "learning_rate": 9.191821263134718e-06, "loss": 0.0519, "step": 2388 }, { "epoch": 1.0610703975127693, "grad_norm": 0.41634294802260347, "learning_rate": 9.19076423151176e-06, "loss": 0.0339, "step": 2389 }, { "epoch": 1.0615145458583166, "grad_norm": 0.5676744907352943, "learning_rate": 9.189706569945749e-06, "loss": 0.0672, "step": 2390 }, { "epoch": 1.0619586942038641, "grad_norm": 0.5506701658376884, "learning_rate": 9.188648278595669e-06, "loss": 0.0549, "step": 2391 }, { "epoch": 1.0624028425494114, "grad_norm": 0.5548831925430109, "learning_rate": 9.187589357620602e-06, "loss": 0.0495, "step": 2392 }, { "epoch": 1.062846990894959, "grad_norm": 0.5733351694159566, "learning_rate": 9.186529807179715e-06, "loss": 0.0413, "step": 2393 }, { "epoch": 1.0632911392405062, "grad_norm": 0.4263224419709196, "learning_rate": 9.185469627432287e-06, "loss": 0.0376, "step": 2394 }, { "epoch": 1.0637352875860537, "grad_norm": 0.5107765087622207, "learning_rate": 9.184408818537673e-06, "loss": 0.0441, "step": 2395 }, { "epoch": 1.0641794359316012, "grad_norm": 0.4468370290354222, "learning_rate": 9.183347380655332e-06, "loss": 0.0393, "step": 2396 }, { "epoch": 1.0646235842771485, "grad_norm": 0.4890359715306359, "learning_rate": 9.182285313944818e-06, "loss": 0.0481, "step": 2397 }, { "epoch": 1.065067732622696, "grad_norm": 0.45195434165164133, "learning_rate": 9.181222618565777e-06, "loss": 0.0509, "step": 2398 }, { "epoch": 1.0655118809682433, "grad_norm": 0.49289142398585184, "learning_rate": 9.180159294677948e-06, "loss": 0.0515, "step": 2399 }, { "epoch": 1.0659560293137909, "grad_norm": 0.42385971921718624, "learning_rate": 9.179095342441171e-06, "loss": 0.0448, "step": 2400 }, { "epoch": 1.0664001776593381, "grad_norm": 0.6172728890244491, "learning_rate": 9.178030762015372e-06, "loss": 0.0401, "step": 2401 }, { "epoch": 1.0668443260048857, "grad_norm": 0.5217625747110314, "learning_rate": 9.176965553560578e-06, "loss": 0.0578, "step": 2402 }, { "epoch": 1.067288474350433, "grad_norm": 0.4881611280755445, "learning_rate": 9.175899717236907e-06, "loss": 0.0437, "step": 2403 }, { "epoch": 1.0677326226959805, "grad_norm": 0.5243251412540129, "learning_rate": 9.174833253204571e-06, "loss": 0.0408, "step": 2404 }, { "epoch": 1.068176771041528, "grad_norm": 0.4299956266707952, "learning_rate": 9.17376616162388e-06, "loss": 0.0416, "step": 2405 }, { "epoch": 1.0686209193870753, "grad_norm": 0.5888767684513153, "learning_rate": 9.172698442655236e-06, "loss": 0.064, "step": 2406 }, { "epoch": 1.0690650677326228, "grad_norm": 0.4022150666774181, "learning_rate": 9.171630096459134e-06, "loss": 0.0416, "step": 2407 }, { "epoch": 1.06950921607817, "grad_norm": 0.9093917263815293, "learning_rate": 9.170561123196165e-06, "loss": 0.0573, "step": 2408 }, { "epoch": 1.0699533644237176, "grad_norm": 0.5641484333920734, "learning_rate": 9.169491523027012e-06, "loss": 0.0527, "step": 2409 }, { "epoch": 1.0703975127692649, "grad_norm": 0.5614858136344199, "learning_rate": 9.168421296112457e-06, "loss": 0.0525, "step": 2410 }, { "epoch": 1.0708416611148124, "grad_norm": 0.5361742414415347, "learning_rate": 9.167350442613371e-06, "loss": 0.0512, "step": 2411 }, { "epoch": 1.0712858094603597, "grad_norm": 0.4065744899599026, "learning_rate": 9.166278962690724e-06, "loss": 0.0352, "step": 2412 }, { "epoch": 1.0717299578059072, "grad_norm": 0.7190511616030503, "learning_rate": 9.165206856505577e-06, "loss": 0.0711, "step": 2413 }, { "epoch": 1.0721741061514547, "grad_norm": 0.5266458384013354, "learning_rate": 9.164134124219085e-06, "loss": 0.0436, "step": 2414 }, { "epoch": 1.072618254497002, "grad_norm": 0.5599669781296026, "learning_rate": 9.163060765992495e-06, "loss": 0.0759, "step": 2415 }, { "epoch": 1.0730624028425495, "grad_norm": 0.5128109774041522, "learning_rate": 9.161986781987156e-06, "loss": 0.0455, "step": 2416 }, { "epoch": 1.0735065511880968, "grad_norm": 0.4078193083238796, "learning_rate": 9.160912172364503e-06, "loss": 0.0403, "step": 2417 }, { "epoch": 1.0739506995336443, "grad_norm": 0.6961390181093496, "learning_rate": 9.15983693728607e-06, "loss": 0.0757, "step": 2418 }, { "epoch": 1.0743948478791916, "grad_norm": 0.518134789703249, "learning_rate": 9.158761076913481e-06, "loss": 0.0458, "step": 2419 }, { "epoch": 1.074838996224739, "grad_norm": 0.6466366059180623, "learning_rate": 9.157684591408458e-06, "loss": 0.0639, "step": 2420 }, { "epoch": 1.0752831445702864, "grad_norm": 0.45726102190092227, "learning_rate": 9.156607480932813e-06, "loss": 0.0485, "step": 2421 }, { "epoch": 1.075727292915834, "grad_norm": 0.47111677703640187, "learning_rate": 9.155529745648457e-06, "loss": 0.0576, "step": 2422 }, { "epoch": 1.0761714412613812, "grad_norm": 0.4424777927187194, "learning_rate": 9.154451385717387e-06, "loss": 0.0415, "step": 2423 }, { "epoch": 1.0766155896069287, "grad_norm": 0.5637865585741092, "learning_rate": 9.153372401301706e-06, "loss": 0.0569, "step": 2424 }, { "epoch": 1.0770597379524762, "grad_norm": 0.6193090479661898, "learning_rate": 9.152292792563596e-06, "loss": 0.0541, "step": 2425 }, { "epoch": 1.0775038862980235, "grad_norm": 0.5604767145835976, "learning_rate": 9.151212559665345e-06, "loss": 0.0487, "step": 2426 }, { "epoch": 1.077948034643571, "grad_norm": 0.614420431496054, "learning_rate": 9.150131702769332e-06, "loss": 0.0543, "step": 2427 }, { "epoch": 1.0783921829891183, "grad_norm": 0.500155979890897, "learning_rate": 9.149050222038024e-06, "loss": 0.0543, "step": 2428 }, { "epoch": 1.0788363313346658, "grad_norm": 0.6046519445448296, "learning_rate": 9.147968117633988e-06, "loss": 0.0669, "step": 2429 }, { "epoch": 1.0792804796802131, "grad_norm": 0.5730375661279963, "learning_rate": 9.14688538971988e-06, "loss": 0.0495, "step": 2430 }, { "epoch": 1.0797246280257606, "grad_norm": 0.482430110901872, "learning_rate": 9.145802038458457e-06, "loss": 0.0484, "step": 2431 }, { "epoch": 1.080168776371308, "grad_norm": 0.49413095926471534, "learning_rate": 9.144718064012562e-06, "loss": 0.0482, "step": 2432 }, { "epoch": 1.0806129247168554, "grad_norm": 0.6274129207366034, "learning_rate": 9.143633466545136e-06, "loss": 0.0609, "step": 2433 }, { "epoch": 1.0810570730624027, "grad_norm": 0.4754419676708797, "learning_rate": 9.142548246219212e-06, "loss": 0.0539, "step": 2434 }, { "epoch": 1.0815012214079502, "grad_norm": 0.5129065641839461, "learning_rate": 9.141462403197917e-06, "loss": 0.0502, "step": 2435 }, { "epoch": 1.0819453697534978, "grad_norm": 0.5492083841021022, "learning_rate": 9.14037593764447e-06, "loss": 0.0547, "step": 2436 }, { "epoch": 1.082389518099045, "grad_norm": 0.5638343322048809, "learning_rate": 9.139288849722188e-06, "loss": 0.0552, "step": 2437 }, { "epoch": 1.0828336664445926, "grad_norm": 0.48234420470469497, "learning_rate": 9.138201139594478e-06, "loss": 0.0509, "step": 2438 }, { "epoch": 1.0832778147901398, "grad_norm": 0.48233545504362385, "learning_rate": 9.137112807424842e-06, "loss": 0.0618, "step": 2439 }, { "epoch": 1.0837219631356874, "grad_norm": 0.6597245369594125, "learning_rate": 9.136023853376872e-06, "loss": 0.0505, "step": 2440 }, { "epoch": 1.0841661114812347, "grad_norm": 0.8640144400788339, "learning_rate": 9.134934277614258e-06, "loss": 0.0552, "step": 2441 }, { "epoch": 1.0846102598267822, "grad_norm": 0.5151117640182895, "learning_rate": 9.133844080300783e-06, "loss": 0.0552, "step": 2442 }, { "epoch": 1.0850544081723295, "grad_norm": 0.5553941913181785, "learning_rate": 9.13275326160032e-06, "loss": 0.068, "step": 2443 }, { "epoch": 1.085498556517877, "grad_norm": 1.2548272109304157, "learning_rate": 9.131661821676839e-06, "loss": 0.0615, "step": 2444 }, { "epoch": 1.0859427048634245, "grad_norm": 0.7349991566989792, "learning_rate": 9.130569760694402e-06, "loss": 0.0592, "step": 2445 }, { "epoch": 1.0863868532089718, "grad_norm": 0.4235071999153545, "learning_rate": 9.129477078817165e-06, "loss": 0.0399, "step": 2446 }, { "epoch": 1.0868310015545193, "grad_norm": 0.5132669145700011, "learning_rate": 9.128383776209372e-06, "loss": 0.051, "step": 2447 }, { "epoch": 1.0872751499000666, "grad_norm": 1.6678578417052072, "learning_rate": 9.127289853035371e-06, "loss": 0.068, "step": 2448 }, { "epoch": 1.087719298245614, "grad_norm": 0.3444388637101336, "learning_rate": 9.126195309459593e-06, "loss": 0.0326, "step": 2449 }, { "epoch": 1.0881634465911614, "grad_norm": 0.38355662180503935, "learning_rate": 9.12510014564657e-06, "loss": 0.0552, "step": 2450 }, { "epoch": 1.0886075949367089, "grad_norm": 0.5490923753617978, "learning_rate": 9.124004361760921e-06, "loss": 0.0598, "step": 2451 }, { "epoch": 1.0890517432822562, "grad_norm": 0.49443123554529045, "learning_rate": 9.122907957967363e-06, "loss": 0.0545, "step": 2452 }, { "epoch": 1.0894958916278037, "grad_norm": 0.47265541042225623, "learning_rate": 9.121810934430702e-06, "loss": 0.0523, "step": 2453 }, { "epoch": 1.0899400399733512, "grad_norm": 0.6134212234858121, "learning_rate": 9.12071329131584e-06, "loss": 0.0669, "step": 2454 }, { "epoch": 1.0903841883188985, "grad_norm": 0.5102377020582352, "learning_rate": 9.119615028787771e-06, "loss": 0.0488, "step": 2455 }, { "epoch": 1.090828336664446, "grad_norm": 0.5477789256953114, "learning_rate": 9.118516147011585e-06, "loss": 0.0609, "step": 2456 }, { "epoch": 1.0912724850099933, "grad_norm": 0.5014262210362811, "learning_rate": 9.117416646152459e-06, "loss": 0.0455, "step": 2457 }, { "epoch": 1.0917166333555408, "grad_norm": 0.5068151485473731, "learning_rate": 9.11631652637567e-06, "loss": 0.0558, "step": 2458 }, { "epoch": 1.092160781701088, "grad_norm": 0.5967678649273153, "learning_rate": 9.115215787846583e-06, "loss": 0.0576, "step": 2459 }, { "epoch": 1.0926049300466356, "grad_norm": 0.5382239114996434, "learning_rate": 9.114114430730656e-06, "loss": 0.0568, "step": 2460 }, { "epoch": 1.093049078392183, "grad_norm": 0.46675820455299727, "learning_rate": 9.113012455193444e-06, "loss": 0.0437, "step": 2461 }, { "epoch": 1.0934932267377304, "grad_norm": 0.6129912443014283, "learning_rate": 9.111909861400594e-06, "loss": 0.0584, "step": 2462 }, { "epoch": 1.0939373750832777, "grad_norm": 0.49745724599799956, "learning_rate": 9.110806649517841e-06, "loss": 0.0541, "step": 2463 }, { "epoch": 1.0943815234288252, "grad_norm": 0.5223020387615885, "learning_rate": 9.109702819711018e-06, "loss": 0.0579, "step": 2464 }, { "epoch": 1.0948256717743727, "grad_norm": 0.522897777305782, "learning_rate": 9.108598372146052e-06, "loss": 0.065, "step": 2465 }, { "epoch": 1.09526982011992, "grad_norm": 0.4978718424633595, "learning_rate": 9.107493306988955e-06, "loss": 0.057, "step": 2466 }, { "epoch": 1.0957139684654675, "grad_norm": 0.41889427668506446, "learning_rate": 9.10638762440584e-06, "loss": 0.0546, "step": 2467 }, { "epoch": 1.0961581168110148, "grad_norm": 0.4889531194168305, "learning_rate": 9.10528132456291e-06, "loss": 0.0475, "step": 2468 }, { "epoch": 1.0966022651565623, "grad_norm": 0.49561727802236016, "learning_rate": 9.10417440762646e-06, "loss": 0.0624, "step": 2469 }, { "epoch": 1.0970464135021096, "grad_norm": 0.5035282177847037, "learning_rate": 9.10306687376288e-06, "loss": 0.0613, "step": 2470 }, { "epoch": 1.0974905618476571, "grad_norm": 0.44693886713635184, "learning_rate": 9.101958723138651e-06, "loss": 0.0505, "step": 2471 }, { "epoch": 1.0979347101932044, "grad_norm": 0.6676580453833789, "learning_rate": 9.100849955920344e-06, "loss": 0.0637, "step": 2472 }, { "epoch": 1.098378858538752, "grad_norm": 0.47501330783440915, "learning_rate": 9.099740572274627e-06, "loss": 0.0391, "step": 2473 }, { "epoch": 1.0988230068842995, "grad_norm": 0.6923283723141113, "learning_rate": 9.098630572368262e-06, "loss": 0.0518, "step": 2474 }, { "epoch": 1.0992671552298467, "grad_norm": 0.47645749570109275, "learning_rate": 9.097519956368096e-06, "loss": 0.0273, "step": 2475 }, { "epoch": 1.0997113035753943, "grad_norm": 0.8125207380279094, "learning_rate": 9.096408724441078e-06, "loss": 0.062, "step": 2476 }, { "epoch": 1.1001554519209416, "grad_norm": 0.6208955890687268, "learning_rate": 9.09529687675424e-06, "loss": 0.0399, "step": 2477 }, { "epoch": 1.100599600266489, "grad_norm": 0.6086645282708443, "learning_rate": 9.094184413474716e-06, "loss": 0.0538, "step": 2478 }, { "epoch": 1.1010437486120364, "grad_norm": 0.5638622958089239, "learning_rate": 9.093071334769727e-06, "loss": 0.087, "step": 2479 }, { "epoch": 1.1014878969575839, "grad_norm": 0.5860122134432946, "learning_rate": 9.091957640806585e-06, "loss": 0.0514, "step": 2480 }, { "epoch": 1.1019320453031312, "grad_norm": 0.5483193797542968, "learning_rate": 9.090843331752704e-06, "loss": 0.0837, "step": 2481 }, { "epoch": 1.1023761936486787, "grad_norm": 0.671106831086069, "learning_rate": 9.089728407775576e-06, "loss": 0.0533, "step": 2482 }, { "epoch": 1.1028203419942262, "grad_norm": 0.471025449998095, "learning_rate": 9.088612869042794e-06, "loss": 0.0694, "step": 2483 }, { "epoch": 1.1032644903397735, "grad_norm": 0.46180112116306377, "learning_rate": 9.087496715722049e-06, "loss": 0.0426, "step": 2484 }, { "epoch": 1.103708638685321, "grad_norm": 0.49860403863116065, "learning_rate": 9.08637994798111e-06, "loss": 0.0583, "step": 2485 }, { "epoch": 1.1041527870308683, "grad_norm": 0.5897260124865925, "learning_rate": 9.08526256598785e-06, "loss": 0.0648, "step": 2486 }, { "epoch": 1.1045969353764158, "grad_norm": 0.453592540271068, "learning_rate": 9.084144569910229e-06, "loss": 0.0455, "step": 2487 }, { "epoch": 1.105041083721963, "grad_norm": 0.6113711566277047, "learning_rate": 9.083025959916302e-06, "loss": 0.0556, "step": 2488 }, { "epoch": 1.1054852320675106, "grad_norm": 0.4026753684401193, "learning_rate": 9.081906736174217e-06, "loss": 0.0347, "step": 2489 }, { "epoch": 1.1059293804130579, "grad_norm": 0.7535472922225546, "learning_rate": 9.080786898852207e-06, "loss": 0.0561, "step": 2490 }, { "epoch": 1.1063735287586054, "grad_norm": 0.7712189521196268, "learning_rate": 9.079666448118607e-06, "loss": 0.0815, "step": 2491 }, { "epoch": 1.1068176771041527, "grad_norm": 0.549393105917364, "learning_rate": 9.07854538414184e-06, "loss": 0.0411, "step": 2492 }, { "epoch": 1.1072618254497002, "grad_norm": 0.8270414561268318, "learning_rate": 9.077423707090418e-06, "loss": 0.0697, "step": 2493 }, { "epoch": 1.1077059737952477, "grad_norm": 0.4463853554583335, "learning_rate": 9.07630141713295e-06, "loss": 0.0426, "step": 2494 }, { "epoch": 1.108150122140795, "grad_norm": 0.6269526052059413, "learning_rate": 9.075178514438133e-06, "loss": 0.059, "step": 2495 }, { "epoch": 1.1085942704863425, "grad_norm": 0.7332597224037389, "learning_rate": 9.074054999174762e-06, "loss": 0.0647, "step": 2496 }, { "epoch": 1.1090384188318898, "grad_norm": 0.6649774981712752, "learning_rate": 9.072930871511718e-06, "loss": 0.0751, "step": 2497 }, { "epoch": 1.1094825671774373, "grad_norm": 0.8146175851807935, "learning_rate": 9.071806131617976e-06, "loss": 0.0591, "step": 2498 }, { "epoch": 1.1099267155229846, "grad_norm": 0.6708407619412956, "learning_rate": 9.070680779662606e-06, "loss": 0.0552, "step": 2499 }, { "epoch": 1.1103708638685321, "grad_norm": 0.6510598528132346, "learning_rate": 9.069554815814765e-06, "loss": 0.0547, "step": 2500 }, { "epoch": 1.1108150122140794, "grad_norm": 0.7337961607233396, "learning_rate": 9.068428240243705e-06, "loss": 0.08, "step": 2501 }, { "epoch": 1.111259160559627, "grad_norm": 0.9471208974140737, "learning_rate": 9.067301053118773e-06, "loss": 0.0575, "step": 2502 }, { "epoch": 1.1117033089051742, "grad_norm": 0.6247228777438872, "learning_rate": 9.066173254609399e-06, "loss": 0.0593, "step": 2503 }, { "epoch": 1.1121474572507217, "grad_norm": 0.4119422798089505, "learning_rate": 9.065044844885111e-06, "loss": 0.0369, "step": 2504 }, { "epoch": 1.1125916055962692, "grad_norm": 0.5707033305872198, "learning_rate": 9.063915824115531e-06, "loss": 0.0586, "step": 2505 }, { "epoch": 1.1130357539418165, "grad_norm": 0.5864819346852046, "learning_rate": 9.062786192470372e-06, "loss": 0.0541, "step": 2506 }, { "epoch": 1.113479902287364, "grad_norm": 0.4941216006043351, "learning_rate": 9.06165595011943e-06, "loss": 0.0467, "step": 2507 }, { "epoch": 1.1139240506329113, "grad_norm": 0.7179600112901418, "learning_rate": 9.060525097232603e-06, "loss": 0.061, "step": 2508 }, { "epoch": 1.1143681989784588, "grad_norm": 0.536437166711899, "learning_rate": 9.059393633979881e-06, "loss": 0.0514, "step": 2509 }, { "epoch": 1.1148123473240061, "grad_norm": 0.5200228688873401, "learning_rate": 9.058261560531337e-06, "loss": 0.0489, "step": 2510 }, { "epoch": 1.1152564956695536, "grad_norm": 0.8639507676709893, "learning_rate": 9.057128877057141e-06, "loss": 0.0777, "step": 2511 }, { "epoch": 1.1157006440151012, "grad_norm": 0.7459642047335504, "learning_rate": 9.055995583727559e-06, "loss": 0.0462, "step": 2512 }, { "epoch": 1.1161447923606485, "grad_norm": 0.43557336969374605, "learning_rate": 9.05486168071294e-06, "loss": 0.0411, "step": 2513 }, { "epoch": 1.116588940706196, "grad_norm": 0.4693246881167572, "learning_rate": 9.05372716818373e-06, "loss": 0.0317, "step": 2514 }, { "epoch": 1.1170330890517433, "grad_norm": 0.5801314945806954, "learning_rate": 9.052592046310466e-06, "loss": 0.0504, "step": 2515 }, { "epoch": 1.1174772373972908, "grad_norm": 0.5303403405055431, "learning_rate": 9.051456315263775e-06, "loss": 0.0417, "step": 2516 }, { "epoch": 1.117921385742838, "grad_norm": 0.4682906045403869, "learning_rate": 9.05031997521438e-06, "loss": 0.0441, "step": 2517 }, { "epoch": 1.1183655340883856, "grad_norm": 0.5610438227567353, "learning_rate": 9.049183026333089e-06, "loss": 0.054, "step": 2518 }, { "epoch": 1.1188096824339329, "grad_norm": 0.5309141601127093, "learning_rate": 9.048045468790805e-06, "loss": 0.0599, "step": 2519 }, { "epoch": 1.1192538307794804, "grad_norm": 0.4715253267557383, "learning_rate": 9.04690730275852e-06, "loss": 0.0501, "step": 2520 }, { "epoch": 1.1196979791250277, "grad_norm": 0.42642161615167634, "learning_rate": 9.045768528407326e-06, "loss": 0.0352, "step": 2521 }, { "epoch": 1.1201421274705752, "grad_norm": 0.8452679234650169, "learning_rate": 9.044629145908397e-06, "loss": 0.0792, "step": 2522 }, { "epoch": 1.1205862758161227, "grad_norm": 0.5024078744040577, "learning_rate": 9.043489155433e-06, "loss": 0.0578, "step": 2523 }, { "epoch": 1.12103042416167, "grad_norm": 0.6039439853816904, "learning_rate": 9.042348557152495e-06, "loss": 0.0591, "step": 2524 }, { "epoch": 1.1214745725072175, "grad_norm": 0.5200721258931258, "learning_rate": 9.041207351238336e-06, "loss": 0.0589, "step": 2525 }, { "epoch": 1.1219187208527648, "grad_norm": 0.6201687763530818, "learning_rate": 9.040065537862063e-06, "loss": 0.0564, "step": 2526 }, { "epoch": 1.1223628691983123, "grad_norm": 0.4816887693066457, "learning_rate": 9.038923117195313e-06, "loss": 0.047, "step": 2527 }, { "epoch": 1.1228070175438596, "grad_norm": 0.4930745832447622, "learning_rate": 9.037780089409807e-06, "loss": 0.0437, "step": 2528 }, { "epoch": 1.123251165889407, "grad_norm": 0.4493292408106485, "learning_rate": 9.036636454677363e-06, "loss": 0.0587, "step": 2529 }, { "epoch": 1.1236953142349544, "grad_norm": 0.5080635531655728, "learning_rate": 9.035492213169892e-06, "loss": 0.0536, "step": 2530 }, { "epoch": 1.124139462580502, "grad_norm": 0.3142735249050316, "learning_rate": 9.034347365059389e-06, "loss": 0.0324, "step": 2531 }, { "epoch": 1.1245836109260492, "grad_norm": 0.4572661512872438, "learning_rate": 9.033201910517944e-06, "loss": 0.0484, "step": 2532 }, { "epoch": 1.1250277592715967, "grad_norm": 0.5858908999181529, "learning_rate": 9.032055849717743e-06, "loss": 0.041, "step": 2533 }, { "epoch": 1.1254719076171442, "grad_norm": 0.5641689055994421, "learning_rate": 9.030909182831052e-06, "loss": 0.062, "step": 2534 }, { "epoch": 1.1259160559626915, "grad_norm": 0.48230195194298414, "learning_rate": 9.02976191003024e-06, "loss": 0.055, "step": 2535 }, { "epoch": 1.126360204308239, "grad_norm": 0.45378491026345646, "learning_rate": 9.028614031487757e-06, "loss": 0.0485, "step": 2536 }, { "epoch": 1.1268043526537863, "grad_norm": 0.5351056809427763, "learning_rate": 9.027465547376154e-06, "loss": 0.0554, "step": 2537 }, { "epoch": 1.1272485009993338, "grad_norm": 0.44870767793480254, "learning_rate": 9.02631645786806e-06, "loss": 0.0443, "step": 2538 }, { "epoch": 1.1276926493448811, "grad_norm": 1.1371656771905172, "learning_rate": 9.02516676313621e-06, "loss": 0.0572, "step": 2539 }, { "epoch": 1.1281367976904286, "grad_norm": 0.5747790388905519, "learning_rate": 9.02401646335342e-06, "loss": 0.0662, "step": 2540 }, { "epoch": 1.1285809460359761, "grad_norm": 0.5477714109841172, "learning_rate": 9.022865558692599e-06, "loss": 0.0629, "step": 2541 }, { "epoch": 1.1290250943815234, "grad_norm": 0.7201777798009189, "learning_rate": 9.021714049326749e-06, "loss": 0.0642, "step": 2542 }, { "epoch": 1.1294692427270707, "grad_norm": 0.5084161768229669, "learning_rate": 9.02056193542896e-06, "loss": 0.0413, "step": 2543 }, { "epoch": 1.1299133910726182, "grad_norm": 0.5818759215205692, "learning_rate": 9.019409217172414e-06, "loss": 0.0458, "step": 2544 }, { "epoch": 1.1303575394181657, "grad_norm": 0.4060737061879425, "learning_rate": 9.018255894730384e-06, "loss": 0.0417, "step": 2545 }, { "epoch": 1.130801687763713, "grad_norm": 0.37654168507528063, "learning_rate": 9.017101968276237e-06, "loss": 0.0356, "step": 2546 }, { "epoch": 1.1312458361092605, "grad_norm": 0.5269591862707016, "learning_rate": 9.015947437983423e-06, "loss": 0.0502, "step": 2547 }, { "epoch": 1.1316899844548078, "grad_norm": 0.6087429306966435, "learning_rate": 9.014792304025492e-06, "loss": 0.0549, "step": 2548 }, { "epoch": 1.1321341328003554, "grad_norm": 0.47161508183894374, "learning_rate": 9.013636566576078e-06, "loss": 0.046, "step": 2549 }, { "epoch": 1.1325782811459026, "grad_norm": 0.5566854251234638, "learning_rate": 9.012480225808908e-06, "loss": 0.0582, "step": 2550 }, { "epoch": 1.1330224294914502, "grad_norm": 0.7226965671519505, "learning_rate": 9.0113232818978e-06, "loss": 0.0535, "step": 2551 }, { "epoch": 1.1334665778369977, "grad_norm": 0.528548310886907, "learning_rate": 9.010165735016663e-06, "loss": 0.0471, "step": 2552 }, { "epoch": 1.133910726182545, "grad_norm": 0.6224718792548548, "learning_rate": 9.009007585339493e-06, "loss": 0.0411, "step": 2553 }, { "epoch": 1.1343548745280925, "grad_norm": 0.46637655730340244, "learning_rate": 9.007848833040385e-06, "loss": 0.0382, "step": 2554 }, { "epoch": 1.1347990228736398, "grad_norm": 0.6524984653497812, "learning_rate": 9.006689478293513e-06, "loss": 0.0649, "step": 2555 }, { "epoch": 1.1352431712191873, "grad_norm": 0.42573749158130464, "learning_rate": 9.005529521273152e-06, "loss": 0.0333, "step": 2556 }, { "epoch": 1.1356873195647346, "grad_norm": 0.4720058401308709, "learning_rate": 9.004368962153662e-06, "loss": 0.0454, "step": 2557 }, { "epoch": 1.136131467910282, "grad_norm": 0.4623215710338066, "learning_rate": 9.003207801109495e-06, "loss": 0.0413, "step": 2558 }, { "epoch": 1.1365756162558294, "grad_norm": 0.6101080407770559, "learning_rate": 9.002046038315192e-06, "loss": 0.0534, "step": 2559 }, { "epoch": 1.1370197646013769, "grad_norm": 0.5040799125891366, "learning_rate": 9.000883673945387e-06, "loss": 0.0392, "step": 2560 }, { "epoch": 1.1374639129469242, "grad_norm": 0.6363733147612941, "learning_rate": 8.999720708174802e-06, "loss": 0.0628, "step": 2561 }, { "epoch": 1.1379080612924717, "grad_norm": 0.5620665765918448, "learning_rate": 8.998557141178252e-06, "loss": 0.0524, "step": 2562 }, { "epoch": 1.1383522096380192, "grad_norm": 0.48743987280774526, "learning_rate": 8.99739297313064e-06, "loss": 0.0502, "step": 2563 }, { "epoch": 1.1387963579835665, "grad_norm": 0.4714981818194369, "learning_rate": 8.99622820420696e-06, "loss": 0.0477, "step": 2564 }, { "epoch": 1.139240506329114, "grad_norm": 0.466559878027, "learning_rate": 8.995062834582297e-06, "loss": 0.0585, "step": 2565 }, { "epoch": 1.1396846546746613, "grad_norm": 0.5806057329454937, "learning_rate": 8.993896864431825e-06, "loss": 0.0653, "step": 2566 }, { "epoch": 1.1401288030202088, "grad_norm": 0.565404505122367, "learning_rate": 8.992730293930812e-06, "loss": 0.0375, "step": 2567 }, { "epoch": 1.140572951365756, "grad_norm": 0.49869252300833783, "learning_rate": 8.99156312325461e-06, "loss": 0.0412, "step": 2568 }, { "epoch": 1.1410170997113036, "grad_norm": 0.4533506243280891, "learning_rate": 8.990395352578665e-06, "loss": 0.0423, "step": 2569 }, { "epoch": 1.141461248056851, "grad_norm": 0.5186554476274421, "learning_rate": 8.989226982078513e-06, "loss": 0.0486, "step": 2570 }, { "epoch": 1.1419053964023984, "grad_norm": 0.5971608007231468, "learning_rate": 8.988058011929781e-06, "loss": 0.063, "step": 2571 }, { "epoch": 1.1423495447479457, "grad_norm": 0.6848430211876612, "learning_rate": 8.986888442308187e-06, "loss": 0.0815, "step": 2572 }, { "epoch": 1.1427936930934932, "grad_norm": 0.5213622343694703, "learning_rate": 8.985718273389532e-06, "loss": 0.0426, "step": 2573 }, { "epoch": 1.1432378414390407, "grad_norm": 0.4678564391636386, "learning_rate": 8.984547505349714e-06, "loss": 0.0461, "step": 2574 }, { "epoch": 1.143681989784588, "grad_norm": 0.6562307201524248, "learning_rate": 8.983376138364723e-06, "loss": 0.0666, "step": 2575 }, { "epoch": 1.1441261381301355, "grad_norm": 0.7770057155172067, "learning_rate": 8.982204172610632e-06, "loss": 0.0761, "step": 2576 }, { "epoch": 1.1445702864756828, "grad_norm": 0.5346372112341888, "learning_rate": 8.981031608263608e-06, "loss": 0.0742, "step": 2577 }, { "epoch": 1.1450144348212303, "grad_norm": 0.549185701186776, "learning_rate": 8.979858445499908e-06, "loss": 0.0583, "step": 2578 }, { "epoch": 1.1454585831667776, "grad_norm": 0.44014613661158175, "learning_rate": 8.978684684495875e-06, "loss": 0.0422, "step": 2579 }, { "epoch": 1.1459027315123251, "grad_norm": 0.43518841056839336, "learning_rate": 8.97751032542795e-06, "loss": 0.0513, "step": 2580 }, { "epoch": 1.1463468798578726, "grad_norm": 0.3840746634100509, "learning_rate": 8.976335368472657e-06, "loss": 0.0295, "step": 2581 }, { "epoch": 1.14679102820342, "grad_norm": 0.7385893846304249, "learning_rate": 8.97515981380661e-06, "loss": 0.0794, "step": 2582 }, { "epoch": 1.1472351765489675, "grad_norm": 0.38857897583910317, "learning_rate": 8.97398366160652e-06, "loss": 0.0361, "step": 2583 }, { "epoch": 1.1476793248945147, "grad_norm": 0.6486029298679971, "learning_rate": 8.972806912049178e-06, "loss": 0.0732, "step": 2584 }, { "epoch": 1.1481234732400623, "grad_norm": 0.4395809321084259, "learning_rate": 8.971629565311471e-06, "loss": 0.0426, "step": 2585 }, { "epoch": 1.1485676215856095, "grad_norm": 0.42320652941064185, "learning_rate": 8.970451621570376e-06, "loss": 0.0476, "step": 2586 }, { "epoch": 1.149011769931157, "grad_norm": 0.3926981271234761, "learning_rate": 8.969273081002954e-06, "loss": 0.0408, "step": 2587 }, { "epoch": 1.1494559182767043, "grad_norm": 0.6068945137716437, "learning_rate": 8.96809394378636e-06, "loss": 0.0448, "step": 2588 }, { "epoch": 1.1499000666222519, "grad_norm": 0.46200186631768236, "learning_rate": 8.966914210097843e-06, "loss": 0.0587, "step": 2589 }, { "epoch": 1.1503442149677992, "grad_norm": 0.8019976203155184, "learning_rate": 8.965733880114734e-06, "loss": 0.0768, "step": 2590 }, { "epoch": 1.1507883633133467, "grad_norm": 0.4272911771923715, "learning_rate": 8.964552954014455e-06, "loss": 0.0555, "step": 2591 }, { "epoch": 1.1512325116588942, "grad_norm": 0.46591985962506655, "learning_rate": 8.963371431974521e-06, "loss": 0.0491, "step": 2592 }, { "epoch": 1.1516766600044415, "grad_norm": 0.34427825623736114, "learning_rate": 8.962189314172537e-06, "loss": 0.0291, "step": 2593 }, { "epoch": 1.152120808349989, "grad_norm": 0.3941777289275942, "learning_rate": 8.961006600786191e-06, "loss": 0.0386, "step": 2594 }, { "epoch": 1.1525649566955363, "grad_norm": 0.7828627909758169, "learning_rate": 8.959823291993268e-06, "loss": 0.0644, "step": 2595 }, { "epoch": 1.1530091050410838, "grad_norm": 0.44448667498264577, "learning_rate": 8.95863938797164e-06, "loss": 0.0437, "step": 2596 }, { "epoch": 1.153453253386631, "grad_norm": 0.44346487742364565, "learning_rate": 8.957454888899264e-06, "loss": 0.0462, "step": 2597 }, { "epoch": 1.1538974017321786, "grad_norm": 0.4503855208067358, "learning_rate": 8.956269794954195e-06, "loss": 0.0467, "step": 2598 }, { "epoch": 1.1543415500777259, "grad_norm": 0.50075395685955, "learning_rate": 8.95508410631457e-06, "loss": 0.0513, "step": 2599 }, { "epoch": 1.1547856984232734, "grad_norm": 1.5111182491065092, "learning_rate": 8.953897823158618e-06, "loss": 0.0422, "step": 2600 }, { "epoch": 1.1552298467688207, "grad_norm": 0.4837059174435265, "learning_rate": 8.95271094566466e-06, "loss": 0.0484, "step": 2601 }, { "epoch": 1.1556739951143682, "grad_norm": 0.9602960350821099, "learning_rate": 8.9515234740111e-06, "loss": 0.0596, "step": 2602 }, { "epoch": 1.1561181434599157, "grad_norm": 0.4674309097106074, "learning_rate": 8.950335408376438e-06, "loss": 0.0388, "step": 2603 }, { "epoch": 1.156562291805463, "grad_norm": 0.5494197343743635, "learning_rate": 8.949146748939259e-06, "loss": 0.0516, "step": 2604 }, { "epoch": 1.1570064401510105, "grad_norm": 0.603296345877419, "learning_rate": 8.94795749587824e-06, "loss": 0.0648, "step": 2605 }, { "epoch": 1.1574505884965578, "grad_norm": 0.6840688549848888, "learning_rate": 8.946767649372144e-06, "loss": 0.0633, "step": 2606 }, { "epoch": 1.1578947368421053, "grad_norm": 0.7710851984691011, "learning_rate": 8.945577209599829e-06, "loss": 0.0557, "step": 2607 }, { "epoch": 1.1583388851876526, "grad_norm": 0.5783757044901674, "learning_rate": 8.944386176740233e-06, "loss": 0.0552, "step": 2608 }, { "epoch": 1.1587830335332001, "grad_norm": 0.4854270839314596, "learning_rate": 8.943194550972392e-06, "loss": 0.0531, "step": 2609 }, { "epoch": 1.1592271818787476, "grad_norm": 0.5109780619188811, "learning_rate": 8.942002332475428e-06, "loss": 0.0512, "step": 2610 }, { "epoch": 1.159671330224295, "grad_norm": 0.541497267825657, "learning_rate": 8.940809521428551e-06, "loss": 0.0622, "step": 2611 }, { "epoch": 1.1601154785698422, "grad_norm": 0.6325619808955779, "learning_rate": 8.939616118011058e-06, "loss": 0.0577, "step": 2612 }, { "epoch": 1.1605596269153897, "grad_norm": 0.5836195560024343, "learning_rate": 8.938422122402342e-06, "loss": 0.0608, "step": 2613 }, { "epoch": 1.1610037752609372, "grad_norm": 0.5121529339351947, "learning_rate": 8.937227534781878e-06, "loss": 0.0496, "step": 2614 }, { "epoch": 1.1614479236064845, "grad_norm": 0.5247406850098297, "learning_rate": 8.936032355329233e-06, "loss": 0.0509, "step": 2615 }, { "epoch": 1.161892071952032, "grad_norm": 0.43915296270490234, "learning_rate": 8.934836584224065e-06, "loss": 0.0483, "step": 2616 }, { "epoch": 1.1623362202975793, "grad_norm": 0.4849661298577939, "learning_rate": 8.933640221646116e-06, "loss": 0.0534, "step": 2617 }, { "epoch": 1.1627803686431268, "grad_norm": 0.5053186137211374, "learning_rate": 8.932443267775221e-06, "loss": 0.0508, "step": 2618 }, { "epoch": 1.1632245169886741, "grad_norm": 0.5361080174409868, "learning_rate": 8.931245722791305e-06, "loss": 0.0609, "step": 2619 }, { "epoch": 1.1636686653342216, "grad_norm": 0.40985225869641795, "learning_rate": 8.930047586874373e-06, "loss": 0.0458, "step": 2620 }, { "epoch": 1.1641128136797692, "grad_norm": 0.48841596818152816, "learning_rate": 8.928848860204531e-06, "loss": 0.0782, "step": 2621 }, { "epoch": 1.1645569620253164, "grad_norm": 0.5219141696813542, "learning_rate": 8.927649542961965e-06, "loss": 0.0498, "step": 2622 }, { "epoch": 1.165001110370864, "grad_norm": 0.6061661462461002, "learning_rate": 8.926449635326954e-06, "loss": 0.0586, "step": 2623 }, { "epoch": 1.1654452587164112, "grad_norm": 0.4500746397097711, "learning_rate": 8.925249137479864e-06, "loss": 0.0399, "step": 2624 }, { "epoch": 1.1658894070619588, "grad_norm": 0.5211157612826388, "learning_rate": 8.92404804960115e-06, "loss": 0.05, "step": 2625 }, { "epoch": 1.166333555407506, "grad_norm": 0.4903699164468352, "learning_rate": 8.922846371871355e-06, "loss": 0.0496, "step": 2626 }, { "epoch": 1.1667777037530536, "grad_norm": 0.7464483529673437, "learning_rate": 8.921644104471114e-06, "loss": 0.0601, "step": 2627 }, { "epoch": 1.1672218520986009, "grad_norm": 0.4872521353158615, "learning_rate": 8.920441247581148e-06, "loss": 0.0525, "step": 2628 }, { "epoch": 1.1676660004441484, "grad_norm": 0.5674275180118361, "learning_rate": 8.919237801382265e-06, "loss": 0.0552, "step": 2629 }, { "epoch": 1.1681101487896957, "grad_norm": 0.8950605367456311, "learning_rate": 8.918033766055364e-06, "loss": 0.0602, "step": 2630 }, { "epoch": 1.1685542971352432, "grad_norm": 0.6732326138636984, "learning_rate": 8.916829141781432e-06, "loss": 0.0558, "step": 2631 }, { "epoch": 1.1689984454807907, "grad_norm": 0.48994316207093613, "learning_rate": 8.915623928741546e-06, "loss": 0.0628, "step": 2632 }, { "epoch": 1.169442593826338, "grad_norm": 0.4212795487662145, "learning_rate": 8.914418127116867e-06, "loss": 0.0508, "step": 2633 }, { "epoch": 1.1698867421718855, "grad_norm": 0.49361391712276814, "learning_rate": 8.91321173708865e-06, "loss": 0.0638, "step": 2634 }, { "epoch": 1.1703308905174328, "grad_norm": 0.7512476603132378, "learning_rate": 8.912004758838235e-06, "loss": 0.0839, "step": 2635 }, { "epoch": 1.1707750388629803, "grad_norm": 0.505025169980782, "learning_rate": 8.910797192547051e-06, "loss": 0.0457, "step": 2636 }, { "epoch": 1.1712191872085276, "grad_norm": 0.5671039324048706, "learning_rate": 8.909589038396617e-06, "loss": 0.044, "step": 2637 }, { "epoch": 1.171663335554075, "grad_norm": 0.4745487457505175, "learning_rate": 8.908380296568537e-06, "loss": 0.0479, "step": 2638 }, { "epoch": 1.1721074838996224, "grad_norm": 0.8949118984931155, "learning_rate": 8.907170967244508e-06, "loss": 0.0576, "step": 2639 }, { "epoch": 1.17255163224517, "grad_norm": 0.4520757913913456, "learning_rate": 8.905961050606311e-06, "loss": 0.0467, "step": 2640 }, { "epoch": 1.1729957805907172, "grad_norm": 0.4749545091509921, "learning_rate": 8.904750546835817e-06, "loss": 0.0446, "step": 2641 }, { "epoch": 1.1734399289362647, "grad_norm": 0.522016267548402, "learning_rate": 8.903539456114988e-06, "loss": 0.053, "step": 2642 }, { "epoch": 1.1738840772818122, "grad_norm": 0.530579514310406, "learning_rate": 8.902327778625865e-06, "loss": 0.0437, "step": 2643 }, { "epoch": 1.1743282256273595, "grad_norm": 0.6048157018439171, "learning_rate": 8.90111551455059e-06, "loss": 0.0435, "step": 2644 }, { "epoch": 1.174772373972907, "grad_norm": 0.5906854069033922, "learning_rate": 8.899902664071384e-06, "loss": 0.0626, "step": 2645 }, { "epoch": 1.1752165223184543, "grad_norm": 0.47030730215576705, "learning_rate": 8.898689227370563e-06, "loss": 0.0386, "step": 2646 }, { "epoch": 1.1756606706640018, "grad_norm": 0.6351029367838139, "learning_rate": 8.897475204630521e-06, "loss": 0.0591, "step": 2647 }, { "epoch": 1.176104819009549, "grad_norm": 0.6410082214075851, "learning_rate": 8.89626059603375e-06, "loss": 0.0579, "step": 2648 }, { "epoch": 1.1765489673550966, "grad_norm": 0.5406985095669288, "learning_rate": 8.895045401762825e-06, "loss": 0.0568, "step": 2649 }, { "epoch": 1.1769931157006441, "grad_norm": 0.7480895085443751, "learning_rate": 8.893829622000412e-06, "loss": 0.0707, "step": 2650 }, { "epoch": 1.1774372640461914, "grad_norm": 0.45890411967998634, "learning_rate": 8.892613256929261e-06, "loss": 0.0416, "step": 2651 }, { "epoch": 1.177881412391739, "grad_norm": 0.396661981002553, "learning_rate": 8.891396306732214e-06, "loss": 0.0423, "step": 2652 }, { "epoch": 1.1783255607372862, "grad_norm": 0.5069251937753831, "learning_rate": 8.890178771592198e-06, "loss": 0.0487, "step": 2653 }, { "epoch": 1.1787697090828337, "grad_norm": 0.7785700220445251, "learning_rate": 8.888960651692231e-06, "loss": 0.0719, "step": 2654 }, { "epoch": 1.179213857428381, "grad_norm": 0.5673896965611613, "learning_rate": 8.887741947215415e-06, "loss": 0.0556, "step": 2655 }, { "epoch": 1.1796580057739285, "grad_norm": 0.542725413232058, "learning_rate": 8.886522658344944e-06, "loss": 0.0493, "step": 2656 }, { "epoch": 1.1801021541194758, "grad_norm": 0.6266382123491776, "learning_rate": 8.885302785264098e-06, "loss": 0.0383, "step": 2657 }, { "epoch": 1.1805463024650233, "grad_norm": 0.46581618376933115, "learning_rate": 8.884082328156243e-06, "loss": 0.0419, "step": 2658 }, { "epoch": 1.1809904508105706, "grad_norm": 0.5581446013530325, "learning_rate": 8.882861287204836e-06, "loss": 0.0498, "step": 2659 }, { "epoch": 1.1814345991561181, "grad_norm": 0.42956852547953145, "learning_rate": 8.881639662593417e-06, "loss": 0.0381, "step": 2660 }, { "epoch": 1.1818787475016657, "grad_norm": 0.45803495008537815, "learning_rate": 8.880417454505622e-06, "loss": 0.0492, "step": 2661 }, { "epoch": 1.182322895847213, "grad_norm": 0.6914169208326956, "learning_rate": 8.879194663125164e-06, "loss": 0.0517, "step": 2662 }, { "epoch": 1.1827670441927605, "grad_norm": 0.6395374873974822, "learning_rate": 8.877971288635853e-06, "loss": 0.0631, "step": 2663 }, { "epoch": 1.1832111925383078, "grad_norm": 0.558846270469075, "learning_rate": 8.876747331221583e-06, "loss": 0.058, "step": 2664 }, { "epoch": 1.1836553408838553, "grad_norm": 0.49754542881845054, "learning_rate": 8.875522791066333e-06, "loss": 0.0616, "step": 2665 }, { "epoch": 1.1840994892294026, "grad_norm": 0.641582498045218, "learning_rate": 8.874297668354175e-06, "loss": 0.0432, "step": 2666 }, { "epoch": 1.18454363757495, "grad_norm": 0.5357146614905643, "learning_rate": 8.873071963269265e-06, "loss": 0.0581, "step": 2667 }, { "epoch": 1.1849877859204974, "grad_norm": 0.3864891516746441, "learning_rate": 8.871845675995847e-06, "loss": 0.0397, "step": 2668 }, { "epoch": 1.1854319342660449, "grad_norm": 0.5817776113032109, "learning_rate": 8.870618806718252e-06, "loss": 0.0548, "step": 2669 }, { "epoch": 1.1858760826115922, "grad_norm": 0.6451637896241947, "learning_rate": 8.8693913556209e-06, "loss": 0.0555, "step": 2670 }, { "epoch": 1.1863202309571397, "grad_norm": 0.522938279572616, "learning_rate": 8.868163322888298e-06, "loss": 0.0429, "step": 2671 }, { "epoch": 1.1867643793026872, "grad_norm": 0.6059080038510308, "learning_rate": 8.86693470870504e-06, "loss": 0.056, "step": 2672 }, { "epoch": 1.1872085276482345, "grad_norm": 0.5024058690626159, "learning_rate": 8.865705513255807e-06, "loss": 0.0371, "step": 2673 }, { "epoch": 1.187652675993782, "grad_norm": 0.5033221412559168, "learning_rate": 8.864475736725369e-06, "loss": 0.06, "step": 2674 }, { "epoch": 1.1880968243393293, "grad_norm": 0.4156977033942008, "learning_rate": 8.863245379298582e-06, "loss": 0.0378, "step": 2675 }, { "epoch": 1.1885409726848768, "grad_norm": 0.8144903668245238, "learning_rate": 8.86201444116039e-06, "loss": 0.0614, "step": 2676 }, { "epoch": 1.188985121030424, "grad_norm": 0.4226975303790829, "learning_rate": 8.860782922495821e-06, "loss": 0.046, "step": 2677 }, { "epoch": 1.1894292693759716, "grad_norm": 0.52956763181697, "learning_rate": 8.859550823489997e-06, "loss": 0.0384, "step": 2678 }, { "epoch": 1.189873417721519, "grad_norm": 0.5489791027942442, "learning_rate": 8.858318144328123e-06, "loss": 0.0742, "step": 2679 }, { "epoch": 1.1903175660670664, "grad_norm": 1.0552083022753025, "learning_rate": 8.85708488519549e-06, "loss": 0.0446, "step": 2680 }, { "epoch": 1.1907617144126137, "grad_norm": 0.5078924313476717, "learning_rate": 8.855851046277478e-06, "loss": 0.054, "step": 2681 }, { "epoch": 1.1912058627581612, "grad_norm": 0.5148608895352339, "learning_rate": 8.854616627759553e-06, "loss": 0.0523, "step": 2682 }, { "epoch": 1.1916500111037087, "grad_norm": 0.7282958984855765, "learning_rate": 8.853381629827272e-06, "loss": 0.0506, "step": 2683 }, { "epoch": 1.192094159449256, "grad_norm": 0.5448500153068538, "learning_rate": 8.852146052666275e-06, "loss": 0.0404, "step": 2684 }, { "epoch": 1.1925383077948035, "grad_norm": 0.40034874918632524, "learning_rate": 8.850909896462288e-06, "loss": 0.0351, "step": 2685 }, { "epoch": 1.1929824561403508, "grad_norm": 0.6365377232810542, "learning_rate": 8.849673161401129e-06, "loss": 0.0616, "step": 2686 }, { "epoch": 1.1934266044858983, "grad_norm": 0.5874533737153114, "learning_rate": 8.848435847668699e-06, "loss": 0.06, "step": 2687 }, { "epoch": 1.1938707528314456, "grad_norm": 0.4075740080944545, "learning_rate": 8.847197955450988e-06, "loss": 0.0399, "step": 2688 }, { "epoch": 1.1943149011769931, "grad_norm": 0.4560997072221143, "learning_rate": 8.845959484934073e-06, "loss": 0.0482, "step": 2689 }, { "epoch": 1.1947590495225406, "grad_norm": 0.7271393081297982, "learning_rate": 8.844720436304113e-06, "loss": 0.0694, "step": 2690 }, { "epoch": 1.195203197868088, "grad_norm": 0.5135039124857559, "learning_rate": 8.843480809747363e-06, "loss": 0.064, "step": 2691 }, { "epoch": 1.1956473462136354, "grad_norm": 0.5417817389406259, "learning_rate": 8.842240605450158e-06, "loss": 0.0816, "step": 2692 }, { "epoch": 1.1960914945591827, "grad_norm": 0.40641657577367263, "learning_rate": 8.840999823598921e-06, "loss": 0.0378, "step": 2693 }, { "epoch": 1.1965356429047302, "grad_norm": 0.7137594073963416, "learning_rate": 8.839758464380163e-06, "loss": 0.063, "step": 2694 }, { "epoch": 1.1969797912502775, "grad_norm": 0.6152412711374633, "learning_rate": 8.838516527980483e-06, "loss": 0.0403, "step": 2695 }, { "epoch": 1.197423939595825, "grad_norm": 0.5995872223176858, "learning_rate": 8.837274014586564e-06, "loss": 0.0461, "step": 2696 }, { "epoch": 1.1978680879413723, "grad_norm": 0.5251664478854555, "learning_rate": 8.836030924385175e-06, "loss": 0.0558, "step": 2697 }, { "epoch": 1.1983122362869199, "grad_norm": 0.444869598867083, "learning_rate": 8.834787257563178e-06, "loss": 0.0583, "step": 2698 }, { "epoch": 1.1987563846324671, "grad_norm": 0.5140725378841854, "learning_rate": 8.833543014307513e-06, "loss": 0.0513, "step": 2699 }, { "epoch": 1.1992005329780147, "grad_norm": 0.5779472328853184, "learning_rate": 8.83229819480521e-06, "loss": 0.0653, "step": 2700 }, { "epoch": 1.1996446813235622, "grad_norm": 0.5908242045279332, "learning_rate": 8.831052799243394e-06, "loss": 0.0565, "step": 2701 }, { "epoch": 1.2000888296691095, "grad_norm": 0.9242559999496844, "learning_rate": 8.82980682780926e-06, "loss": 0.0628, "step": 2702 }, { "epoch": 1.200532978014657, "grad_norm": 0.4108528589278202, "learning_rate": 8.828560280690104e-06, "loss": 0.0438, "step": 2703 }, { "epoch": 1.2009771263602043, "grad_norm": 0.4411744291440838, "learning_rate": 8.827313158073304e-06, "loss": 0.0567, "step": 2704 }, { "epoch": 1.2014212747057518, "grad_norm": 0.5260335018456415, "learning_rate": 8.826065460146318e-06, "loss": 0.0579, "step": 2705 }, { "epoch": 1.201865423051299, "grad_norm": 0.6891291032685649, "learning_rate": 8.824817187096702e-06, "loss": 0.056, "step": 2706 }, { "epoch": 1.2023095713968466, "grad_norm": 0.7678244223744226, "learning_rate": 8.823568339112089e-06, "loss": 0.0577, "step": 2707 }, { "epoch": 1.2027537197423939, "grad_norm": 0.605196561971164, "learning_rate": 8.822318916380207e-06, "loss": 0.0511, "step": 2708 }, { "epoch": 1.2031978680879414, "grad_norm": 0.5123811082634186, "learning_rate": 8.821068919088858e-06, "loss": 0.0578, "step": 2709 }, { "epoch": 1.2036420164334887, "grad_norm": 0.4168805885973522, "learning_rate": 8.819818347425943e-06, "loss": 0.0402, "step": 2710 }, { "epoch": 1.2040861647790362, "grad_norm": 0.4322765343829467, "learning_rate": 8.818567201579444e-06, "loss": 0.0404, "step": 2711 }, { "epoch": 1.2045303131245837, "grad_norm": 0.6086948606935972, "learning_rate": 8.817315481737428e-06, "loss": 0.0574, "step": 2712 }, { "epoch": 1.204974461470131, "grad_norm": 0.5388644202630685, "learning_rate": 8.816063188088049e-06, "loss": 0.0526, "step": 2713 }, { "epoch": 1.2054186098156785, "grad_norm": 0.4833033812042681, "learning_rate": 8.814810320819551e-06, "loss": 0.0522, "step": 2714 }, { "epoch": 1.2058627581612258, "grad_norm": 0.9567007955598438, "learning_rate": 8.81355688012026e-06, "loss": 0.0926, "step": 2715 }, { "epoch": 1.2063069065067733, "grad_norm": 0.6610578357230338, "learning_rate": 8.812302866178586e-06, "loss": 0.0508, "step": 2716 }, { "epoch": 1.2067510548523206, "grad_norm": 0.5170485323761623, "learning_rate": 8.811048279183034e-06, "loss": 0.0482, "step": 2717 }, { "epoch": 1.207195203197868, "grad_norm": 0.49170885647162377, "learning_rate": 8.809793119322188e-06, "loss": 0.0452, "step": 2718 }, { "epoch": 1.2076393515434156, "grad_norm": 0.6569021320261341, "learning_rate": 8.808537386784717e-06, "loss": 0.0602, "step": 2719 }, { "epoch": 1.208083499888963, "grad_norm": 0.5861436832219081, "learning_rate": 8.807281081759382e-06, "loss": 0.0503, "step": 2720 }, { "epoch": 1.2085276482345104, "grad_norm": 0.6362413163945917, "learning_rate": 8.806024204435024e-06, "loss": 0.0619, "step": 2721 }, { "epoch": 1.2089717965800577, "grad_norm": 0.8112427507399114, "learning_rate": 8.804766755000577e-06, "loss": 0.0878, "step": 2722 }, { "epoch": 1.2094159449256052, "grad_norm": 0.6213886119436266, "learning_rate": 8.803508733645056e-06, "loss": 0.0391, "step": 2723 }, { "epoch": 1.2098600932711525, "grad_norm": 0.4701764036600453, "learning_rate": 8.80225014055756e-06, "loss": 0.044, "step": 2724 }, { "epoch": 1.2103042416167, "grad_norm": 0.6176542835674168, "learning_rate": 8.80099097592728e-06, "loss": 0.0717, "step": 2725 }, { "epoch": 1.2107483899622473, "grad_norm": 0.5676221760627296, "learning_rate": 8.799731239943488e-06, "loss": 0.0485, "step": 2726 }, { "epoch": 1.2111925383077948, "grad_norm": 0.7416139133479585, "learning_rate": 8.798470932795545e-06, "loss": 0.0848, "step": 2727 }, { "epoch": 1.2116366866533421, "grad_norm": 0.8518948114813076, "learning_rate": 8.797210054672897e-06, "loss": 0.0603, "step": 2728 }, { "epoch": 1.2120808349988896, "grad_norm": 0.5447845702100781, "learning_rate": 8.795948605765071e-06, "loss": 0.0503, "step": 2729 }, { "epoch": 1.2125249833444371, "grad_norm": 0.6041323757362492, "learning_rate": 8.794686586261692e-06, "loss": 0.0606, "step": 2730 }, { "epoch": 1.2129691316899844, "grad_norm": 0.3720241663572847, "learning_rate": 8.793423996352458e-06, "loss": 0.0396, "step": 2731 }, { "epoch": 1.213413280035532, "grad_norm": 0.6614032276078902, "learning_rate": 8.792160836227156e-06, "loss": 0.0778, "step": 2732 }, { "epoch": 1.2138574283810792, "grad_norm": 0.4371895131814461, "learning_rate": 8.790897106075665e-06, "loss": 0.0464, "step": 2733 }, { "epoch": 1.2143015767266268, "grad_norm": 0.42112583525881137, "learning_rate": 8.78963280608794e-06, "loss": 0.0283, "step": 2734 }, { "epoch": 1.214745725072174, "grad_norm": 0.8563223231466682, "learning_rate": 8.788367936454033e-06, "loss": 0.0588, "step": 2735 }, { "epoch": 1.2151898734177216, "grad_norm": 0.5558136427910825, "learning_rate": 8.78710249736407e-06, "loss": 0.0438, "step": 2736 }, { "epoch": 1.2156340217632688, "grad_norm": 0.5340710064178852, "learning_rate": 8.78583648900827e-06, "loss": 0.0517, "step": 2737 }, { "epoch": 1.2160781701088164, "grad_norm": 0.4855046400709726, "learning_rate": 8.784569911576937e-06, "loss": 0.0429, "step": 2738 }, { "epoch": 1.2165223184543636, "grad_norm": 0.5862019061765111, "learning_rate": 8.783302765260456e-06, "loss": 0.0549, "step": 2739 }, { "epoch": 1.2169664667999112, "grad_norm": 0.754950173363004, "learning_rate": 8.782035050249302e-06, "loss": 0.051, "step": 2740 }, { "epoch": 1.2174106151454587, "grad_norm": 0.5310823633824483, "learning_rate": 8.780766766734037e-06, "loss": 0.0535, "step": 2741 }, { "epoch": 1.217854763491006, "grad_norm": 0.515619945566938, "learning_rate": 8.779497914905302e-06, "loss": 0.0518, "step": 2742 }, { "epoch": 1.2182989118365535, "grad_norm": 0.5419625287890412, "learning_rate": 8.778228494953826e-06, "loss": 0.0629, "step": 2743 }, { "epoch": 1.2187430601821008, "grad_norm": 0.41887351869316514, "learning_rate": 8.776958507070427e-06, "loss": 0.0436, "step": 2744 }, { "epoch": 1.2191872085276483, "grad_norm": 0.5475369085076727, "learning_rate": 8.775687951446007e-06, "loss": 0.043, "step": 2745 }, { "epoch": 1.2196313568731956, "grad_norm": 0.6765114475250111, "learning_rate": 8.774416828271548e-06, "loss": 0.0542, "step": 2746 }, { "epoch": 1.220075505218743, "grad_norm": 0.678214320662665, "learning_rate": 8.773145137738125e-06, "loss": 0.0815, "step": 2747 }, { "epoch": 1.2205196535642906, "grad_norm": 0.49324022002296203, "learning_rate": 8.771872880036893e-06, "loss": 0.0524, "step": 2748 }, { "epoch": 1.2209638019098379, "grad_norm": 0.5958708788497837, "learning_rate": 8.770600055359094e-06, "loss": 0.0456, "step": 2749 }, { "epoch": 1.2214079502553852, "grad_norm": 0.4999935592497019, "learning_rate": 8.769326663896056e-06, "loss": 0.0409, "step": 2750 }, { "epoch": 1.2218520986009327, "grad_norm": 0.43642259806336103, "learning_rate": 8.76805270583919e-06, "loss": 0.05, "step": 2751 }, { "epoch": 1.2222962469464802, "grad_norm": 0.6350373365457326, "learning_rate": 8.766778181379993e-06, "loss": 0.046, "step": 2752 }, { "epoch": 1.2227403952920275, "grad_norm": 0.49370518082751286, "learning_rate": 8.765503090710052e-06, "loss": 0.0504, "step": 2753 }, { "epoch": 1.223184543637575, "grad_norm": 0.6617812231173426, "learning_rate": 8.76422743402103e-06, "loss": 0.0524, "step": 2754 }, { "epoch": 1.2236286919831223, "grad_norm": 0.46946114196024546, "learning_rate": 8.762951211504682e-06, "loss": 0.0459, "step": 2755 }, { "epoch": 1.2240728403286698, "grad_norm": 0.49081430568764717, "learning_rate": 8.761674423352844e-06, "loss": 0.0438, "step": 2756 }, { "epoch": 1.224516988674217, "grad_norm": 0.7508698890744291, "learning_rate": 8.760397069757443e-06, "loss": 0.0535, "step": 2757 }, { "epoch": 1.2249611370197646, "grad_norm": 0.7095498367244547, "learning_rate": 8.759119150910482e-06, "loss": 0.0439, "step": 2758 }, { "epoch": 1.2254052853653121, "grad_norm": 0.5423777599181153, "learning_rate": 8.757840667004059e-06, "loss": 0.0584, "step": 2759 }, { "epoch": 1.2258494337108594, "grad_norm": 0.38826805829970257, "learning_rate": 8.756561618230348e-06, "loss": 0.0372, "step": 2760 }, { "epoch": 1.226293582056407, "grad_norm": 0.4214243171446597, "learning_rate": 8.755282004781613e-06, "loss": 0.0387, "step": 2761 }, { "epoch": 1.2267377304019542, "grad_norm": 0.6110973962790421, "learning_rate": 8.754001826850201e-06, "loss": 0.0504, "step": 2762 }, { "epoch": 1.2271818787475017, "grad_norm": 0.5706371024485902, "learning_rate": 8.752721084628545e-06, "loss": 0.0447, "step": 2763 }, { "epoch": 1.227626027093049, "grad_norm": 0.7667925876568604, "learning_rate": 8.751439778309162e-06, "loss": 0.0661, "step": 2764 }, { "epoch": 1.2280701754385965, "grad_norm": 0.4947295202945595, "learning_rate": 8.750157908084655e-06, "loss": 0.0481, "step": 2765 }, { "epoch": 1.2285143237841438, "grad_norm": 0.4844603657944363, "learning_rate": 8.74887547414771e-06, "loss": 0.0428, "step": 2766 }, { "epoch": 1.2289584721296913, "grad_norm": 0.5262436629587928, "learning_rate": 8.747592476691102e-06, "loss": 0.0572, "step": 2767 }, { "epoch": 1.2294026204752386, "grad_norm": 0.48653491041855407, "learning_rate": 8.746308915907681e-06, "loss": 0.0466, "step": 2768 }, { "epoch": 1.2298467688207861, "grad_norm": 0.3598264709368313, "learning_rate": 8.745024791990392e-06, "loss": 0.036, "step": 2769 }, { "epoch": 1.2302909171663337, "grad_norm": 0.4281231861639907, "learning_rate": 8.74374010513226e-06, "loss": 0.0375, "step": 2770 }, { "epoch": 1.230735065511881, "grad_norm": 0.6227157372551906, "learning_rate": 8.742454855526396e-06, "loss": 0.06, "step": 2771 }, { "epoch": 1.2311792138574285, "grad_norm": 0.5721086967364898, "learning_rate": 8.741169043365994e-06, "loss": 0.0517, "step": 2772 }, { "epoch": 1.2316233622029757, "grad_norm": 0.4118993047204808, "learning_rate": 8.739882668844332e-06, "loss": 0.0433, "step": 2773 }, { "epoch": 1.2320675105485233, "grad_norm": 0.7343040841582446, "learning_rate": 8.738595732154776e-06, "loss": 0.0525, "step": 2774 }, { "epoch": 1.2325116588940705, "grad_norm": 0.4304569080551511, "learning_rate": 8.737308233490775e-06, "loss": 0.0548, "step": 2775 }, { "epoch": 1.232955807239618, "grad_norm": 0.5483425130829543, "learning_rate": 8.736020173045858e-06, "loss": 0.0547, "step": 2776 }, { "epoch": 1.2333999555851654, "grad_norm": 0.45127669817592087, "learning_rate": 8.734731551013648e-06, "loss": 0.0458, "step": 2777 }, { "epoch": 1.2338441039307129, "grad_norm": 0.5076179219488122, "learning_rate": 8.733442367587842e-06, "loss": 0.0495, "step": 2778 }, { "epoch": 1.2342882522762602, "grad_norm": 0.4061357633325624, "learning_rate": 8.732152622962229e-06, "loss": 0.038, "step": 2779 }, { "epoch": 1.2347324006218077, "grad_norm": 0.6061025558148905, "learning_rate": 8.730862317330678e-06, "loss": 0.0552, "step": 2780 }, { "epoch": 1.2351765489673552, "grad_norm": 0.7935186621762401, "learning_rate": 8.729571450887145e-06, "loss": 0.0497, "step": 2781 }, { "epoch": 1.2356206973129025, "grad_norm": 0.4698985875136965, "learning_rate": 8.728280023825667e-06, "loss": 0.0449, "step": 2782 }, { "epoch": 1.23606484565845, "grad_norm": 0.5845370985288897, "learning_rate": 8.726988036340372e-06, "loss": 0.0558, "step": 2783 }, { "epoch": 1.2365089940039973, "grad_norm": 0.40081844727764104, "learning_rate": 8.725695488625463e-06, "loss": 0.0309, "step": 2784 }, { "epoch": 1.2369531423495448, "grad_norm": 0.565816281763518, "learning_rate": 8.724402380875234e-06, "loss": 0.0527, "step": 2785 }, { "epoch": 1.237397290695092, "grad_norm": 0.4908483433280354, "learning_rate": 8.72310871328406e-06, "loss": 0.0406, "step": 2786 }, { "epoch": 1.2378414390406396, "grad_norm": 0.5334199926157142, "learning_rate": 8.7218144860464e-06, "loss": 0.0432, "step": 2787 }, { "epoch": 1.238285587386187, "grad_norm": 0.43486952273189144, "learning_rate": 8.720519699356804e-06, "loss": 0.0449, "step": 2788 }, { "epoch": 1.2387297357317344, "grad_norm": 0.5049916355349909, "learning_rate": 8.719224353409895e-06, "loss": 0.056, "step": 2789 }, { "epoch": 1.239173884077282, "grad_norm": 0.44547159070564635, "learning_rate": 8.717928448400387e-06, "loss": 0.0444, "step": 2790 }, { "epoch": 1.2396180324228292, "grad_norm": 0.4328408997299911, "learning_rate": 8.716631984523076e-06, "loss": 0.0461, "step": 2791 }, { "epoch": 1.2400621807683767, "grad_norm": 0.42920188220498545, "learning_rate": 8.715334961972844e-06, "loss": 0.0477, "step": 2792 }, { "epoch": 1.240506329113924, "grad_norm": 0.45226945207712443, "learning_rate": 8.714037380944655e-06, "loss": 0.0445, "step": 2793 }, { "epoch": 1.2409504774594715, "grad_norm": 0.6142043461351961, "learning_rate": 8.712739241633557e-06, "loss": 0.0526, "step": 2794 }, { "epoch": 1.2413946258050188, "grad_norm": 0.49524623051926364, "learning_rate": 8.711440544234681e-06, "loss": 0.0448, "step": 2795 }, { "epoch": 1.2418387741505663, "grad_norm": 0.527589137144962, "learning_rate": 8.710141288943247e-06, "loss": 0.0681, "step": 2796 }, { "epoch": 1.2422829224961136, "grad_norm": 0.45949156992557444, "learning_rate": 8.708841475954551e-06, "loss": 0.0597, "step": 2797 }, { "epoch": 1.2427270708416611, "grad_norm": 0.5807053135952894, "learning_rate": 8.707541105463982e-06, "loss": 0.0602, "step": 2798 }, { "epoch": 1.2431712191872086, "grad_norm": 0.47202242550842133, "learning_rate": 8.706240177667003e-06, "loss": 0.043, "step": 2799 }, { "epoch": 1.243615367532756, "grad_norm": 0.5195621264387825, "learning_rate": 8.704938692759166e-06, "loss": 0.0384, "step": 2800 }, { "epoch": 1.2440595158783034, "grad_norm": 0.40308041507113895, "learning_rate": 8.703636650936108e-06, "loss": 0.0352, "step": 2801 }, { "epoch": 1.2445036642238507, "grad_norm": 0.5663764923556683, "learning_rate": 8.70233405239355e-06, "loss": 0.0487, "step": 2802 }, { "epoch": 1.2449478125693982, "grad_norm": 0.6290221419877656, "learning_rate": 8.70103089732729e-06, "loss": 0.0518, "step": 2803 }, { "epoch": 1.2453919609149455, "grad_norm": 0.4173665256358516, "learning_rate": 8.699727185933215e-06, "loss": 0.0352, "step": 2804 }, { "epoch": 1.245836109260493, "grad_norm": 0.8204810767617772, "learning_rate": 8.698422918407299e-06, "loss": 0.0487, "step": 2805 }, { "epoch": 1.2462802576060403, "grad_norm": 0.9850006429206544, "learning_rate": 8.697118094945593e-06, "loss": 0.0865, "step": 2806 }, { "epoch": 1.2467244059515878, "grad_norm": 0.48735159482648766, "learning_rate": 8.695812715744235e-06, "loss": 0.0358, "step": 2807 }, { "epoch": 1.2471685542971351, "grad_norm": 0.5010917630261117, "learning_rate": 8.694506780999444e-06, "loss": 0.0477, "step": 2808 }, { "epoch": 1.2476127026426826, "grad_norm": 0.4869736837475569, "learning_rate": 8.693200290907525e-06, "loss": 0.0484, "step": 2809 }, { "epoch": 1.2480568509882302, "grad_norm": 0.3824351180128345, "learning_rate": 8.691893245664867e-06, "loss": 0.0352, "step": 2810 }, { "epoch": 1.2485009993337775, "grad_norm": 0.5417070708808928, "learning_rate": 8.690585645467937e-06, "loss": 0.0447, "step": 2811 }, { "epoch": 1.248945147679325, "grad_norm": 0.5086804337705053, "learning_rate": 8.689277490513295e-06, "loss": 0.05, "step": 2812 }, { "epoch": 1.2493892960248723, "grad_norm": 0.4431550852814911, "learning_rate": 8.687968780997576e-06, "loss": 0.0635, "step": 2813 }, { "epoch": 1.2498334443704198, "grad_norm": 0.3783726536150246, "learning_rate": 8.686659517117501e-06, "loss": 0.0384, "step": 2814 }, { "epoch": 1.250277592715967, "grad_norm": 0.5331217000478606, "learning_rate": 8.685349699069875e-06, "loss": 0.0409, "step": 2815 }, { "epoch": 1.2507217410615146, "grad_norm": 0.38074858661235217, "learning_rate": 8.684039327051586e-06, "loss": 0.038, "step": 2816 }, { "epoch": 1.251165889407062, "grad_norm": 0.6121662418691846, "learning_rate": 8.682728401259606e-06, "loss": 0.0556, "step": 2817 }, { "epoch": 1.2516100377526094, "grad_norm": 0.8655438293508221, "learning_rate": 8.681416921890988e-06, "loss": 0.0559, "step": 2818 }, { "epoch": 1.2520541860981567, "grad_norm": 0.5646419752199546, "learning_rate": 8.680104889142871e-06, "loss": 0.0546, "step": 2819 }, { "epoch": 1.2524983344437042, "grad_norm": 0.47189997209382956, "learning_rate": 8.678792303212474e-06, "loss": 0.038, "step": 2820 }, { "epoch": 1.2529424827892517, "grad_norm": 0.5347473900252707, "learning_rate": 8.677479164297102e-06, "loss": 0.0506, "step": 2821 }, { "epoch": 1.253386631134799, "grad_norm": 0.6752833223029766, "learning_rate": 8.676165472594145e-06, "loss": 0.0553, "step": 2822 }, { "epoch": 1.2538307794803465, "grad_norm": 0.6290436949126601, "learning_rate": 8.674851228301066e-06, "loss": 0.0628, "step": 2823 }, { "epoch": 1.2542749278258938, "grad_norm": 0.5190239487706246, "learning_rate": 8.673536431615426e-06, "loss": 0.053, "step": 2824 }, { "epoch": 1.2547190761714413, "grad_norm": 0.5039759155119382, "learning_rate": 8.672221082734857e-06, "loss": 0.0538, "step": 2825 }, { "epoch": 1.2551632245169886, "grad_norm": 0.5007524426018746, "learning_rate": 8.670905181857078e-06, "loss": 0.0379, "step": 2826 }, { "epoch": 1.255607372862536, "grad_norm": 0.4471624712255033, "learning_rate": 8.669588729179895e-06, "loss": 0.0425, "step": 2827 }, { "epoch": 1.2560515212080836, "grad_norm": 0.6483988178781206, "learning_rate": 8.668271724901188e-06, "loss": 0.0667, "step": 2828 }, { "epoch": 1.256495669553631, "grad_norm": 0.7074590212609329, "learning_rate": 8.666954169218929e-06, "loss": 0.0644, "step": 2829 }, { "epoch": 1.2569398178991782, "grad_norm": 0.4799067719368653, "learning_rate": 8.665636062331166e-06, "loss": 0.0516, "step": 2830 }, { "epoch": 1.2573839662447257, "grad_norm": 0.4519642992082907, "learning_rate": 8.664317404436036e-06, "loss": 0.0388, "step": 2831 }, { "epoch": 1.2578281145902732, "grad_norm": 1.9317339882104076, "learning_rate": 8.662998195731755e-06, "loss": 0.0445, "step": 2832 }, { "epoch": 1.2582722629358205, "grad_norm": 0.9801917673057263, "learning_rate": 8.661678436416621e-06, "loss": 0.0609, "step": 2833 }, { "epoch": 1.258716411281368, "grad_norm": 0.4629508087617577, "learning_rate": 8.660358126689015e-06, "loss": 0.046, "step": 2834 }, { "epoch": 1.2591605596269153, "grad_norm": 0.55871143511484, "learning_rate": 8.659037266747405e-06, "loss": 0.0669, "step": 2835 }, { "epoch": 1.2596047079724628, "grad_norm": 0.4373848923476879, "learning_rate": 8.65771585679034e-06, "loss": 0.0365, "step": 2836 }, { "epoch": 1.2600488563180101, "grad_norm": 0.4620726535649706, "learning_rate": 8.656393897016446e-06, "loss": 0.0508, "step": 2837 }, { "epoch": 1.2604930046635576, "grad_norm": 0.7076034863902437, "learning_rate": 8.655071387624439e-06, "loss": 0.066, "step": 2838 }, { "epoch": 1.2609371530091051, "grad_norm": 0.7170929705949399, "learning_rate": 8.653748328813112e-06, "loss": 0.0663, "step": 2839 }, { "epoch": 1.2613813013546524, "grad_norm": 0.5278820360047365, "learning_rate": 8.652424720781346e-06, "loss": 0.0703, "step": 2840 }, { "epoch": 1.2618254497002, "grad_norm": 0.6581330490442558, "learning_rate": 8.6511005637281e-06, "loss": 0.0475, "step": 2841 }, { "epoch": 1.2622695980457472, "grad_norm": 0.5785660781607422, "learning_rate": 8.649775857852419e-06, "loss": 0.0539, "step": 2842 }, { "epoch": 1.2627137463912947, "grad_norm": 0.7193486412721348, "learning_rate": 8.648450603353427e-06, "loss": 0.0527, "step": 2843 }, { "epoch": 1.263157894736842, "grad_norm": 0.4879073594292654, "learning_rate": 8.647124800430332e-06, "loss": 0.0462, "step": 2844 }, { "epoch": 1.2636020430823895, "grad_norm": 0.6283549482710774, "learning_rate": 8.645798449282427e-06, "loss": 0.047, "step": 2845 }, { "epoch": 1.264046191427937, "grad_norm": 0.590026590546146, "learning_rate": 8.644471550109084e-06, "loss": 0.0417, "step": 2846 }, { "epoch": 1.2644903397734844, "grad_norm": 0.5727280121631517, "learning_rate": 8.643144103109757e-06, "loss": 0.0461, "step": 2847 }, { "epoch": 1.2649344881190316, "grad_norm": 0.5188962100480214, "learning_rate": 8.641816108483987e-06, "loss": 0.0528, "step": 2848 }, { "epoch": 1.2653786364645792, "grad_norm": 0.6762837917534186, "learning_rate": 8.64048756643139e-06, "loss": 0.0551, "step": 2849 }, { "epoch": 1.2658227848101267, "grad_norm": 0.6063186060719946, "learning_rate": 8.639158477151673e-06, "loss": 0.0529, "step": 2850 }, { "epoch": 1.266266933155674, "grad_norm": 0.8416294217857067, "learning_rate": 8.637828840844615e-06, "loss": 0.0605, "step": 2851 }, { "epoch": 1.2667110815012215, "grad_norm": 0.42460216144695934, "learning_rate": 8.636498657710091e-06, "loss": 0.0406, "step": 2852 }, { "epoch": 1.2671552298467688, "grad_norm": 0.45348809902081694, "learning_rate": 8.635167927948041e-06, "loss": 0.0423, "step": 2853 }, { "epoch": 1.2675993781923163, "grad_norm": 0.5925945869143667, "learning_rate": 8.633836651758502e-06, "loss": 0.0422, "step": 2854 }, { "epoch": 1.2680435265378636, "grad_norm": 0.474702929089884, "learning_rate": 8.632504829341588e-06, "loss": 0.0365, "step": 2855 }, { "epoch": 1.268487674883411, "grad_norm": 0.8276698250673459, "learning_rate": 8.63117246089749e-06, "loss": 0.0477, "step": 2856 }, { "epoch": 1.2689318232289586, "grad_norm": 0.4891929607601022, "learning_rate": 8.62983954662649e-06, "loss": 0.044, "step": 2857 }, { "epoch": 1.2693759715745059, "grad_norm": 0.6525774912230711, "learning_rate": 8.628506086728947e-06, "loss": 0.0651, "step": 2858 }, { "epoch": 1.2698201199200532, "grad_norm": 0.6594678233990844, "learning_rate": 8.6271720814053e-06, "loss": 0.0381, "step": 2859 }, { "epoch": 1.2702642682656007, "grad_norm": 0.3241646392485162, "learning_rate": 8.625837530856074e-06, "loss": 0.0309, "step": 2860 }, { "epoch": 1.2707084166111482, "grad_norm": 0.7122663241070569, "learning_rate": 8.624502435281875e-06, "loss": 0.0617, "step": 2861 }, { "epoch": 1.2711525649566955, "grad_norm": 0.7005320306757822, "learning_rate": 8.623166794883393e-06, "loss": 0.0581, "step": 2862 }, { "epoch": 1.271596713302243, "grad_norm": 0.5328754180731429, "learning_rate": 8.621830609861392e-06, "loss": 0.0493, "step": 2863 }, { "epoch": 1.2720408616477903, "grad_norm": 0.6269919214433372, "learning_rate": 8.620493880416727e-06, "loss": 0.0516, "step": 2864 }, { "epoch": 1.2724850099933378, "grad_norm": 0.5844430878482636, "learning_rate": 8.619156606750329e-06, "loss": 0.0725, "step": 2865 }, { "epoch": 1.272929158338885, "grad_norm": 0.4737556741912441, "learning_rate": 8.617818789063217e-06, "loss": 0.0421, "step": 2866 }, { "epoch": 1.2733733066844326, "grad_norm": 0.7524484026464038, "learning_rate": 8.616480427556484e-06, "loss": 0.0708, "step": 2867 }, { "epoch": 1.2738174550299801, "grad_norm": 0.8150405412731492, "learning_rate": 8.61514152243131e-06, "loss": 0.0704, "step": 2868 }, { "epoch": 1.2742616033755274, "grad_norm": 0.6289008429543912, "learning_rate": 8.613802073888953e-06, "loss": 0.0544, "step": 2869 }, { "epoch": 1.274705751721075, "grad_norm": 0.49267955313551814, "learning_rate": 8.612462082130758e-06, "loss": 0.0423, "step": 2870 }, { "epoch": 1.2751499000666222, "grad_norm": 0.8023016461317273, "learning_rate": 8.611121547358146e-06, "loss": 0.0581, "step": 2871 }, { "epoch": 1.2755940484121697, "grad_norm": 0.5971368904341509, "learning_rate": 8.609780469772623e-06, "loss": 0.0555, "step": 2872 }, { "epoch": 1.276038196757717, "grad_norm": 0.497140537686175, "learning_rate": 8.608438849575777e-06, "loss": 0.0511, "step": 2873 }, { "epoch": 1.2764823451032645, "grad_norm": 0.5696597646442083, "learning_rate": 8.607096686969274e-06, "loss": 0.0592, "step": 2874 }, { "epoch": 1.276926493448812, "grad_norm": 0.7494228951484337, "learning_rate": 8.605753982154865e-06, "loss": 0.0486, "step": 2875 }, { "epoch": 1.2773706417943593, "grad_norm": 0.5764143358267766, "learning_rate": 8.604410735334383e-06, "loss": 0.0551, "step": 2876 }, { "epoch": 1.2778147901399066, "grad_norm": 0.6011073566193142, "learning_rate": 8.603066946709739e-06, "loss": 0.0567, "step": 2877 }, { "epoch": 1.2782589384854541, "grad_norm": 0.44537454001664567, "learning_rate": 8.601722616482927e-06, "loss": 0.0473, "step": 2878 }, { "epoch": 1.2787030868310016, "grad_norm": 0.45558471841390763, "learning_rate": 8.600377744856024e-06, "loss": 0.0577, "step": 2879 }, { "epoch": 1.279147235176549, "grad_norm": 0.5192384143571988, "learning_rate": 8.599032332031185e-06, "loss": 0.0549, "step": 2880 }, { "epoch": 1.2795913835220964, "grad_norm": 0.641251450966757, "learning_rate": 8.59768637821065e-06, "loss": 0.0599, "step": 2881 }, { "epoch": 1.2800355318676437, "grad_norm": 0.5747913363539975, "learning_rate": 8.596339883596738e-06, "loss": 0.0546, "step": 2882 }, { "epoch": 1.2804796802131913, "grad_norm": 0.5416056259419014, "learning_rate": 8.594992848391852e-06, "loss": 0.0536, "step": 2883 }, { "epoch": 1.2809238285587385, "grad_norm": 0.6028308551659004, "learning_rate": 8.59364527279847e-06, "loss": 0.0699, "step": 2884 }, { "epoch": 1.281367976904286, "grad_norm": 0.4368044045668833, "learning_rate": 8.59229715701916e-06, "loss": 0.0396, "step": 2885 }, { "epoch": 1.2818121252498336, "grad_norm": 0.5104164604694876, "learning_rate": 8.590948501256564e-06, "loss": 0.04, "step": 2886 }, { "epoch": 1.2822562735953809, "grad_norm": 0.7612641229651903, "learning_rate": 8.58959930571341e-06, "loss": 0.0624, "step": 2887 }, { "epoch": 1.2827004219409281, "grad_norm": 0.5035114097597291, "learning_rate": 8.588249570592502e-06, "loss": 0.0437, "step": 2888 }, { "epoch": 1.2831445702864757, "grad_norm": 0.499768070561634, "learning_rate": 8.586899296096731e-06, "loss": 0.0378, "step": 2889 }, { "epoch": 1.2835887186320232, "grad_norm": 0.4925272487243439, "learning_rate": 8.585548482429064e-06, "loss": 0.0406, "step": 2890 }, { "epoch": 1.2840328669775705, "grad_norm": 0.5079455242566149, "learning_rate": 8.584197129792553e-06, "loss": 0.049, "step": 2891 }, { "epoch": 1.284477015323118, "grad_norm": 0.5999702910432165, "learning_rate": 8.58284523839033e-06, "loss": 0.0525, "step": 2892 }, { "epoch": 1.2849211636686653, "grad_norm": 0.5762429387843919, "learning_rate": 8.581492808425604e-06, "loss": 0.0412, "step": 2893 }, { "epoch": 1.2853653120142128, "grad_norm": 0.5170549640705353, "learning_rate": 8.58013984010167e-06, "loss": 0.0426, "step": 2894 }, { "epoch": 1.28580946035976, "grad_norm": 0.44644738551782165, "learning_rate": 8.578786333621902e-06, "loss": 0.0449, "step": 2895 }, { "epoch": 1.2862536087053076, "grad_norm": 0.49787403784527023, "learning_rate": 8.577432289189755e-06, "loss": 0.053, "step": 2896 }, { "epoch": 1.286697757050855, "grad_norm": 0.4201343576902461, "learning_rate": 8.576077707008766e-06, "loss": 0.0548, "step": 2897 }, { "epoch": 1.2871419053964024, "grad_norm": 0.43223595023539274, "learning_rate": 8.57472258728255e-06, "loss": 0.0414, "step": 2898 }, { "epoch": 1.2875860537419497, "grad_norm": 0.45140599595731223, "learning_rate": 8.573366930214807e-06, "loss": 0.0512, "step": 2899 }, { "epoch": 1.2880302020874972, "grad_norm": 0.7864322313017508, "learning_rate": 8.57201073600931e-06, "loss": 0.0611, "step": 2900 }, { "epoch": 1.2884743504330447, "grad_norm": 0.4533939072001686, "learning_rate": 8.570654004869924e-06, "loss": 0.0455, "step": 2901 }, { "epoch": 1.288918498778592, "grad_norm": 0.5822538472138225, "learning_rate": 8.569296737000586e-06, "loss": 0.0689, "step": 2902 }, { "epoch": 1.2893626471241395, "grad_norm": 0.6558805640817998, "learning_rate": 8.567938932605315e-06, "loss": 0.0716, "step": 2903 }, { "epoch": 1.2898067954696868, "grad_norm": 0.6962929137245736, "learning_rate": 8.566580591888216e-06, "loss": 0.0456, "step": 2904 }, { "epoch": 1.2902509438152343, "grad_norm": 0.5885420821356683, "learning_rate": 8.565221715053467e-06, "loss": 0.0519, "step": 2905 }, { "epoch": 1.2906950921607816, "grad_norm": 0.5367742959382944, "learning_rate": 8.563862302305333e-06, "loss": 0.0385, "step": 2906 }, { "epoch": 1.2911392405063291, "grad_norm": 0.7323581959894143, "learning_rate": 8.562502353848155e-06, "loss": 0.0659, "step": 2907 }, { "epoch": 1.2915833888518766, "grad_norm": 0.5051887556869058, "learning_rate": 8.561141869886356e-06, "loss": 0.0559, "step": 2908 }, { "epoch": 1.292027537197424, "grad_norm": 0.5491304399843213, "learning_rate": 8.55978085062444e-06, "loss": 0.0639, "step": 2909 }, { "epoch": 1.2924716855429714, "grad_norm": 0.8492250674519668, "learning_rate": 8.558419296266995e-06, "loss": 0.0636, "step": 2910 }, { "epoch": 1.2929158338885187, "grad_norm": 0.3882329610931263, "learning_rate": 8.557057207018681e-06, "loss": 0.0487, "step": 2911 }, { "epoch": 1.2933599822340662, "grad_norm": 0.625233318674613, "learning_rate": 8.555694583084244e-06, "loss": 0.0465, "step": 2912 }, { "epoch": 1.2938041305796135, "grad_norm": 0.4026058166802596, "learning_rate": 8.554331424668511e-06, "loss": 0.0354, "step": 2913 }, { "epoch": 1.294248278925161, "grad_norm": 0.5634837852049973, "learning_rate": 8.552967731976388e-06, "loss": 0.0541, "step": 2914 }, { "epoch": 1.2946924272707085, "grad_norm": 0.48751917705852116, "learning_rate": 8.551603505212862e-06, "loss": 0.0322, "step": 2915 }, { "epoch": 1.2951365756162558, "grad_norm": 0.5156212240116493, "learning_rate": 8.550238744582997e-06, "loss": 0.0489, "step": 2916 }, { "epoch": 1.2955807239618031, "grad_norm": 0.6223211171347353, "learning_rate": 8.548873450291939e-06, "loss": 0.0457, "step": 2917 }, { "epoch": 1.2960248723073506, "grad_norm": 0.47672895894969763, "learning_rate": 8.547507622544916e-06, "loss": 0.0463, "step": 2918 }, { "epoch": 1.2964690206528982, "grad_norm": 0.4421458887458943, "learning_rate": 8.546141261547238e-06, "loss": 0.045, "step": 2919 }, { "epoch": 1.2969131689984454, "grad_norm": 0.848623824582431, "learning_rate": 8.544774367504291e-06, "loss": 0.0605, "step": 2920 }, { "epoch": 1.297357317343993, "grad_norm": 0.554513822402812, "learning_rate": 8.54340694062154e-06, "loss": 0.0518, "step": 2921 }, { "epoch": 1.2978014656895402, "grad_norm": 0.5649747315400963, "learning_rate": 8.542038981104532e-06, "loss": 0.0565, "step": 2922 }, { "epoch": 1.2982456140350878, "grad_norm": 0.42021656791448186, "learning_rate": 8.540670489158899e-06, "loss": 0.0426, "step": 2923 }, { "epoch": 1.298689762380635, "grad_norm": 0.6402654468272049, "learning_rate": 8.539301464990345e-06, "loss": 0.0543, "step": 2924 }, { "epoch": 1.2991339107261826, "grad_norm": 0.3618130143994902, "learning_rate": 8.53793190880466e-06, "loss": 0.038, "step": 2925 }, { "epoch": 1.29957805907173, "grad_norm": 0.3384505689514398, "learning_rate": 8.536561820807707e-06, "loss": 0.0346, "step": 2926 }, { "epoch": 1.3000222074172774, "grad_norm": 0.46188012553966973, "learning_rate": 8.535191201205439e-06, "loss": 0.0514, "step": 2927 }, { "epoch": 1.3004663557628247, "grad_norm": 0.5300445785864025, "learning_rate": 8.533820050203881e-06, "loss": 0.0483, "step": 2928 }, { "epoch": 1.3009105041083722, "grad_norm": 0.5536409349327417, "learning_rate": 8.532448368009139e-06, "loss": 0.0489, "step": 2929 }, { "epoch": 1.3013546524539197, "grad_norm": 0.371658656980648, "learning_rate": 8.531076154827402e-06, "loss": 0.0445, "step": 2930 }, { "epoch": 1.301798800799467, "grad_norm": 0.5129545567634131, "learning_rate": 8.529703410864938e-06, "loss": 0.0341, "step": 2931 }, { "epoch": 1.3022429491450145, "grad_norm": 0.6790917670823785, "learning_rate": 8.52833013632809e-06, "loss": 0.0604, "step": 2932 }, { "epoch": 1.3026870974905618, "grad_norm": 0.4758905378740349, "learning_rate": 8.526956331423289e-06, "loss": 0.0458, "step": 2933 }, { "epoch": 1.3031312458361093, "grad_norm": 0.5428940197749453, "learning_rate": 8.525581996357036e-06, "loss": 0.047, "step": 2934 }, { "epoch": 1.3035753941816566, "grad_norm": 0.5546360540394489, "learning_rate": 8.52420713133592e-06, "loss": 0.0585, "step": 2935 }, { "epoch": 1.304019542527204, "grad_norm": 0.8256368560390267, "learning_rate": 8.522831736566607e-06, "loss": 0.0592, "step": 2936 }, { "epoch": 1.3044636908727516, "grad_norm": 0.6807172008845478, "learning_rate": 8.521455812255843e-06, "loss": 0.0727, "step": 2937 }, { "epoch": 1.304907839218299, "grad_norm": 0.5972239265610148, "learning_rate": 8.52007935861045e-06, "loss": 0.0531, "step": 2938 }, { "epoch": 1.3053519875638464, "grad_norm": 0.5756047785651232, "learning_rate": 8.518702375837335e-06, "loss": 0.0484, "step": 2939 }, { "epoch": 1.3057961359093937, "grad_norm": 0.5254079002689842, "learning_rate": 8.51732486414348e-06, "loss": 0.0551, "step": 2940 }, { "epoch": 1.3062402842549412, "grad_norm": 0.5057686654115676, "learning_rate": 8.515946823735948e-06, "loss": 0.0555, "step": 2941 }, { "epoch": 1.3066844326004885, "grad_norm": 0.5425819759814885, "learning_rate": 8.514568254821884e-06, "loss": 0.0461, "step": 2942 }, { "epoch": 1.307128580946036, "grad_norm": 0.4919842279881122, "learning_rate": 8.51318915760851e-06, "loss": 0.0351, "step": 2943 }, { "epoch": 1.3075727292915835, "grad_norm": 0.5651189893610779, "learning_rate": 8.511809532303126e-06, "loss": 0.0485, "step": 2944 }, { "epoch": 1.3080168776371308, "grad_norm": 0.5135664399251805, "learning_rate": 8.510429379113114e-06, "loss": 0.0345, "step": 2945 }, { "epoch": 1.308461025982678, "grad_norm": 0.698576525158577, "learning_rate": 8.509048698245934e-06, "loss": 0.0541, "step": 2946 }, { "epoch": 1.3089051743282256, "grad_norm": 0.5914910850001974, "learning_rate": 8.507667489909126e-06, "loss": 0.0422, "step": 2947 }, { "epoch": 1.3093493226737731, "grad_norm": 0.39464116666974663, "learning_rate": 8.506285754310311e-06, "loss": 0.0395, "step": 2948 }, { "epoch": 1.3097934710193204, "grad_norm": 0.41501372577846396, "learning_rate": 8.504903491657185e-06, "loss": 0.0392, "step": 2949 }, { "epoch": 1.310237619364868, "grad_norm": 0.5710008833310989, "learning_rate": 8.503520702157527e-06, "loss": 0.0486, "step": 2950 }, { "epoch": 1.3106817677104152, "grad_norm": 0.912615051335769, "learning_rate": 8.502137386019191e-06, "loss": 0.0636, "step": 2951 }, { "epoch": 1.3111259160559627, "grad_norm": 0.543540441011416, "learning_rate": 8.500753543450118e-06, "loss": 0.062, "step": 2952 }, { "epoch": 1.31157006440151, "grad_norm": 0.39754298101965646, "learning_rate": 8.499369174658318e-06, "loss": 0.0383, "step": 2953 }, { "epoch": 1.3120142127470575, "grad_norm": 0.5539227325166909, "learning_rate": 8.497984279851888e-06, "loss": 0.0499, "step": 2954 }, { "epoch": 1.312458361092605, "grad_norm": 0.599023615011808, "learning_rate": 8.496598859238997e-06, "loss": 0.0472, "step": 2955 }, { "epoch": 1.3129025094381523, "grad_norm": 0.4220558992968304, "learning_rate": 8.495212913027906e-06, "loss": 0.0446, "step": 2956 }, { "epoch": 1.3133466577836996, "grad_norm": 0.49593231969327095, "learning_rate": 8.493826441426937e-06, "loss": 0.0636, "step": 2957 }, { "epoch": 1.3137908061292471, "grad_norm": 0.5382980257808322, "learning_rate": 8.492439444644506e-06, "loss": 0.0462, "step": 2958 }, { "epoch": 1.3142349544747947, "grad_norm": 0.4925896725925621, "learning_rate": 8.4910519228891e-06, "loss": 0.0602, "step": 2959 }, { "epoch": 1.314679102820342, "grad_norm": 1.1877590532157762, "learning_rate": 8.489663876369288e-06, "loss": 0.0723, "step": 2960 }, { "epoch": 1.3151232511658895, "grad_norm": 0.6920312294903902, "learning_rate": 8.488275305293715e-06, "loss": 0.0531, "step": 2961 }, { "epoch": 1.3155673995114368, "grad_norm": 0.36761567736761985, "learning_rate": 8.486886209871108e-06, "loss": 0.0397, "step": 2962 }, { "epoch": 1.3160115478569843, "grad_norm": 0.55057942024177, "learning_rate": 8.485496590310274e-06, "loss": 0.0591, "step": 2963 }, { "epoch": 1.3164556962025316, "grad_norm": 0.5710852175463684, "learning_rate": 8.484106446820094e-06, "loss": 0.0471, "step": 2964 }, { "epoch": 1.316899844548079, "grad_norm": 0.5828688111433763, "learning_rate": 8.482715779609526e-06, "loss": 0.0551, "step": 2965 }, { "epoch": 1.3173439928936266, "grad_norm": 0.531963458189058, "learning_rate": 8.481324588887619e-06, "loss": 0.0504, "step": 2966 }, { "epoch": 1.3177881412391739, "grad_norm": 0.9382025539874739, "learning_rate": 8.47993287486349e-06, "loss": 0.0778, "step": 2967 }, { "epoch": 1.3182322895847212, "grad_norm": 0.5014993856992925, "learning_rate": 8.478540637746334e-06, "loss": 0.0635, "step": 2968 }, { "epoch": 1.3186764379302687, "grad_norm": 0.396172698958811, "learning_rate": 8.477147877745431e-06, "loss": 0.0378, "step": 2969 }, { "epoch": 1.3191205862758162, "grad_norm": 0.5285271119946755, "learning_rate": 8.475754595070134e-06, "loss": 0.0544, "step": 2970 }, { "epoch": 1.3195647346213635, "grad_norm": 0.4032933677589254, "learning_rate": 8.474360789929881e-06, "loss": 0.0431, "step": 2971 }, { "epoch": 1.320008882966911, "grad_norm": 0.45143103762565806, "learning_rate": 8.47296646253418e-06, "loss": 0.0509, "step": 2972 }, { "epoch": 1.3204530313124583, "grad_norm": 0.5252154450521924, "learning_rate": 8.471571613092626e-06, "loss": 0.0594, "step": 2973 }, { "epoch": 1.3208971796580058, "grad_norm": 0.34642633471428613, "learning_rate": 8.470176241814886e-06, "loss": 0.027, "step": 2974 }, { "epoch": 1.321341328003553, "grad_norm": 0.4784318360661084, "learning_rate": 8.46878034891071e-06, "loss": 0.0444, "step": 2975 }, { "epoch": 1.3217854763491006, "grad_norm": 0.580781905109618, "learning_rate": 8.467383934589923e-06, "loss": 0.043, "step": 2976 }, { "epoch": 1.322229624694648, "grad_norm": 0.46709485088977415, "learning_rate": 8.465986999062427e-06, "loss": 0.0485, "step": 2977 }, { "epoch": 1.3226737730401954, "grad_norm": 0.7439128060207424, "learning_rate": 8.464589542538213e-06, "loss": 0.0566, "step": 2978 }, { "epoch": 1.323117921385743, "grad_norm": 0.5532502253043584, "learning_rate": 8.463191565227336e-06, "loss": 0.0486, "step": 2979 }, { "epoch": 1.3235620697312902, "grad_norm": 0.46900654723685586, "learning_rate": 8.461793067339936e-06, "loss": 0.0384, "step": 2980 }, { "epoch": 1.3240062180768377, "grad_norm": 0.4715987465734639, "learning_rate": 8.460394049086232e-06, "loss": 0.0467, "step": 2981 }, { "epoch": 1.324450366422385, "grad_norm": 0.4464995773383881, "learning_rate": 8.458994510676523e-06, "loss": 0.0392, "step": 2982 }, { "epoch": 1.3248945147679325, "grad_norm": 0.5120257592364733, "learning_rate": 8.457594452321178e-06, "loss": 0.0468, "step": 2983 }, { "epoch": 1.32533866311348, "grad_norm": 0.5011205376177407, "learning_rate": 8.456193874230656e-06, "loss": 0.0479, "step": 2984 }, { "epoch": 1.3257828114590273, "grad_norm": 0.4942109449168538, "learning_rate": 8.454792776615482e-06, "loss": 0.0456, "step": 2985 }, { "epoch": 1.3262269598045746, "grad_norm": 0.5223005309985502, "learning_rate": 8.453391159686268e-06, "loss": 0.0578, "step": 2986 }, { "epoch": 1.3266711081501221, "grad_norm": 0.4751851347230112, "learning_rate": 8.4519890236537e-06, "loss": 0.0496, "step": 2987 }, { "epoch": 1.3271152564956696, "grad_norm": 0.46876537416517666, "learning_rate": 8.450586368728541e-06, "loss": 0.0556, "step": 2988 }, { "epoch": 1.327559404841217, "grad_norm": 0.5342196778194671, "learning_rate": 8.449183195121638e-06, "loss": 0.0462, "step": 2989 }, { "epoch": 1.3280035531867644, "grad_norm": 0.7891053866526658, "learning_rate": 8.447779503043907e-06, "loss": 0.0695, "step": 2990 }, { "epoch": 1.3284477015323117, "grad_norm": 0.4837197404803745, "learning_rate": 8.44637529270635e-06, "loss": 0.0416, "step": 2991 }, { "epoch": 1.3288918498778592, "grad_norm": 0.4822831093408252, "learning_rate": 8.444970564320044e-06, "loss": 0.0492, "step": 2992 }, { "epoch": 1.3293359982234065, "grad_norm": 1.1468745158193232, "learning_rate": 8.443565318096141e-06, "loss": 0.0829, "step": 2993 }, { "epoch": 1.329780146568954, "grad_norm": 0.3846877899730632, "learning_rate": 8.442159554245875e-06, "loss": 0.0383, "step": 2994 }, { "epoch": 1.3302242949145016, "grad_norm": 0.48249463397520415, "learning_rate": 8.440753272980555e-06, "loss": 0.0487, "step": 2995 }, { "epoch": 1.3306684432600488, "grad_norm": 0.5300221622862935, "learning_rate": 8.439346474511572e-06, "loss": 0.0506, "step": 2996 }, { "epoch": 1.3311125916055961, "grad_norm": 0.6183171953601485, "learning_rate": 8.437939159050388e-06, "loss": 0.063, "step": 2997 }, { "epoch": 1.3315567399511437, "grad_norm": 0.5988476945915722, "learning_rate": 8.43653132680855e-06, "loss": 0.0625, "step": 2998 }, { "epoch": 1.3320008882966912, "grad_norm": 0.5556515413344068, "learning_rate": 8.435122977997675e-06, "loss": 0.0491, "step": 2999 }, { "epoch": 1.3324450366422385, "grad_norm": 0.4154893093213718, "learning_rate": 8.433714112829464e-06, "loss": 0.0431, "step": 3000 }, { "epoch": 1.332889184987786, "grad_norm": 0.6020512275698489, "learning_rate": 8.432304731515695e-06, "loss": 0.0596, "step": 3001 }, { "epoch": 1.3333333333333333, "grad_norm": 0.41304989339620046, "learning_rate": 8.430894834268218e-06, "loss": 0.0436, "step": 3002 }, { "epoch": 1.3337774816788808, "grad_norm": 0.5204666086414845, "learning_rate": 8.429484421298968e-06, "loss": 0.0581, "step": 3003 }, { "epoch": 1.334221630024428, "grad_norm": 0.5429684889982698, "learning_rate": 8.428073492819953e-06, "loss": 0.0404, "step": 3004 }, { "epoch": 1.3346657783699756, "grad_norm": 0.4188357634818928, "learning_rate": 8.426662049043258e-06, "loss": 0.0422, "step": 3005 }, { "epoch": 1.335109926715523, "grad_norm": 0.40403921511813046, "learning_rate": 8.42525009018105e-06, "loss": 0.0376, "step": 3006 }, { "epoch": 1.3355540750610704, "grad_norm": 0.40629767747303225, "learning_rate": 8.423837616445568e-06, "loss": 0.0404, "step": 3007 }, { "epoch": 1.335998223406618, "grad_norm": 0.4159409808633564, "learning_rate": 8.42242462804913e-06, "loss": 0.0394, "step": 3008 }, { "epoch": 1.3364423717521652, "grad_norm": 0.7041981820609958, "learning_rate": 8.421011125204134e-06, "loss": 0.0693, "step": 3009 }, { "epoch": 1.3368865200977127, "grad_norm": 0.7951485702237924, "learning_rate": 8.419597108123054e-06, "loss": 0.0612, "step": 3010 }, { "epoch": 1.33733066844326, "grad_norm": 0.3990758518161739, "learning_rate": 8.418182577018438e-06, "loss": 0.0511, "step": 3011 }, { "epoch": 1.3377748167888075, "grad_norm": 0.9209585349240309, "learning_rate": 8.416767532102918e-06, "loss": 0.0416, "step": 3012 }, { "epoch": 1.338218965134355, "grad_norm": 0.5459049872521075, "learning_rate": 8.415351973589197e-06, "loss": 0.0434, "step": 3013 }, { "epoch": 1.3386631134799023, "grad_norm": 0.5057701190286418, "learning_rate": 8.413935901690057e-06, "loss": 0.0434, "step": 3014 }, { "epoch": 1.3391072618254496, "grad_norm": 0.48056478669043734, "learning_rate": 8.412519316618359e-06, "loss": 0.0502, "step": 3015 }, { "epoch": 1.339551410170997, "grad_norm": 0.45090544113428555, "learning_rate": 8.411102218587039e-06, "loss": 0.0419, "step": 3016 }, { "epoch": 1.3399955585165446, "grad_norm": 0.4066569281524885, "learning_rate": 8.40968460780911e-06, "loss": 0.0368, "step": 3017 }, { "epoch": 1.340439706862092, "grad_norm": 0.48568103858702, "learning_rate": 8.408266484497664e-06, "loss": 0.0416, "step": 3018 }, { "epoch": 1.3408838552076394, "grad_norm": 0.5224793245797686, "learning_rate": 8.406847848865871e-06, "loss": 0.0573, "step": 3019 }, { "epoch": 1.3413280035531867, "grad_norm": 0.5080673094422519, "learning_rate": 8.405428701126973e-06, "loss": 0.0496, "step": 3020 }, { "epoch": 1.3417721518987342, "grad_norm": 0.5515622788542163, "learning_rate": 8.404009041494292e-06, "loss": 0.054, "step": 3021 }, { "epoch": 1.3422163002442815, "grad_norm": 0.4992441781647522, "learning_rate": 8.40258887018123e-06, "loss": 0.0383, "step": 3022 }, { "epoch": 1.342660448589829, "grad_norm": 0.5853495583783108, "learning_rate": 8.40116818740126e-06, "loss": 0.0419, "step": 3023 }, { "epoch": 1.3431045969353765, "grad_norm": 0.47598229779455054, "learning_rate": 8.399746993367936e-06, "loss": 0.0342, "step": 3024 }, { "epoch": 1.3435487452809238, "grad_norm": 0.46492013802658844, "learning_rate": 8.398325288294886e-06, "loss": 0.043, "step": 3025 }, { "epoch": 1.3439928936264711, "grad_norm": 0.4287825450323698, "learning_rate": 8.396903072395819e-06, "loss": 0.0401, "step": 3026 }, { "epoch": 1.3444370419720186, "grad_norm": 0.4586017747854982, "learning_rate": 8.395480345884516e-06, "loss": 0.0431, "step": 3027 }, { "epoch": 1.3448811903175661, "grad_norm": 0.494703530250107, "learning_rate": 8.39405710897484e-06, "loss": 0.0462, "step": 3028 }, { "epoch": 1.3453253386631134, "grad_norm": 0.5340552133978543, "learning_rate": 8.392633361880724e-06, "loss": 0.0567, "step": 3029 }, { "epoch": 1.345769487008661, "grad_norm": 0.49912393023299867, "learning_rate": 8.391209104816183e-06, "loss": 0.0501, "step": 3030 }, { "epoch": 1.3462136353542082, "grad_norm": 0.6779311071781129, "learning_rate": 8.389784337995306e-06, "loss": 0.0682, "step": 3031 }, { "epoch": 1.3466577836997558, "grad_norm": 0.47717232493661027, "learning_rate": 8.388359061632262e-06, "loss": 0.043, "step": 3032 }, { "epoch": 1.347101932045303, "grad_norm": 0.47326694390163015, "learning_rate": 8.386933275941294e-06, "loss": 0.0456, "step": 3033 }, { "epoch": 1.3475460803908506, "grad_norm": 0.6662990359563103, "learning_rate": 8.385506981136717e-06, "loss": 0.0487, "step": 3034 }, { "epoch": 1.347990228736398, "grad_norm": 0.718683780503916, "learning_rate": 8.384080177432933e-06, "loss": 0.0394, "step": 3035 }, { "epoch": 1.3484343770819454, "grad_norm": 0.5289766475406907, "learning_rate": 8.382652865044414e-06, "loss": 0.0553, "step": 3036 }, { "epoch": 1.3488785254274926, "grad_norm": 0.6441199621383914, "learning_rate": 8.381225044185708e-06, "loss": 0.0522, "step": 3037 }, { "epoch": 1.3493226737730402, "grad_norm": 0.9166796511578611, "learning_rate": 8.37979671507144e-06, "loss": 0.0458, "step": 3038 }, { "epoch": 1.3497668221185877, "grad_norm": 0.531028146563615, "learning_rate": 8.378367877916313e-06, "loss": 0.0522, "step": 3039 }, { "epoch": 1.350210970464135, "grad_norm": 0.46742487240112696, "learning_rate": 8.376938532935106e-06, "loss": 0.0379, "step": 3040 }, { "epoch": 1.3506551188096825, "grad_norm": 0.5443821001104316, "learning_rate": 8.375508680342674e-06, "loss": 0.0439, "step": 3041 }, { "epoch": 1.3510992671552298, "grad_norm": 0.5727922217841197, "learning_rate": 8.374078320353944e-06, "loss": 0.0424, "step": 3042 }, { "epoch": 1.3515434155007773, "grad_norm": 0.49458279067850347, "learning_rate": 8.37264745318393e-06, "loss": 0.0506, "step": 3043 }, { "epoch": 1.3519875638463246, "grad_norm": 1.3831663223083857, "learning_rate": 8.371216079047713e-06, "loss": 0.0392, "step": 3044 }, { "epoch": 1.352431712191872, "grad_norm": 0.4619345954223979, "learning_rate": 8.369784198160451e-06, "loss": 0.0413, "step": 3045 }, { "epoch": 1.3528758605374196, "grad_norm": 0.4847101460239981, "learning_rate": 8.368351810737383e-06, "loss": 0.0389, "step": 3046 }, { "epoch": 1.3533200088829669, "grad_norm": 0.5592056103162423, "learning_rate": 8.366918916993817e-06, "loss": 0.0527, "step": 3047 }, { "epoch": 1.3537641572285144, "grad_norm": 0.3872649705998668, "learning_rate": 8.365485517145145e-06, "loss": 0.0324, "step": 3048 }, { "epoch": 1.3542083055740617, "grad_norm": 0.4866737656887245, "learning_rate": 8.364051611406829e-06, "loss": 0.0438, "step": 3049 }, { "epoch": 1.3546524539196092, "grad_norm": 0.969360380725499, "learning_rate": 8.362617199994413e-06, "loss": 0.0471, "step": 3050 }, { "epoch": 1.3550966022651565, "grad_norm": 0.582815842887394, "learning_rate": 8.36118228312351e-06, "loss": 0.042, "step": 3051 }, { "epoch": 1.355540750610704, "grad_norm": 0.721376890360811, "learning_rate": 8.359746861009812e-06, "loss": 0.0625, "step": 3052 }, { "epoch": 1.3559848989562515, "grad_norm": 0.6319029678683107, "learning_rate": 8.358310933869091e-06, "loss": 0.0645, "step": 3053 }, { "epoch": 1.3564290473017988, "grad_norm": 2.7049677243291828, "learning_rate": 8.356874501917188e-06, "loss": 0.0531, "step": 3054 }, { "epoch": 1.356873195647346, "grad_norm": 0.3779085536381152, "learning_rate": 8.355437565370022e-06, "loss": 0.0356, "step": 3055 }, { "epoch": 1.3573173439928936, "grad_norm": 0.7424758490934648, "learning_rate": 8.354000124443594e-06, "loss": 0.0524, "step": 3056 }, { "epoch": 1.3577614923384411, "grad_norm": 0.6071475225743952, "learning_rate": 8.352562179353971e-06, "loss": 0.0476, "step": 3057 }, { "epoch": 1.3582056406839884, "grad_norm": 0.4823292349978002, "learning_rate": 8.351123730317303e-06, "loss": 0.037, "step": 3058 }, { "epoch": 1.358649789029536, "grad_norm": 0.42940088227364875, "learning_rate": 8.349684777549813e-06, "loss": 0.032, "step": 3059 }, { "epoch": 1.3590939373750832, "grad_norm": 0.6291618703403149, "learning_rate": 8.348245321267798e-06, "loss": 0.0542, "step": 3060 }, { "epoch": 1.3595380857206307, "grad_norm": 0.46443218939790437, "learning_rate": 8.346805361687637e-06, "loss": 0.0482, "step": 3061 }, { "epoch": 1.359982234066178, "grad_norm": 0.44266144811788105, "learning_rate": 8.345364899025776e-06, "loss": 0.0508, "step": 3062 }, { "epoch": 1.3604263824117255, "grad_norm": 0.5931758192805282, "learning_rate": 8.343923933498742e-06, "loss": 0.052, "step": 3063 }, { "epoch": 1.360870530757273, "grad_norm": 0.4228622965921645, "learning_rate": 8.342482465323141e-06, "loss": 0.0342, "step": 3064 }, { "epoch": 1.3613146791028203, "grad_norm": 0.6573207468033229, "learning_rate": 8.341040494715644e-06, "loss": 0.054, "step": 3065 }, { "epoch": 1.3617588274483676, "grad_norm": 0.694757132531818, "learning_rate": 8.339598021893007e-06, "loss": 0.0573, "step": 3066 }, { "epoch": 1.3622029757939151, "grad_norm": 0.5168331449919007, "learning_rate": 8.338155047072058e-06, "loss": 0.0476, "step": 3067 }, { "epoch": 1.3626471241394627, "grad_norm": 0.5294718006665416, "learning_rate": 8.336711570469698e-06, "loss": 0.0536, "step": 3068 }, { "epoch": 1.36309127248501, "grad_norm": 0.5902958008882144, "learning_rate": 8.33526759230291e-06, "loss": 0.0543, "step": 3069 }, { "epoch": 1.3635354208305575, "grad_norm": 0.5123768514504031, "learning_rate": 8.333823112788747e-06, "loss": 0.055, "step": 3070 }, { "epoch": 1.3639795691761047, "grad_norm": 0.4051444968388966, "learning_rate": 8.332378132144336e-06, "loss": 0.0408, "step": 3071 }, { "epoch": 1.3644237175216523, "grad_norm": 0.45045135771966455, "learning_rate": 8.330932650586887e-06, "loss": 0.0449, "step": 3072 }, { "epoch": 1.3648678658671995, "grad_norm": 0.6922855447849937, "learning_rate": 8.329486668333677e-06, "loss": 0.052, "step": 3073 }, { "epoch": 1.365312014212747, "grad_norm": 0.3651988449178566, "learning_rate": 8.328040185602063e-06, "loss": 0.0383, "step": 3074 }, { "epoch": 1.3657561625582946, "grad_norm": 0.6637690048102727, "learning_rate": 8.326593202609475e-06, "loss": 0.0571, "step": 3075 }, { "epoch": 1.3662003109038419, "grad_norm": 0.5428561517455449, "learning_rate": 8.325145719573419e-06, "loss": 0.0384, "step": 3076 }, { "epoch": 1.3666444592493894, "grad_norm": 0.7398928249310872, "learning_rate": 8.323697736711478e-06, "loss": 0.0535, "step": 3077 }, { "epoch": 1.3670886075949367, "grad_norm": 0.6968067508458905, "learning_rate": 8.322249254241309e-06, "loss": 0.0624, "step": 3078 }, { "epoch": 1.3675327559404842, "grad_norm": 0.42153823690397146, "learning_rate": 8.320800272380639e-06, "loss": 0.036, "step": 3079 }, { "epoch": 1.3679769042860315, "grad_norm": 0.7070448290401131, "learning_rate": 8.319350791347279e-06, "loss": 0.0682, "step": 3080 }, { "epoch": 1.368421052631579, "grad_norm": 0.5028115971086855, "learning_rate": 8.31790081135911e-06, "loss": 0.0543, "step": 3081 }, { "epoch": 1.3688652009771265, "grad_norm": 0.5307360164451863, "learning_rate": 8.316450332634084e-06, "loss": 0.0535, "step": 3082 }, { "epoch": 1.3693093493226738, "grad_norm": 0.6501199496749419, "learning_rate": 8.31499935539024e-06, "loss": 0.0461, "step": 3083 }, { "epoch": 1.369753497668221, "grad_norm": 0.5797446911962002, "learning_rate": 8.313547879845682e-06, "loss": 0.0472, "step": 3084 }, { "epoch": 1.3701976460137686, "grad_norm": 0.4268924633424178, "learning_rate": 8.312095906218588e-06, "loss": 0.0447, "step": 3085 }, { "epoch": 1.370641794359316, "grad_norm": 0.6304839984425172, "learning_rate": 8.310643434727216e-06, "loss": 0.0625, "step": 3086 }, { "epoch": 1.3710859427048634, "grad_norm": 0.5665653851714997, "learning_rate": 8.3091904655899e-06, "loss": 0.0539, "step": 3087 }, { "epoch": 1.371530091050411, "grad_norm": 0.40002359226927725, "learning_rate": 8.307736999025043e-06, "loss": 0.0329, "step": 3088 }, { "epoch": 1.3719742393959582, "grad_norm": 0.7513234692327438, "learning_rate": 8.306283035251125e-06, "loss": 0.0648, "step": 3089 }, { "epoch": 1.3724183877415057, "grad_norm": 0.5469597650300881, "learning_rate": 8.304828574486704e-06, "loss": 0.0432, "step": 3090 }, { "epoch": 1.372862536087053, "grad_norm": 0.48401783163541084, "learning_rate": 8.303373616950408e-06, "loss": 0.0457, "step": 3091 }, { "epoch": 1.3733066844326005, "grad_norm": 0.4692359706390245, "learning_rate": 8.301918162860944e-06, "loss": 0.0422, "step": 3092 }, { "epoch": 1.373750832778148, "grad_norm": 0.5378714608611251, "learning_rate": 8.30046221243709e-06, "loss": 0.0513, "step": 3093 }, { "epoch": 1.3741949811236953, "grad_norm": 0.5625724873607021, "learning_rate": 8.2990057658977e-06, "loss": 0.0532, "step": 3094 }, { "epoch": 1.3746391294692426, "grad_norm": 0.4899279935109507, "learning_rate": 8.297548823461704e-06, "loss": 0.0454, "step": 3095 }, { "epoch": 1.3750832778147901, "grad_norm": 0.7726706942428412, "learning_rate": 8.296091385348104e-06, "loss": 0.0697, "step": 3096 }, { "epoch": 1.3755274261603376, "grad_norm": 0.5012946731170276, "learning_rate": 8.294633451775977e-06, "loss": 0.0378, "step": 3097 }, { "epoch": 1.375971574505885, "grad_norm": 0.48151498654702213, "learning_rate": 8.293175022964476e-06, "loss": 0.0434, "step": 3098 }, { "epoch": 1.3764157228514324, "grad_norm": 0.4130737267146749, "learning_rate": 8.291716099132829e-06, "loss": 0.0359, "step": 3099 }, { "epoch": 1.3768598711969797, "grad_norm": 0.4681903495603353, "learning_rate": 8.290256680500336e-06, "loss": 0.0398, "step": 3100 }, { "epoch": 1.3773040195425272, "grad_norm": 0.6272554974390412, "learning_rate": 8.28879676728637e-06, "loss": 0.0627, "step": 3101 }, { "epoch": 1.3777481678880745, "grad_norm": 0.5051341999455976, "learning_rate": 8.287336359710386e-06, "loss": 0.047, "step": 3102 }, { "epoch": 1.378192316233622, "grad_norm": 0.5262863424826024, "learning_rate": 8.285875457991903e-06, "loss": 0.047, "step": 3103 }, { "epoch": 1.3786364645791696, "grad_norm": 0.5493114786417674, "learning_rate": 8.284414062350524e-06, "loss": 0.0477, "step": 3104 }, { "epoch": 1.3790806129247168, "grad_norm": 0.529779249166117, "learning_rate": 8.282952173005916e-06, "loss": 0.0489, "step": 3105 }, { "epoch": 1.3795247612702641, "grad_norm": 0.5169327523467377, "learning_rate": 8.28148979017783e-06, "loss": 0.0568, "step": 3106 }, { "epoch": 1.3799689096158116, "grad_norm": 0.6639124677133846, "learning_rate": 8.280026914086086e-06, "loss": 0.0831, "step": 3107 }, { "epoch": 1.3804130579613592, "grad_norm": 0.6604146215653776, "learning_rate": 8.278563544950579e-06, "loss": 0.0688, "step": 3108 }, { "epoch": 1.3808572063069064, "grad_norm": 0.499500034008347, "learning_rate": 8.277099682991276e-06, "loss": 0.0466, "step": 3109 }, { "epoch": 1.381301354652454, "grad_norm": 0.5463437639449784, "learning_rate": 8.275635328428226e-06, "loss": 0.0576, "step": 3110 }, { "epoch": 1.3817455029980013, "grad_norm": 0.43757681367045054, "learning_rate": 8.274170481481541e-06, "loss": 0.0348, "step": 3111 }, { "epoch": 1.3821896513435488, "grad_norm": 0.5431219597419497, "learning_rate": 8.272705142371414e-06, "loss": 0.0592, "step": 3112 }, { "epoch": 1.382633799689096, "grad_norm": 0.6091624737010345, "learning_rate": 8.271239311318111e-06, "loss": 0.0645, "step": 3113 }, { "epoch": 1.3830779480346436, "grad_norm": 0.46664330067036014, "learning_rate": 8.269772988541971e-06, "loss": 0.0293, "step": 3114 }, { "epoch": 1.383522096380191, "grad_norm": 0.39018350334607316, "learning_rate": 8.268306174263407e-06, "loss": 0.0488, "step": 3115 }, { "epoch": 1.3839662447257384, "grad_norm": 0.7360437552444796, "learning_rate": 8.266838868702904e-06, "loss": 0.0539, "step": 3116 }, { "epoch": 1.3844103930712859, "grad_norm": 0.3523843914894874, "learning_rate": 8.265371072081028e-06, "loss": 0.033, "step": 3117 }, { "epoch": 1.3848545414168332, "grad_norm": 0.5841378028235703, "learning_rate": 8.263902784618409e-06, "loss": 0.0512, "step": 3118 }, { "epoch": 1.3852986897623807, "grad_norm": 0.6604716728640524, "learning_rate": 8.262434006535759e-06, "loss": 0.0841, "step": 3119 }, { "epoch": 1.385742838107928, "grad_norm": 0.43203367132780174, "learning_rate": 8.260964738053859e-06, "loss": 0.0474, "step": 3120 }, { "epoch": 1.3861869864534755, "grad_norm": 0.48844306169229995, "learning_rate": 8.259494979393563e-06, "loss": 0.0481, "step": 3121 }, { "epoch": 1.386631134799023, "grad_norm": 0.3906618714847538, "learning_rate": 8.258024730775805e-06, "loss": 0.0398, "step": 3122 }, { "epoch": 1.3870752831445703, "grad_norm": 0.47997636522094667, "learning_rate": 8.256553992421583e-06, "loss": 0.0554, "step": 3123 }, { "epoch": 1.3875194314901176, "grad_norm": 0.6434789607226818, "learning_rate": 8.255082764551978e-06, "loss": 0.0484, "step": 3124 }, { "epoch": 1.387963579835665, "grad_norm": 0.3887987489990569, "learning_rate": 8.25361104738814e-06, "loss": 0.0464, "step": 3125 }, { "epoch": 1.3884077281812126, "grad_norm": 0.487796373914711, "learning_rate": 8.252138841151292e-06, "loss": 0.0441, "step": 3126 }, { "epoch": 1.38885187652676, "grad_norm": 0.6398270077190703, "learning_rate": 8.250666146062732e-06, "loss": 0.0419, "step": 3127 }, { "epoch": 1.3892960248723074, "grad_norm": 0.4422350214805325, "learning_rate": 8.249192962343829e-06, "loss": 0.041, "step": 3128 }, { "epoch": 1.3897401732178547, "grad_norm": 0.5252093775770235, "learning_rate": 8.247719290216032e-06, "loss": 0.0479, "step": 3129 }, { "epoch": 1.3901843215634022, "grad_norm": 0.6735609599734196, "learning_rate": 8.246245129900856e-06, "loss": 0.038, "step": 3130 }, { "epoch": 1.3906284699089495, "grad_norm": 0.5147679810299035, "learning_rate": 8.244770481619892e-06, "loss": 0.05, "step": 3131 }, { "epoch": 1.391072618254497, "grad_norm": 0.4260669615511085, "learning_rate": 8.243295345594807e-06, "loss": 0.0351, "step": 3132 }, { "epoch": 1.3915167666000445, "grad_norm": 0.5948196620222996, "learning_rate": 8.241819722047337e-06, "loss": 0.0532, "step": 3133 }, { "epoch": 1.3919609149455918, "grad_norm": 0.4954498969782017, "learning_rate": 8.240343611199294e-06, "loss": 0.0395, "step": 3134 }, { "epoch": 1.3924050632911391, "grad_norm": 0.5748384966095534, "learning_rate": 8.238867013272562e-06, "loss": 0.051, "step": 3135 }, { "epoch": 1.3928492116366866, "grad_norm": 0.47868756581223826, "learning_rate": 8.237389928489099e-06, "loss": 0.0406, "step": 3136 }, { "epoch": 1.3932933599822341, "grad_norm": 0.42581407271928307, "learning_rate": 8.235912357070938e-06, "loss": 0.0337, "step": 3137 }, { "epoch": 1.3937375083277814, "grad_norm": 0.6781537407713069, "learning_rate": 8.234434299240179e-06, "loss": 0.0483, "step": 3138 }, { "epoch": 1.394181656673329, "grad_norm": 0.598623626498893, "learning_rate": 8.232955755219002e-06, "loss": 0.0498, "step": 3139 }, { "epoch": 1.3946258050188762, "grad_norm": 0.6505669269254222, "learning_rate": 8.231476725229659e-06, "loss": 0.0391, "step": 3140 }, { "epoch": 1.3950699533644237, "grad_norm": 0.5150042095978116, "learning_rate": 8.229997209494468e-06, "loss": 0.0538, "step": 3141 }, { "epoch": 1.395514101709971, "grad_norm": 0.48445621459488397, "learning_rate": 8.228517208235829e-06, "loss": 0.0485, "step": 3142 }, { "epoch": 1.3959582500555185, "grad_norm": 0.42479311306640655, "learning_rate": 8.22703672167621e-06, "loss": 0.0454, "step": 3143 }, { "epoch": 1.396402398401066, "grad_norm": 0.4968133842399203, "learning_rate": 8.225555750038157e-06, "loss": 0.0548, "step": 3144 }, { "epoch": 1.3968465467466133, "grad_norm": 0.839408513781402, "learning_rate": 8.22407429354428e-06, "loss": 0.0508, "step": 3145 }, { "epoch": 1.3972906950921609, "grad_norm": 0.4593510375216177, "learning_rate": 8.222592352417268e-06, "loss": 0.0484, "step": 3146 }, { "epoch": 1.3977348434377082, "grad_norm": 0.41788586732862426, "learning_rate": 8.221109926879885e-06, "loss": 0.0394, "step": 3147 }, { "epoch": 1.3981789917832557, "grad_norm": 0.6011495923589089, "learning_rate": 8.219627017154962e-06, "loss": 0.0538, "step": 3148 }, { "epoch": 1.398623140128803, "grad_norm": 0.483716139104255, "learning_rate": 8.218143623465407e-06, "loss": 0.0479, "step": 3149 }, { "epoch": 1.3990672884743505, "grad_norm": 0.7567570451553496, "learning_rate": 8.216659746034199e-06, "loss": 0.059, "step": 3150 }, { "epoch": 1.399511436819898, "grad_norm": 0.48553150567848463, "learning_rate": 8.215175385084389e-06, "loss": 0.0617, "step": 3151 }, { "epoch": 1.3999555851654453, "grad_norm": 0.47459878094993846, "learning_rate": 8.2136905408391e-06, "loss": 0.0527, "step": 3152 }, { "epoch": 1.4003997335109926, "grad_norm": 0.5833223937440162, "learning_rate": 8.212205213521535e-06, "loss": 0.0626, "step": 3153 }, { "epoch": 1.40084388185654, "grad_norm": 0.35626979864365443, "learning_rate": 8.210719403354961e-06, "loss": 0.033, "step": 3154 }, { "epoch": 1.4012880302020876, "grad_norm": 0.7430849859689593, "learning_rate": 8.209233110562719e-06, "loss": 0.0645, "step": 3155 }, { "epoch": 1.4017321785476349, "grad_norm": 0.45738357167910687, "learning_rate": 8.207746335368223e-06, "loss": 0.0378, "step": 3156 }, { "epoch": 1.4021763268931824, "grad_norm": 0.5265228817791134, "learning_rate": 8.206259077994966e-06, "loss": 0.0653, "step": 3157 }, { "epoch": 1.4026204752387297, "grad_norm": 0.5945877517368485, "learning_rate": 8.204771338666504e-06, "loss": 0.0517, "step": 3158 }, { "epoch": 1.4030646235842772, "grad_norm": 0.48633704182422527, "learning_rate": 8.20328311760647e-06, "loss": 0.0467, "step": 3159 }, { "epoch": 1.4035087719298245, "grad_norm": 0.45125218858977967, "learning_rate": 8.201794415038569e-06, "loss": 0.0436, "step": 3160 }, { "epoch": 1.403952920275372, "grad_norm": 0.454560780546378, "learning_rate": 8.200305231186578e-06, "loss": 0.0566, "step": 3161 }, { "epoch": 1.4043970686209195, "grad_norm": 0.4442970232995022, "learning_rate": 8.198815566274346e-06, "loss": 0.0343, "step": 3162 }, { "epoch": 1.4048412169664668, "grad_norm": 0.4600734220829904, "learning_rate": 8.197325420525797e-06, "loss": 0.0471, "step": 3163 }, { "epoch": 1.405285365312014, "grad_norm": 0.46601308549255893, "learning_rate": 8.195834794164925e-06, "loss": 0.0427, "step": 3164 }, { "epoch": 1.4057295136575616, "grad_norm": 0.5996621159738204, "learning_rate": 8.194343687415795e-06, "loss": 0.0457, "step": 3165 }, { "epoch": 1.4061736620031091, "grad_norm": 0.9069132714321548, "learning_rate": 8.192852100502547e-06, "loss": 0.0678, "step": 3166 }, { "epoch": 1.4066178103486564, "grad_norm": 0.43166301548877434, "learning_rate": 8.191360033649392e-06, "loss": 0.0352, "step": 3167 }, { "epoch": 1.407061958694204, "grad_norm": 0.5216023574393313, "learning_rate": 8.18986748708061e-06, "loss": 0.0475, "step": 3168 }, { "epoch": 1.4075061070397512, "grad_norm": 0.5468879202276986, "learning_rate": 8.18837446102056e-06, "loss": 0.0547, "step": 3169 }, { "epoch": 1.4079502553852987, "grad_norm": 0.4122751122216154, "learning_rate": 8.186880955693667e-06, "loss": 0.0357, "step": 3170 }, { "epoch": 1.408394403730846, "grad_norm": 0.49605981026080986, "learning_rate": 8.18538697132443e-06, "loss": 0.0593, "step": 3171 }, { "epoch": 1.4088385520763935, "grad_norm": 0.7328755570855009, "learning_rate": 8.183892508137423e-06, "loss": 0.0691, "step": 3172 }, { "epoch": 1.409282700421941, "grad_norm": 0.5835998191367716, "learning_rate": 8.182397566357286e-06, "loss": 0.0621, "step": 3173 }, { "epoch": 1.4097268487674883, "grad_norm": 0.45687160374129326, "learning_rate": 8.180902146208734e-06, "loss": 0.0529, "step": 3174 }, { "epoch": 1.4101709971130356, "grad_norm": 0.4464006299057523, "learning_rate": 8.179406247916555e-06, "loss": 0.0462, "step": 3175 }, { "epoch": 1.4106151454585831, "grad_norm": 0.5104992313269215, "learning_rate": 8.17790987170561e-06, "loss": 0.0397, "step": 3176 }, { "epoch": 1.4110592938041306, "grad_norm": 0.4500241032324194, "learning_rate": 8.176413017800828e-06, "loss": 0.0588, "step": 3177 }, { "epoch": 1.411503442149678, "grad_norm": 0.9698569462217295, "learning_rate": 8.174915686427211e-06, "loss": 0.0593, "step": 3178 }, { "epoch": 1.4119475904952254, "grad_norm": 1.9467722797227018, "learning_rate": 8.173417877809835e-06, "loss": 0.0657, "step": 3179 }, { "epoch": 1.4123917388407727, "grad_norm": 0.8619854581857347, "learning_rate": 8.171919592173843e-06, "loss": 0.0616, "step": 3180 }, { "epoch": 1.4128358871863202, "grad_norm": 2.1957744086497324, "learning_rate": 8.170420829744458e-06, "loss": 0.0441, "step": 3181 }, { "epoch": 1.4132800355318675, "grad_norm": 0.437679909069633, "learning_rate": 8.168921590746964e-06, "loss": 0.0448, "step": 3182 }, { "epoch": 1.413724183877415, "grad_norm": 0.44545740554831986, "learning_rate": 8.167421875406725e-06, "loss": 0.0534, "step": 3183 }, { "epoch": 1.4141683322229626, "grad_norm": 0.6661022181292509, "learning_rate": 8.165921683949172e-06, "loss": 0.0752, "step": 3184 }, { "epoch": 1.4146124805685099, "grad_norm": 0.5379954758436954, "learning_rate": 8.164421016599811e-06, "loss": 0.0465, "step": 3185 }, { "epoch": 1.4150566289140574, "grad_norm": 0.7649578135185098, "learning_rate": 8.162919873584216e-06, "loss": 0.0655, "step": 3186 }, { "epoch": 1.4155007772596047, "grad_norm": 0.6155469523069211, "learning_rate": 8.161418255128037e-06, "loss": 0.058, "step": 3187 }, { "epoch": 1.4159449256051522, "grad_norm": 0.43980966134894045, "learning_rate": 8.15991616145699e-06, "loss": 0.046, "step": 3188 }, { "epoch": 1.4163890739506995, "grad_norm": 0.3772085957077231, "learning_rate": 8.158413592796867e-06, "loss": 0.0407, "step": 3189 }, { "epoch": 1.416833222296247, "grad_norm": 0.4916035633372075, "learning_rate": 8.156910549373529e-06, "loss": 0.0509, "step": 3190 }, { "epoch": 1.4172773706417945, "grad_norm": 0.44586244165360156, "learning_rate": 8.15540703141291e-06, "loss": 0.0421, "step": 3191 }, { "epoch": 1.4177215189873418, "grad_norm": 0.6752971156056389, "learning_rate": 8.153903039141011e-06, "loss": 0.0394, "step": 3192 }, { "epoch": 1.418165667332889, "grad_norm": 0.5021744951340577, "learning_rate": 8.15239857278391e-06, "loss": 0.0446, "step": 3193 }, { "epoch": 1.4186098156784366, "grad_norm": 0.4057752198360003, "learning_rate": 8.150893632567755e-06, "loss": 0.0413, "step": 3194 }, { "epoch": 1.419053964023984, "grad_norm": 0.44646591292416776, "learning_rate": 8.149388218718763e-06, "loss": 0.033, "step": 3195 }, { "epoch": 1.4194981123695314, "grad_norm": 0.5430849460817241, "learning_rate": 8.147882331463221e-06, "loss": 0.0476, "step": 3196 }, { "epoch": 1.419942260715079, "grad_norm": 0.5748975413489712, "learning_rate": 8.146375971027492e-06, "loss": 0.0422, "step": 3197 }, { "epoch": 1.4203864090606262, "grad_norm": 0.8569726405019049, "learning_rate": 8.144869137638008e-06, "loss": 0.0586, "step": 3198 }, { "epoch": 1.4208305574061737, "grad_norm": 0.48111798749968265, "learning_rate": 8.14336183152127e-06, "loss": 0.0511, "step": 3199 }, { "epoch": 1.421274705751721, "grad_norm": 0.4447988259403195, "learning_rate": 8.141854052903853e-06, "loss": 0.0409, "step": 3200 }, { "epoch": 1.4217188540972685, "grad_norm": 0.42318494315184574, "learning_rate": 8.1403458020124e-06, "loss": 0.0393, "step": 3201 }, { "epoch": 1.422163002442816, "grad_norm": 0.527946330855177, "learning_rate": 8.138837079073628e-06, "loss": 0.0544, "step": 3202 }, { "epoch": 1.4226071507883633, "grad_norm": 0.5168740647207917, "learning_rate": 8.137327884314323e-06, "loss": 0.042, "step": 3203 }, { "epoch": 1.4230512991339106, "grad_norm": 0.44556932550096595, "learning_rate": 8.135818217961344e-06, "loss": 0.043, "step": 3204 }, { "epoch": 1.423495447479458, "grad_norm": 0.4521168782601511, "learning_rate": 8.13430808024162e-06, "loss": 0.0616, "step": 3205 }, { "epoch": 1.4239395958250056, "grad_norm": 0.998651913471337, "learning_rate": 8.132797471382148e-06, "loss": 0.0857, "step": 3206 }, { "epoch": 1.424383744170553, "grad_norm": 0.4845757133637773, "learning_rate": 8.131286391609996e-06, "loss": 0.0316, "step": 3207 }, { "epoch": 1.4248278925161004, "grad_norm": 0.5115534012315782, "learning_rate": 8.129774841152311e-06, "loss": 0.0442, "step": 3208 }, { "epoch": 1.4252720408616477, "grad_norm": 0.4658784277879884, "learning_rate": 8.128262820236302e-06, "loss": 0.0392, "step": 3209 }, { "epoch": 1.4257161892071952, "grad_norm": 0.7323138806429754, "learning_rate": 8.12675032908925e-06, "loss": 0.0476, "step": 3210 }, { "epoch": 1.4261603375527425, "grad_norm": 0.7544916173168683, "learning_rate": 8.125237367938511e-06, "loss": 0.0539, "step": 3211 }, { "epoch": 1.42660448589829, "grad_norm": 0.62197595498055, "learning_rate": 8.123723937011507e-06, "loss": 0.0526, "step": 3212 }, { "epoch": 1.4270486342438375, "grad_norm": 0.411592343530701, "learning_rate": 8.12221003653573e-06, "loss": 0.0422, "step": 3213 }, { "epoch": 1.4274927825893848, "grad_norm": 0.5467878759617523, "learning_rate": 8.12069566673875e-06, "loss": 0.0486, "step": 3214 }, { "epoch": 1.4279369309349323, "grad_norm": 0.3951319611407401, "learning_rate": 8.119180827848199e-06, "loss": 0.0372, "step": 3215 }, { "epoch": 1.4283810792804796, "grad_norm": 0.5355923646902133, "learning_rate": 8.117665520091783e-06, "loss": 0.0583, "step": 3216 }, { "epoch": 1.4288252276260272, "grad_norm": 0.49870365746589324, "learning_rate": 8.11614974369728e-06, "loss": 0.0521, "step": 3217 }, { "epoch": 1.4292693759715744, "grad_norm": 0.5463670187827259, "learning_rate": 8.114633498892537e-06, "loss": 0.0567, "step": 3218 }, { "epoch": 1.429713524317122, "grad_norm": 0.5699638456777965, "learning_rate": 8.11311678590547e-06, "loss": 0.05, "step": 3219 }, { "epoch": 1.4301576726626695, "grad_norm": 0.3789965811347938, "learning_rate": 8.11159960496407e-06, "loss": 0.0405, "step": 3220 }, { "epoch": 1.4306018210082168, "grad_norm": 0.4496180621274656, "learning_rate": 8.11008195629639e-06, "loss": 0.0414, "step": 3221 }, { "epoch": 1.431045969353764, "grad_norm": 0.6660820533473273, "learning_rate": 8.10856384013056e-06, "loss": 0.0519, "step": 3222 }, { "epoch": 1.4314901176993116, "grad_norm": 0.42475615332155653, "learning_rate": 8.107045256694782e-06, "loss": 0.0507, "step": 3223 }, { "epoch": 1.431934266044859, "grad_norm": 0.531367471068279, "learning_rate": 8.105526206217322e-06, "loss": 0.0438, "step": 3224 }, { "epoch": 1.4323784143904064, "grad_norm": 0.6173452877900699, "learning_rate": 8.104006688926518e-06, "loss": 0.044, "step": 3225 }, { "epoch": 1.4328225627359539, "grad_norm": 0.5047091534195701, "learning_rate": 8.102486705050782e-06, "loss": 0.0448, "step": 3226 }, { "epoch": 1.4332667110815012, "grad_norm": 0.756671059089655, "learning_rate": 8.100966254818591e-06, "loss": 0.0472, "step": 3227 }, { "epoch": 1.4337108594270487, "grad_norm": 0.6679241833390342, "learning_rate": 8.099445338458496e-06, "loss": 0.0428, "step": 3228 }, { "epoch": 1.434155007772596, "grad_norm": 0.40517368896131173, "learning_rate": 8.097923956199118e-06, "loss": 0.0366, "step": 3229 }, { "epoch": 1.4345991561181435, "grad_norm": 0.4490587246876595, "learning_rate": 8.096402108269144e-06, "loss": 0.0356, "step": 3230 }, { "epoch": 1.435043304463691, "grad_norm": 0.5508426411635136, "learning_rate": 8.094879794897333e-06, "loss": 0.0499, "step": 3231 }, { "epoch": 1.4354874528092383, "grad_norm": 0.39803990157367924, "learning_rate": 8.093357016312518e-06, "loss": 0.039, "step": 3232 }, { "epoch": 1.4359316011547856, "grad_norm": 0.3911481714996281, "learning_rate": 8.091833772743595e-06, "loss": 0.0408, "step": 3233 }, { "epoch": 1.436375749500333, "grad_norm": 0.43202710834535546, "learning_rate": 8.090310064419536e-06, "loss": 0.0517, "step": 3234 }, { "epoch": 1.4368198978458806, "grad_norm": 0.63394524089963, "learning_rate": 8.088785891569379e-06, "loss": 0.0498, "step": 3235 }, { "epoch": 1.437264046191428, "grad_norm": 0.6014431551871622, "learning_rate": 8.087261254422232e-06, "loss": 0.0553, "step": 3236 }, { "epoch": 1.4377081945369754, "grad_norm": 0.6136207381555616, "learning_rate": 8.085736153207277e-06, "loss": 0.0594, "step": 3237 }, { "epoch": 1.4381523428825227, "grad_norm": 0.5952470946539382, "learning_rate": 8.08421058815376e-06, "loss": 0.0769, "step": 3238 }, { "epoch": 1.4385964912280702, "grad_norm": 0.5189933118580958, "learning_rate": 8.082684559490999e-06, "loss": 0.0554, "step": 3239 }, { "epoch": 1.4390406395736175, "grad_norm": 0.4135426057188439, "learning_rate": 8.081158067448385e-06, "loss": 0.0396, "step": 3240 }, { "epoch": 1.439484787919165, "grad_norm": 0.5443217663228829, "learning_rate": 8.079631112255372e-06, "loss": 0.044, "step": 3241 }, { "epoch": 1.4399289362647125, "grad_norm": 0.5826766711918354, "learning_rate": 8.078103694141487e-06, "loss": 0.0446, "step": 3242 }, { "epoch": 1.4403730846102598, "grad_norm": 0.648859138960752, "learning_rate": 8.076575813336333e-06, "loss": 0.0609, "step": 3243 }, { "epoch": 1.440817232955807, "grad_norm": 0.4409431909915432, "learning_rate": 8.07504747006957e-06, "loss": 0.0531, "step": 3244 }, { "epoch": 1.4412613813013546, "grad_norm": 0.3721919303810988, "learning_rate": 8.073518664570938e-06, "loss": 0.0386, "step": 3245 }, { "epoch": 1.4417055296469021, "grad_norm": 0.43841622793791346, "learning_rate": 8.07198939707024e-06, "loss": 0.0493, "step": 3246 }, { "epoch": 1.4421496779924494, "grad_norm": 0.5931350944063895, "learning_rate": 8.070459667797351e-06, "loss": 0.0634, "step": 3247 }, { "epoch": 1.442593826337997, "grad_norm": 0.41937120746821677, "learning_rate": 8.068929476982217e-06, "loss": 0.0423, "step": 3248 }, { "epoch": 1.4430379746835442, "grad_norm": 0.6050623457059973, "learning_rate": 8.067398824854851e-06, "loss": 0.07, "step": 3249 }, { "epoch": 1.4434821230290917, "grad_norm": 0.36263255781004144, "learning_rate": 8.065867711645334e-06, "loss": 0.0488, "step": 3250 }, { "epoch": 1.443926271374639, "grad_norm": 0.5452720169189981, "learning_rate": 8.064336137583821e-06, "loss": 0.0592, "step": 3251 }, { "epoch": 1.4443704197201865, "grad_norm": 0.5648377578907434, "learning_rate": 8.062804102900532e-06, "loss": 0.045, "step": 3252 }, { "epoch": 1.444814568065734, "grad_norm": 0.5203009129121725, "learning_rate": 8.061271607825758e-06, "loss": 0.0455, "step": 3253 }, { "epoch": 1.4452587164112813, "grad_norm": 0.5388011169643779, "learning_rate": 8.059738652589862e-06, "loss": 0.0563, "step": 3254 }, { "epoch": 1.4457028647568289, "grad_norm": 0.41394935328254073, "learning_rate": 8.058205237423266e-06, "loss": 0.0523, "step": 3255 }, { "epoch": 1.4461470131023761, "grad_norm": 0.921759574953431, "learning_rate": 8.056671362556476e-06, "loss": 0.0356, "step": 3256 }, { "epoch": 1.4465911614479237, "grad_norm": 0.8754840871870988, "learning_rate": 8.055137028220058e-06, "loss": 0.0615, "step": 3257 }, { "epoch": 1.447035309793471, "grad_norm": 0.4878410786722894, "learning_rate": 8.053602234644644e-06, "loss": 0.0415, "step": 3258 }, { "epoch": 1.4474794581390185, "grad_norm": 0.6534435017529012, "learning_rate": 8.052066982060945e-06, "loss": 0.0579, "step": 3259 }, { "epoch": 1.447923606484566, "grad_norm": 0.9319839406156598, "learning_rate": 8.050531270699731e-06, "loss": 0.0484, "step": 3260 }, { "epoch": 1.4483677548301133, "grad_norm": 0.4011114975003397, "learning_rate": 8.048995100791847e-06, "loss": 0.0372, "step": 3261 }, { "epoch": 1.4488119031756606, "grad_norm": 0.5604426931227581, "learning_rate": 8.047458472568208e-06, "loss": 0.0496, "step": 3262 }, { "epoch": 1.449256051521208, "grad_norm": 0.3856602729544308, "learning_rate": 8.045921386259792e-06, "loss": 0.0368, "step": 3263 }, { "epoch": 1.4497001998667556, "grad_norm": 0.37413170967076825, "learning_rate": 8.044383842097651e-06, "loss": 0.0426, "step": 3264 }, { "epoch": 1.4501443482123029, "grad_norm": 0.4129802274607094, "learning_rate": 8.042845840312903e-06, "loss": 0.0398, "step": 3265 }, { "epoch": 1.4505884965578504, "grad_norm": 0.8418363464980803, "learning_rate": 8.041307381136738e-06, "loss": 0.0656, "step": 3266 }, { "epoch": 1.4510326449033977, "grad_norm": 0.5365298547106184, "learning_rate": 8.039768464800408e-06, "loss": 0.048, "step": 3267 }, { "epoch": 1.4514767932489452, "grad_norm": 1.0687462882240213, "learning_rate": 8.038229091535244e-06, "loss": 0.0503, "step": 3268 }, { "epoch": 1.4519209415944925, "grad_norm": 0.49239171264959003, "learning_rate": 8.036689261572636e-06, "loss": 0.0563, "step": 3269 }, { "epoch": 1.45236508994004, "grad_norm": 0.5011792630285886, "learning_rate": 8.035148975144046e-06, "loss": 0.0396, "step": 3270 }, { "epoch": 1.4528092382855875, "grad_norm": 0.49603707035563677, "learning_rate": 8.033608232481009e-06, "loss": 0.0501, "step": 3271 }, { "epoch": 1.4532533866311348, "grad_norm": 0.5939987053954954, "learning_rate": 8.032067033815123e-06, "loss": 0.0458, "step": 3272 }, { "epoch": 1.453697534976682, "grad_norm": 0.5812600520648006, "learning_rate": 8.030525379378053e-06, "loss": 0.0447, "step": 3273 }, { "epoch": 1.4541416833222296, "grad_norm": 0.6480809819716409, "learning_rate": 8.028983269401542e-06, "loss": 0.0608, "step": 3274 }, { "epoch": 1.454585831667777, "grad_norm": 0.5090480104564535, "learning_rate": 8.027440704117391e-06, "loss": 0.0559, "step": 3275 }, { "epoch": 1.4550299800133244, "grad_norm": 0.6602628376919341, "learning_rate": 8.025897683757473e-06, "loss": 0.0587, "step": 3276 }, { "epoch": 1.455474128358872, "grad_norm": 0.5252379759239632, "learning_rate": 8.024354208553735e-06, "loss": 0.061, "step": 3277 }, { "epoch": 1.4559182767044192, "grad_norm": 0.4725375990607396, "learning_rate": 8.022810278738185e-06, "loss": 0.0408, "step": 3278 }, { "epoch": 1.4563624250499667, "grad_norm": 0.5401711443633535, "learning_rate": 8.021265894542898e-06, "loss": 0.0667, "step": 3279 }, { "epoch": 1.456806573395514, "grad_norm": 0.5352342030628493, "learning_rate": 8.019721056200027e-06, "loss": 0.0458, "step": 3280 }, { "epoch": 1.4572507217410615, "grad_norm": 0.4210073842896914, "learning_rate": 8.018175763941784e-06, "loss": 0.0437, "step": 3281 }, { "epoch": 1.457694870086609, "grad_norm": 0.3904193074228819, "learning_rate": 8.016630018000457e-06, "loss": 0.0456, "step": 3282 }, { "epoch": 1.4581390184321563, "grad_norm": 0.4133159146388692, "learning_rate": 8.015083818608393e-06, "loss": 0.0385, "step": 3283 }, { "epoch": 1.4585831667777038, "grad_norm": 0.5275288891932939, "learning_rate": 8.013537165998014e-06, "loss": 0.0477, "step": 3284 }, { "epoch": 1.4590273151232511, "grad_norm": 0.43540224877298483, "learning_rate": 8.011990060401806e-06, "loss": 0.0433, "step": 3285 }, { "epoch": 1.4594714634687986, "grad_norm": 0.476963740475049, "learning_rate": 8.010442502052329e-06, "loss": 0.0549, "step": 3286 }, { "epoch": 1.459915611814346, "grad_norm": 0.4927774183701049, "learning_rate": 8.008894491182205e-06, "loss": 0.0464, "step": 3287 }, { "epoch": 1.4603597601598934, "grad_norm": 0.4716288282663322, "learning_rate": 8.007346028024125e-06, "loss": 0.0437, "step": 3288 }, { "epoch": 1.460803908505441, "grad_norm": 0.5277979981560662, "learning_rate": 8.005797112810854e-06, "loss": 0.0452, "step": 3289 }, { "epoch": 1.4612480568509882, "grad_norm": 0.6486682818717127, "learning_rate": 8.004247745775216e-06, "loss": 0.0636, "step": 3290 }, { "epoch": 1.4616922051965355, "grad_norm": 0.693590181694553, "learning_rate": 8.00269792715011e-06, "loss": 0.0392, "step": 3291 }, { "epoch": 1.462136353542083, "grad_norm": 0.4973813779816927, "learning_rate": 8.001147657168497e-06, "loss": 0.0523, "step": 3292 }, { "epoch": 1.4625805018876306, "grad_norm": 0.42880946574568146, "learning_rate": 7.99959693606341e-06, "loss": 0.0365, "step": 3293 }, { "epoch": 1.4630246502331778, "grad_norm": 0.4647071982633707, "learning_rate": 7.99804576406795e-06, "loss": 0.0473, "step": 3294 }, { "epoch": 1.4634687985787254, "grad_norm": 0.5564296867063627, "learning_rate": 7.996494141415284e-06, "loss": 0.0654, "step": 3295 }, { "epoch": 1.4639129469242727, "grad_norm": 0.4602454467326126, "learning_rate": 7.994942068338647e-06, "loss": 0.0375, "step": 3296 }, { "epoch": 1.4643570952698202, "grad_norm": 0.44240678049891874, "learning_rate": 7.993389545071341e-06, "loss": 0.0538, "step": 3297 }, { "epoch": 1.4648012436153675, "grad_norm": 0.5956552998865925, "learning_rate": 7.991836571846739e-06, "loss": 0.0483, "step": 3298 }, { "epoch": 1.465245391960915, "grad_norm": 0.8272941029925692, "learning_rate": 7.990283148898277e-06, "loss": 0.0621, "step": 3299 }, { "epoch": 1.4656895403064625, "grad_norm": 0.521288053624164, "learning_rate": 7.988729276459463e-06, "loss": 0.0389, "step": 3300 }, { "epoch": 1.4661336886520098, "grad_norm": 0.3918246593118355, "learning_rate": 7.987174954763867e-06, "loss": 0.0351, "step": 3301 }, { "epoch": 1.466577836997557, "grad_norm": 0.5695857738579163, "learning_rate": 7.985620184045133e-06, "loss": 0.051, "step": 3302 }, { "epoch": 1.4670219853431046, "grad_norm": 0.5461168059678839, "learning_rate": 7.98406496453697e-06, "loss": 0.045, "step": 3303 }, { "epoch": 1.467466133688652, "grad_norm": 0.45316446756220935, "learning_rate": 7.982509296473151e-06, "loss": 0.0376, "step": 3304 }, { "epoch": 1.4679102820341994, "grad_norm": 0.5116686497553603, "learning_rate": 7.98095318008752e-06, "loss": 0.0523, "step": 3305 }, { "epoch": 1.4683544303797469, "grad_norm": 0.5042517448525269, "learning_rate": 7.97939661561399e-06, "loss": 0.0473, "step": 3306 }, { "epoch": 1.4687985787252942, "grad_norm": 0.5555029036352702, "learning_rate": 7.977839603286537e-06, "loss": 0.053, "step": 3307 }, { "epoch": 1.4692427270708417, "grad_norm": 0.3410097103369562, "learning_rate": 7.976282143339207e-06, "loss": 0.0293, "step": 3308 }, { "epoch": 1.469686875416389, "grad_norm": 0.6134381443649801, "learning_rate": 7.974724236006113e-06, "loss": 0.0528, "step": 3309 }, { "epoch": 1.4701310237619365, "grad_norm": 0.4165723941603629, "learning_rate": 7.973165881521435e-06, "loss": 0.041, "step": 3310 }, { "epoch": 1.470575172107484, "grad_norm": 0.4357045233722827, "learning_rate": 7.971607080119418e-06, "loss": 0.0499, "step": 3311 }, { "epoch": 1.4710193204530313, "grad_norm": 0.5469760548909597, "learning_rate": 7.97004783203438e-06, "loss": 0.0436, "step": 3312 }, { "epoch": 1.4714634687985786, "grad_norm": 0.4196612097876158, "learning_rate": 7.968488137500699e-06, "loss": 0.0474, "step": 3313 }, { "epoch": 1.471907617144126, "grad_norm": 0.4878242528103949, "learning_rate": 7.966927996752824e-06, "loss": 0.0407, "step": 3314 }, { "epoch": 1.4723517654896736, "grad_norm": 0.4654212754221821, "learning_rate": 7.965367410025275e-06, "loss": 0.0276, "step": 3315 }, { "epoch": 1.472795913835221, "grad_norm": 0.4496002039864948, "learning_rate": 7.96380637755263e-06, "loss": 0.043, "step": 3316 }, { "epoch": 1.4732400621807684, "grad_norm": 0.843662279100629, "learning_rate": 7.96224489956954e-06, "loss": 0.0507, "step": 3317 }, { "epoch": 1.4736842105263157, "grad_norm": 0.32395601492358206, "learning_rate": 7.960682976310721e-06, "loss": 0.0356, "step": 3318 }, { "epoch": 1.4741283588718632, "grad_norm": 0.6598994798652713, "learning_rate": 7.959120608010959e-06, "loss": 0.0532, "step": 3319 }, { "epoch": 1.4745725072174105, "grad_norm": 0.6135947839100228, "learning_rate": 7.957557794905104e-06, "loss": 0.0568, "step": 3320 }, { "epoch": 1.475016655562958, "grad_norm": 0.5386416254084523, "learning_rate": 7.955994537228068e-06, "loss": 0.0524, "step": 3321 }, { "epoch": 1.4754608039085055, "grad_norm": 0.41178054802832886, "learning_rate": 7.954430835214844e-06, "loss": 0.0478, "step": 3322 }, { "epoch": 1.4759049522540528, "grad_norm": 0.553965746901051, "learning_rate": 7.952866689100476e-06, "loss": 0.0497, "step": 3323 }, { "epoch": 1.4763491005996003, "grad_norm": 0.5002040941699343, "learning_rate": 7.951302099120087e-06, "loss": 0.0479, "step": 3324 }, { "epoch": 1.4767932489451476, "grad_norm": 0.5774527696932492, "learning_rate": 7.949737065508856e-06, "loss": 0.0447, "step": 3325 }, { "epoch": 1.4772373972906951, "grad_norm": 0.5834169178186834, "learning_rate": 7.948171588502036e-06, "loss": 0.0442, "step": 3326 }, { "epoch": 1.4776815456362424, "grad_norm": 0.48069558429150877, "learning_rate": 7.946605668334947e-06, "loss": 0.0509, "step": 3327 }, { "epoch": 1.47812569398179, "grad_norm": 0.6916569805639394, "learning_rate": 7.945039305242972e-06, "loss": 0.0543, "step": 3328 }, { "epoch": 1.4785698423273375, "grad_norm": 0.45596286134877817, "learning_rate": 7.943472499461562e-06, "loss": 0.0463, "step": 3329 }, { "epoch": 1.4790139906728847, "grad_norm": 0.7085391238735931, "learning_rate": 7.941905251226235e-06, "loss": 0.0643, "step": 3330 }, { "epoch": 1.479458139018432, "grad_norm": 0.878574496425455, "learning_rate": 7.940337560772573e-06, "loss": 0.0623, "step": 3331 }, { "epoch": 1.4799022873639796, "grad_norm": 0.3602452580945399, "learning_rate": 7.93876942833623e-06, "loss": 0.0325, "step": 3332 }, { "epoch": 1.480346435709527, "grad_norm": 0.5694019432619376, "learning_rate": 7.937200854152917e-06, "loss": 0.0527, "step": 3333 }, { "epoch": 1.4807905840550744, "grad_norm": 0.46397062738751477, "learning_rate": 7.935631838458426e-06, "loss": 0.0435, "step": 3334 }, { "epoch": 1.4812347324006219, "grad_norm": 0.38977994271678623, "learning_rate": 7.9340623814886e-06, "loss": 0.0346, "step": 3335 }, { "epoch": 1.4816788807461692, "grad_norm": 0.5991214620269069, "learning_rate": 7.932492483479358e-06, "loss": 0.0519, "step": 3336 }, { "epoch": 1.4821230290917167, "grad_norm": 0.42521594572783633, "learning_rate": 7.930922144666679e-06, "loss": 0.0407, "step": 3337 }, { "epoch": 1.482567177437264, "grad_norm": 0.6068474747842975, "learning_rate": 7.929351365286614e-06, "loss": 0.0569, "step": 3338 }, { "epoch": 1.4830113257828115, "grad_norm": 0.3673998638748444, "learning_rate": 7.927780145575281e-06, "loss": 0.0398, "step": 3339 }, { "epoch": 1.483455474128359, "grad_norm": 0.6378602388998965, "learning_rate": 7.926208485768856e-06, "loss": 0.0445, "step": 3340 }, { "epoch": 1.4838996224739063, "grad_norm": 0.4635352438588869, "learning_rate": 7.924636386103588e-06, "loss": 0.0472, "step": 3341 }, { "epoch": 1.4843437708194536, "grad_norm": 0.43741344875712923, "learning_rate": 7.923063846815791e-06, "loss": 0.0365, "step": 3342 }, { "epoch": 1.484787919165001, "grad_norm": 0.409274641921781, "learning_rate": 7.921490868141843e-06, "loss": 0.0437, "step": 3343 }, { "epoch": 1.4852320675105486, "grad_norm": 0.44864312363339953, "learning_rate": 7.91991745031819e-06, "loss": 0.0365, "step": 3344 }, { "epoch": 1.4856762158560959, "grad_norm": 0.7767506869882325, "learning_rate": 7.918343593581344e-06, "loss": 0.055, "step": 3345 }, { "epoch": 1.4861203642016434, "grad_norm": 0.663893547353565, "learning_rate": 7.916769298167881e-06, "loss": 0.0539, "step": 3346 }, { "epoch": 1.4865645125471907, "grad_norm": 0.44134070690444965, "learning_rate": 7.915194564314446e-06, "loss": 0.0451, "step": 3347 }, { "epoch": 1.4870086608927382, "grad_norm": 0.5644812574628542, "learning_rate": 7.913619392257748e-06, "loss": 0.052, "step": 3348 }, { "epoch": 1.4874528092382855, "grad_norm": 0.4293480800359804, "learning_rate": 7.912043782234562e-06, "loss": 0.0375, "step": 3349 }, { "epoch": 1.487896957583833, "grad_norm": 0.4208900641552744, "learning_rate": 7.910467734481726e-06, "loss": 0.0263, "step": 3350 }, { "epoch": 1.4883411059293805, "grad_norm": 0.5686191171590155, "learning_rate": 7.90889124923615e-06, "loss": 0.0552, "step": 3351 }, { "epoch": 1.4887852542749278, "grad_norm": 0.45995768523750874, "learning_rate": 7.907314326734807e-06, "loss": 0.0392, "step": 3352 }, { "epoch": 1.4892294026204753, "grad_norm": 0.6197595746880575, "learning_rate": 7.905736967214735e-06, "loss": 0.055, "step": 3353 }, { "epoch": 1.4896735509660226, "grad_norm": 0.6356786842165147, "learning_rate": 7.904159170913035e-06, "loss": 0.046, "step": 3354 }, { "epoch": 1.4901176993115701, "grad_norm": 0.4601859054406351, "learning_rate": 7.902580938066878e-06, "loss": 0.0414, "step": 3355 }, { "epoch": 1.4905618476571174, "grad_norm": 0.36553812505176564, "learning_rate": 7.901002268913501e-06, "loss": 0.0322, "step": 3356 }, { "epoch": 1.491005996002665, "grad_norm": 0.615226798089253, "learning_rate": 7.899423163690204e-06, "loss": 0.0549, "step": 3357 }, { "epoch": 1.4914501443482124, "grad_norm": 0.4095361485944277, "learning_rate": 7.897843622634352e-06, "loss": 0.0387, "step": 3358 }, { "epoch": 1.4918942926937597, "grad_norm": 0.5006702378564764, "learning_rate": 7.896263645983378e-06, "loss": 0.0517, "step": 3359 }, { "epoch": 1.492338441039307, "grad_norm": 0.6244366583571412, "learning_rate": 7.89468323397478e-06, "loss": 0.0496, "step": 3360 }, { "epoch": 1.4927825893848545, "grad_norm": 0.5367025221002972, "learning_rate": 7.893102386846118e-06, "loss": 0.0436, "step": 3361 }, { "epoch": 1.493226737730402, "grad_norm": 0.4508013926132327, "learning_rate": 7.891521104835023e-06, "loss": 0.0344, "step": 3362 }, { "epoch": 1.4936708860759493, "grad_norm": 0.6393470972778723, "learning_rate": 7.889939388179188e-06, "loss": 0.0526, "step": 3363 }, { "epoch": 1.4941150344214968, "grad_norm": 0.472940776234741, "learning_rate": 7.888357237116372e-06, "loss": 0.0431, "step": 3364 }, { "epoch": 1.4945591827670441, "grad_norm": 0.4993044626375099, "learning_rate": 7.886774651884397e-06, "loss": 0.0541, "step": 3365 }, { "epoch": 1.4950033311125916, "grad_norm": 0.4991379094630535, "learning_rate": 7.885191632721156e-06, "loss": 0.0423, "step": 3366 }, { "epoch": 1.495447479458139, "grad_norm": 0.6104766035319223, "learning_rate": 7.8836081798646e-06, "loss": 0.0489, "step": 3367 }, { "epoch": 1.4958916278036865, "grad_norm": 0.582209332395513, "learning_rate": 7.882024293552752e-06, "loss": 0.0481, "step": 3368 }, { "epoch": 1.496335776149234, "grad_norm": 1.2878711444761508, "learning_rate": 7.880439974023694e-06, "loss": 0.0895, "step": 3369 }, { "epoch": 1.4967799244947813, "grad_norm": 0.47752757971222215, "learning_rate": 7.87885522151558e-06, "loss": 0.0556, "step": 3370 }, { "epoch": 1.4972240728403285, "grad_norm": 0.34960537720365853, "learning_rate": 7.877270036266622e-06, "loss": 0.0361, "step": 3371 }, { "epoch": 1.497668221185876, "grad_norm": 0.6171844191638863, "learning_rate": 7.875684418515101e-06, "loss": 0.0413, "step": 3372 }, { "epoch": 1.4981123695314236, "grad_norm": 0.5888605884776164, "learning_rate": 7.874098368499362e-06, "loss": 0.0433, "step": 3373 }, { "epoch": 1.4985565178769709, "grad_norm": 0.32860735680481784, "learning_rate": 7.872511886457816e-06, "loss": 0.029, "step": 3374 }, { "epoch": 1.4990006662225184, "grad_norm": 0.4838256059081808, "learning_rate": 7.87092497262894e-06, "loss": 0.0428, "step": 3375 }, { "epoch": 1.4994448145680657, "grad_norm": 0.5961233345859986, "learning_rate": 7.86933762725127e-06, "loss": 0.0568, "step": 3376 }, { "epoch": 1.4998889629136132, "grad_norm": 0.4985166006472107, "learning_rate": 7.867749850563414e-06, "loss": 0.0539, "step": 3377 }, { "epoch": 1.5003331112591605, "grad_norm": 0.8288419788268757, "learning_rate": 7.86616164280404e-06, "loss": 0.0376, "step": 3378 }, { "epoch": 1.500777259604708, "grad_norm": 0.5437911475680088, "learning_rate": 7.864573004211884e-06, "loss": 0.0629, "step": 3379 }, { "epoch": 1.5012214079502555, "grad_norm": 0.5173272400012955, "learning_rate": 7.862983935025745e-06, "loss": 0.0496, "step": 3380 }, { "epoch": 1.5016655562958028, "grad_norm": 0.41368737678867235, "learning_rate": 7.861394435484488e-06, "loss": 0.0428, "step": 3381 }, { "epoch": 1.50210970464135, "grad_norm": 0.382287625598364, "learning_rate": 7.85980450582704e-06, "loss": 0.0362, "step": 3382 }, { "epoch": 1.5025538529868976, "grad_norm": 0.7914574257051636, "learning_rate": 7.858214146292394e-06, "loss": 0.0723, "step": 3383 }, { "epoch": 1.502998001332445, "grad_norm": 0.600368806153385, "learning_rate": 7.85662335711961e-06, "loss": 0.0426, "step": 3384 }, { "epoch": 1.5034421496779924, "grad_norm": 0.516289015563682, "learning_rate": 7.855032138547811e-06, "loss": 0.0401, "step": 3385 }, { "epoch": 1.50388629802354, "grad_norm": 0.4579999933485943, "learning_rate": 7.853440490816182e-06, "loss": 0.0372, "step": 3386 }, { "epoch": 1.5043304463690874, "grad_norm": 0.4924686770621345, "learning_rate": 7.851848414163976e-06, "loss": 0.0457, "step": 3387 }, { "epoch": 1.5047745947146347, "grad_norm": 0.5624519083004446, "learning_rate": 7.850255908830508e-06, "loss": 0.0431, "step": 3388 }, { "epoch": 1.505218743060182, "grad_norm": 0.46856624335860086, "learning_rate": 7.848662975055161e-06, "loss": 0.0374, "step": 3389 }, { "epoch": 1.5056628914057295, "grad_norm": 0.6866568127731671, "learning_rate": 7.847069613077377e-06, "loss": 0.088, "step": 3390 }, { "epoch": 1.506107039751277, "grad_norm": 0.5263852322647139, "learning_rate": 7.845475823136669e-06, "loss": 0.0475, "step": 3391 }, { "epoch": 1.5065511880968243, "grad_norm": 0.6378963066155895, "learning_rate": 7.843881605472606e-06, "loss": 0.0579, "step": 3392 }, { "epoch": 1.5069953364423716, "grad_norm": 0.8470563699341165, "learning_rate": 7.84228696032483e-06, "loss": 0.0685, "step": 3393 }, { "epoch": 1.5074394847879191, "grad_norm": 0.5121438548331829, "learning_rate": 7.840691887933042e-06, "loss": 0.0482, "step": 3394 }, { "epoch": 1.5078836331334666, "grad_norm": 0.46761061686619343, "learning_rate": 7.839096388537008e-06, "loss": 0.053, "step": 3395 }, { "epoch": 1.508327781479014, "grad_norm": 0.568716582017773, "learning_rate": 7.837500462376559e-06, "loss": 0.0446, "step": 3396 }, { "epoch": 1.5087719298245614, "grad_norm": 0.427488840303415, "learning_rate": 7.83590410969159e-06, "loss": 0.045, "step": 3397 }, { "epoch": 1.509216078170109, "grad_norm": 0.5336130953887138, "learning_rate": 7.834307330722059e-06, "loss": 0.0501, "step": 3398 }, { "epoch": 1.5096602265156562, "grad_norm": 0.6643085395902935, "learning_rate": 7.832710125707991e-06, "loss": 0.0714, "step": 3399 }, { "epoch": 1.5101043748612035, "grad_norm": 0.7517073070810807, "learning_rate": 7.831112494889472e-06, "loss": 0.0574, "step": 3400 }, { "epoch": 1.510548523206751, "grad_norm": 0.6793225641094393, "learning_rate": 7.829514438506651e-06, "loss": 0.054, "step": 3401 }, { "epoch": 1.5109926715522985, "grad_norm": 0.4505747895513854, "learning_rate": 7.827915956799745e-06, "loss": 0.0417, "step": 3402 }, { "epoch": 1.5114368198978458, "grad_norm": 0.7701556350457059, "learning_rate": 7.826317050009035e-06, "loss": 0.046, "step": 3403 }, { "epoch": 1.5118809682433931, "grad_norm": 0.4711452448196391, "learning_rate": 7.82471771837486e-06, "loss": 0.0393, "step": 3404 }, { "epoch": 1.5123251165889409, "grad_norm": 0.5955356460688755, "learning_rate": 7.823117962137628e-06, "loss": 0.0475, "step": 3405 }, { "epoch": 1.5127692649344882, "grad_norm": 0.47383670753602875, "learning_rate": 7.821517781537811e-06, "loss": 0.0471, "step": 3406 }, { "epoch": 1.5132134132800354, "grad_norm": 0.36906733260563584, "learning_rate": 7.819917176815942e-06, "loss": 0.0424, "step": 3407 }, { "epoch": 1.513657561625583, "grad_norm": 0.5026639235608937, "learning_rate": 7.818316148212619e-06, "loss": 0.0478, "step": 3408 }, { "epoch": 1.5141017099711305, "grad_norm": 0.5632301347650163, "learning_rate": 7.816714695968503e-06, "loss": 0.0582, "step": 3409 }, { "epoch": 1.5145458583166778, "grad_norm": 0.5601350342899486, "learning_rate": 7.815112820324322e-06, "loss": 0.052, "step": 3410 }, { "epoch": 1.514990006662225, "grad_norm": 0.44545554824483924, "learning_rate": 7.813510521520864e-06, "loss": 0.0437, "step": 3411 }, { "epoch": 1.5154341550077726, "grad_norm": 0.5321000678999631, "learning_rate": 7.811907799798981e-06, "loss": 0.0446, "step": 3412 }, { "epoch": 1.51587830335332, "grad_norm": 0.6123306795917254, "learning_rate": 7.81030465539959e-06, "loss": 0.0472, "step": 3413 }, { "epoch": 1.5163224516988674, "grad_norm": 0.7182097530608341, "learning_rate": 7.808701088563669e-06, "loss": 0.0652, "step": 3414 }, { "epoch": 1.5167666000444149, "grad_norm": 0.6740358195253988, "learning_rate": 7.807097099532264e-06, "loss": 0.0446, "step": 3415 }, { "epoch": 1.5172107483899624, "grad_norm": 0.43545100676950116, "learning_rate": 7.805492688546481e-06, "loss": 0.0327, "step": 3416 }, { "epoch": 1.5176548967355097, "grad_norm": 0.4493644872097168, "learning_rate": 7.80388785584749e-06, "loss": 0.0401, "step": 3417 }, { "epoch": 1.518099045081057, "grad_norm": 0.4270665812232271, "learning_rate": 7.802282601676522e-06, "loss": 0.0517, "step": 3418 }, { "epoch": 1.5185431934266045, "grad_norm": 0.48691036016457884, "learning_rate": 7.800676926274881e-06, "loss": 0.0398, "step": 3419 }, { "epoch": 1.518987341772152, "grad_norm": 0.5623140589479151, "learning_rate": 7.79907082988392e-06, "loss": 0.0482, "step": 3420 }, { "epoch": 1.5194314901176993, "grad_norm": 0.6102149116754874, "learning_rate": 7.797464312745067e-06, "loss": 0.0541, "step": 3421 }, { "epoch": 1.5198756384632466, "grad_norm": 0.5623605927044153, "learning_rate": 7.795857375099806e-06, "loss": 0.05, "step": 3422 }, { "epoch": 1.520319786808794, "grad_norm": 0.4402109113654038, "learning_rate": 7.794250017189689e-06, "loss": 0.046, "step": 3423 }, { "epoch": 1.5207639351543416, "grad_norm": 0.48101120541164943, "learning_rate": 7.792642239256327e-06, "loss": 0.0461, "step": 3424 }, { "epoch": 1.521208083499889, "grad_norm": 0.5296531840354222, "learning_rate": 7.791034041541398e-06, "loss": 0.0473, "step": 3425 }, { "epoch": 1.5216522318454364, "grad_norm": 0.37685452582544965, "learning_rate": 7.78942542428664e-06, "loss": 0.0317, "step": 3426 }, { "epoch": 1.522096380190984, "grad_norm": 0.3991510986249673, "learning_rate": 7.78781638773386e-06, "loss": 0.0347, "step": 3427 }, { "epoch": 1.5225405285365312, "grad_norm": 0.7554838302963066, "learning_rate": 7.786206932124918e-06, "loss": 0.0586, "step": 3428 }, { "epoch": 1.5229846768820785, "grad_norm": 0.42032863329599834, "learning_rate": 7.784597057701745e-06, "loss": 0.0252, "step": 3429 }, { "epoch": 1.523428825227626, "grad_norm": 0.5551436730522737, "learning_rate": 7.782986764706334e-06, "loss": 0.0563, "step": 3430 }, { "epoch": 1.5238729735731735, "grad_norm": 0.6187970153795604, "learning_rate": 7.781376053380735e-06, "loss": 0.0592, "step": 3431 }, { "epoch": 1.5243171219187208, "grad_norm": 0.5501189193555737, "learning_rate": 7.779764923967069e-06, "loss": 0.0556, "step": 3432 }, { "epoch": 1.524761270264268, "grad_norm": 0.44533955935715447, "learning_rate": 7.778153376707513e-06, "loss": 0.0507, "step": 3433 }, { "epoch": 1.5252054186098158, "grad_norm": 0.4340430082400607, "learning_rate": 7.776541411844315e-06, "loss": 0.0385, "step": 3434 }, { "epoch": 1.5256495669553631, "grad_norm": 0.5599425444294093, "learning_rate": 7.774929029619775e-06, "loss": 0.0454, "step": 3435 }, { "epoch": 1.5260937153009104, "grad_norm": 0.3908201446163151, "learning_rate": 7.773316230276267e-06, "loss": 0.041, "step": 3436 }, { "epoch": 1.526537863646458, "grad_norm": 0.6432918655836172, "learning_rate": 7.771703014056217e-06, "loss": 0.0664, "step": 3437 }, { "epoch": 1.5269820119920055, "grad_norm": 0.4341973490141778, "learning_rate": 7.770089381202121e-06, "loss": 0.0536, "step": 3438 }, { "epoch": 1.5274261603375527, "grad_norm": 0.5232821809072614, "learning_rate": 7.768475331956537e-06, "loss": 0.0506, "step": 3439 }, { "epoch": 1.5278703086831, "grad_norm": 0.49831548658076835, "learning_rate": 7.76686086656208e-06, "loss": 0.0497, "step": 3440 }, { "epoch": 1.5283144570286475, "grad_norm": 0.6350239638046895, "learning_rate": 7.765245985261436e-06, "loss": 0.0603, "step": 3441 }, { "epoch": 1.528758605374195, "grad_norm": 0.8205513486577842, "learning_rate": 7.763630688297347e-06, "loss": 0.0555, "step": 3442 }, { "epoch": 1.5292027537197423, "grad_norm": 0.5773711065726657, "learning_rate": 7.76201497591262e-06, "loss": 0.0564, "step": 3443 }, { "epoch": 1.5296469020652899, "grad_norm": 0.42024558526878264, "learning_rate": 7.760398848350121e-06, "loss": 0.0327, "step": 3444 }, { "epoch": 1.5300910504108374, "grad_norm": 0.5673807060962988, "learning_rate": 7.758782305852787e-06, "loss": 0.0551, "step": 3445 }, { "epoch": 1.5305351987563847, "grad_norm": 0.4401980723100094, "learning_rate": 7.757165348663606e-06, "loss": 0.0341, "step": 3446 }, { "epoch": 1.530979347101932, "grad_norm": 0.4134655160444097, "learning_rate": 7.755547977025641e-06, "loss": 0.0396, "step": 3447 }, { "epoch": 1.5314234954474795, "grad_norm": 0.7907504517928898, "learning_rate": 7.753930191182005e-06, "loss": 0.072, "step": 3448 }, { "epoch": 1.531867643793027, "grad_norm": 0.7425759733481122, "learning_rate": 7.752311991375878e-06, "loss": 0.0584, "step": 3449 }, { "epoch": 1.5323117921385743, "grad_norm": 0.3701818263547308, "learning_rate": 7.750693377850506e-06, "loss": 0.0311, "step": 3450 }, { "epoch": 1.5327559404841216, "grad_norm": 0.4194874739178355, "learning_rate": 7.749074350849196e-06, "loss": 0.0299, "step": 3451 }, { "epoch": 1.533200088829669, "grad_norm": 0.399280041395756, "learning_rate": 7.747454910615309e-06, "loss": 0.0333, "step": 3452 }, { "epoch": 1.5336442371752166, "grad_norm": 0.5116821227256184, "learning_rate": 7.74583505739228e-06, "loss": 0.053, "step": 3453 }, { "epoch": 1.5340883855207639, "grad_norm": 0.6704114941806725, "learning_rate": 7.744214791423597e-06, "loss": 0.0559, "step": 3454 }, { "epoch": 1.5345325338663114, "grad_norm": 0.4031097936225357, "learning_rate": 7.742594112952816e-06, "loss": 0.04, "step": 3455 }, { "epoch": 1.534976682211859, "grad_norm": 0.39875784908896506, "learning_rate": 7.74097302222355e-06, "loss": 0.0359, "step": 3456 }, { "epoch": 1.5354208305574062, "grad_norm": 0.49710136867375654, "learning_rate": 7.739351519479479e-06, "loss": 0.0433, "step": 3457 }, { "epoch": 1.5358649789029535, "grad_norm": 0.587479752061736, "learning_rate": 7.73772960496434e-06, "loss": 0.0482, "step": 3458 }, { "epoch": 1.536309127248501, "grad_norm": 0.9958134431642607, "learning_rate": 7.736107278921937e-06, "loss": 0.0604, "step": 3459 }, { "epoch": 1.5367532755940485, "grad_norm": 0.5924386484163253, "learning_rate": 7.73448454159613e-06, "loss": 0.0643, "step": 3460 }, { "epoch": 1.5371974239395958, "grad_norm": 0.4469041409309243, "learning_rate": 7.732861393230845e-06, "loss": 0.0377, "step": 3461 }, { "epoch": 1.537641572285143, "grad_norm": 0.6971338285035604, "learning_rate": 7.731237834070071e-06, "loss": 0.0525, "step": 3462 }, { "epoch": 1.5380857206306906, "grad_norm": 0.377836838682399, "learning_rate": 7.729613864357854e-06, "loss": 0.037, "step": 3463 }, { "epoch": 1.5385298689762381, "grad_norm": 0.3909300894809345, "learning_rate": 7.727989484338306e-06, "loss": 0.0327, "step": 3464 }, { "epoch": 1.5389740173217854, "grad_norm": 0.44535118870735724, "learning_rate": 7.726364694255598e-06, "loss": 0.0451, "step": 3465 }, { "epoch": 1.539418165667333, "grad_norm": 0.4310182821670584, "learning_rate": 7.724739494353963e-06, "loss": 0.043, "step": 3466 }, { "epoch": 1.5398623140128804, "grad_norm": 0.5072110735404695, "learning_rate": 7.723113884877698e-06, "loss": 0.0409, "step": 3467 }, { "epoch": 1.5403064623584277, "grad_norm": 0.4243921084057989, "learning_rate": 7.721487866071158e-06, "loss": 0.0577, "step": 3468 }, { "epoch": 1.540750610703975, "grad_norm": 0.6225210766153572, "learning_rate": 7.71986143817876e-06, "loss": 0.041, "step": 3469 }, { "epoch": 1.5411947590495225, "grad_norm": 0.7207947800360287, "learning_rate": 7.718234601444987e-06, "loss": 0.0525, "step": 3470 }, { "epoch": 1.54163890739507, "grad_norm": 0.6448910865817588, "learning_rate": 7.716607356114378e-06, "loss": 0.0691, "step": 3471 }, { "epoch": 1.5420830557406173, "grad_norm": 0.4192180148347075, "learning_rate": 7.714979702431537e-06, "loss": 0.0401, "step": 3472 }, { "epoch": 1.5425272040861646, "grad_norm": 0.5353520298031071, "learning_rate": 7.713351640641127e-06, "loss": 0.047, "step": 3473 }, { "epoch": 1.5429713524317124, "grad_norm": 0.4566196176860585, "learning_rate": 7.711723170987875e-06, "loss": 0.0377, "step": 3474 }, { "epoch": 1.5434155007772596, "grad_norm": 0.8744543565589462, "learning_rate": 7.710094293716563e-06, "loss": 0.0421, "step": 3475 }, { "epoch": 1.543859649122807, "grad_norm": 0.4450101049065203, "learning_rate": 7.708465009072046e-06, "loss": 0.0368, "step": 3476 }, { "epoch": 1.5443037974683544, "grad_norm": 0.3880510280013894, "learning_rate": 7.706835317299228e-06, "loss": 0.042, "step": 3477 }, { "epoch": 1.544747945813902, "grad_norm": 0.5627812490127366, "learning_rate": 7.705205218643079e-06, "loss": 0.0485, "step": 3478 }, { "epoch": 1.5451920941594492, "grad_norm": 0.5226084645577708, "learning_rate": 7.703574713348633e-06, "loss": 0.0392, "step": 3479 }, { "epoch": 1.5456362425049965, "grad_norm": 0.646405817519741, "learning_rate": 7.701943801660983e-06, "loss": 0.0679, "step": 3480 }, { "epoch": 1.546080390850544, "grad_norm": 0.4165107020464423, "learning_rate": 7.700312483825281e-06, "loss": 0.0398, "step": 3481 }, { "epoch": 1.5465245391960916, "grad_norm": 0.4015766878565599, "learning_rate": 7.698680760086743e-06, "loss": 0.0397, "step": 3482 }, { "epoch": 1.5469686875416389, "grad_norm": 0.5170446946635605, "learning_rate": 7.697048630690642e-06, "loss": 0.0437, "step": 3483 }, { "epoch": 1.5474128358871864, "grad_norm": 0.6516919138739381, "learning_rate": 7.69541609588232e-06, "loss": 0.0487, "step": 3484 }, { "epoch": 1.5478569842327339, "grad_norm": 0.3866667271027062, "learning_rate": 7.69378315590717e-06, "loss": 0.0455, "step": 3485 }, { "epoch": 1.5483011325782812, "grad_norm": 0.4947363891869916, "learning_rate": 7.692149811010651e-06, "loss": 0.0455, "step": 3486 }, { "epoch": 1.5487452809238285, "grad_norm": 0.3642222145788099, "learning_rate": 7.690516061438287e-06, "loss": 0.0302, "step": 3487 }, { "epoch": 1.549189429269376, "grad_norm": 0.49548313003009337, "learning_rate": 7.688881907435653e-06, "loss": 0.0491, "step": 3488 }, { "epoch": 1.5496335776149235, "grad_norm": 0.5051267185591125, "learning_rate": 7.687247349248393e-06, "loss": 0.037, "step": 3489 }, { "epoch": 1.5500777259604708, "grad_norm": 0.35479785899347316, "learning_rate": 7.685612387122206e-06, "loss": 0.0393, "step": 3490 }, { "epoch": 1.550521874306018, "grad_norm": 0.3491505040065738, "learning_rate": 7.68397702130286e-06, "loss": 0.0391, "step": 3491 }, { "epoch": 1.5509660226515656, "grad_norm": 0.4064600432185325, "learning_rate": 7.682341252036171e-06, "loss": 0.0369, "step": 3492 }, { "epoch": 1.551410170997113, "grad_norm": 0.5118602299189453, "learning_rate": 7.68070507956803e-06, "loss": 0.0455, "step": 3493 }, { "epoch": 1.5518543193426604, "grad_norm": 0.4565926867283545, "learning_rate": 7.679068504144378e-06, "loss": 0.038, "step": 3494 }, { "epoch": 1.552298467688208, "grad_norm": 0.7287584197582313, "learning_rate": 7.677431526011218e-06, "loss": 0.0569, "step": 3495 }, { "epoch": 1.5527426160337554, "grad_norm": 0.5994194003053249, "learning_rate": 7.67579414541462e-06, "loss": 0.0652, "step": 3496 }, { "epoch": 1.5531867643793027, "grad_norm": 0.826708798968826, "learning_rate": 7.674156362600708e-06, "loss": 0.0788, "step": 3497 }, { "epoch": 1.55363091272485, "grad_norm": 0.6304316452414537, "learning_rate": 7.672518177815669e-06, "loss": 0.0447, "step": 3498 }, { "epoch": 1.5540750610703975, "grad_norm": 0.5354755353841119, "learning_rate": 7.67087959130575e-06, "loss": 0.0398, "step": 3499 }, { "epoch": 1.554519209415945, "grad_norm": 0.5926105062715095, "learning_rate": 7.669240603317257e-06, "loss": 0.0495, "step": 3500 }, { "epoch": 1.5549633577614923, "grad_norm": 0.4138530773015362, "learning_rate": 7.66760121409656e-06, "loss": 0.0354, "step": 3501 }, { "epoch": 1.5554075061070396, "grad_norm": 0.44256364969603434, "learning_rate": 7.665961423890085e-06, "loss": 0.045, "step": 3502 }, { "epoch": 1.5558516544525873, "grad_norm": 0.47643325623563626, "learning_rate": 7.664321232944321e-06, "loss": 0.0403, "step": 3503 }, { "epoch": 1.5562958027981346, "grad_norm": 0.3731544040854046, "learning_rate": 7.662680641505817e-06, "loss": 0.0412, "step": 3504 }, { "epoch": 1.556739951143682, "grad_norm": 0.8553804228205281, "learning_rate": 7.661039649821183e-06, "loss": 0.0688, "step": 3505 }, { "epoch": 1.5571840994892294, "grad_norm": 0.5134256859909045, "learning_rate": 7.659398258137085e-06, "loss": 0.044, "step": 3506 }, { "epoch": 1.557628247834777, "grad_norm": 0.5647254301702445, "learning_rate": 7.657756466700252e-06, "loss": 0.0437, "step": 3507 }, { "epoch": 1.5580723961803242, "grad_norm": 0.44604429848268956, "learning_rate": 7.656114275757477e-06, "loss": 0.0399, "step": 3508 }, { "epoch": 1.5585165445258715, "grad_norm": 0.45991000971025053, "learning_rate": 7.654471685555606e-06, "loss": 0.0484, "step": 3509 }, { "epoch": 1.558960692871419, "grad_norm": 0.5298662733782437, "learning_rate": 7.65282869634155e-06, "loss": 0.061, "step": 3510 }, { "epoch": 1.5594048412169665, "grad_norm": 0.49816414515287183, "learning_rate": 7.651185308362276e-06, "loss": 0.0499, "step": 3511 }, { "epoch": 1.5598489895625138, "grad_norm": 0.4052222432440109, "learning_rate": 7.649541521864816e-06, "loss": 0.0451, "step": 3512 }, { "epoch": 1.5602931379080613, "grad_norm": 0.5042113116702226, "learning_rate": 7.647897337096257e-06, "loss": 0.0457, "step": 3513 }, { "epoch": 1.5607372862536089, "grad_norm": 0.7798340643811736, "learning_rate": 7.646252754303746e-06, "loss": 0.1019, "step": 3514 }, { "epoch": 1.5611814345991561, "grad_norm": 0.4972991596269239, "learning_rate": 7.644607773734496e-06, "loss": 0.0578, "step": 3515 }, { "epoch": 1.5616255829447034, "grad_norm": 0.5179606694497955, "learning_rate": 7.642962395635773e-06, "loss": 0.0484, "step": 3516 }, { "epoch": 1.562069731290251, "grad_norm": 0.4239628046426599, "learning_rate": 7.641316620254907e-06, "loss": 0.0401, "step": 3517 }, { "epoch": 1.5625138796357985, "grad_norm": 0.41910897244128137, "learning_rate": 7.639670447839284e-06, "loss": 0.0431, "step": 3518 }, { "epoch": 1.5629580279813458, "grad_norm": 0.6150053023433038, "learning_rate": 7.638023878636353e-06, "loss": 0.0401, "step": 3519 }, { "epoch": 1.563402176326893, "grad_norm": 0.4401882181013756, "learning_rate": 7.63637691289362e-06, "loss": 0.0557, "step": 3520 }, { "epoch": 1.5638463246724406, "grad_norm": 0.4753196454260942, "learning_rate": 7.634729550858652e-06, "loss": 0.0393, "step": 3521 }, { "epoch": 1.564290473017988, "grad_norm": 0.36301949275943124, "learning_rate": 7.633081792779079e-06, "loss": 0.0352, "step": 3522 }, { "epoch": 1.5647346213635354, "grad_norm": 0.6312670336926534, "learning_rate": 7.631433638902583e-06, "loss": 0.0458, "step": 3523 }, { "epoch": 1.5651787697090829, "grad_norm": 0.4853790667151454, "learning_rate": 7.629785089476912e-06, "loss": 0.0465, "step": 3524 }, { "epoch": 1.5656229180546304, "grad_norm": 0.5377885843949731, "learning_rate": 7.628136144749867e-06, "loss": 0.0432, "step": 3525 }, { "epoch": 1.5660670664001777, "grad_norm": 0.4560714160433749, "learning_rate": 7.626486804969316e-06, "loss": 0.0443, "step": 3526 }, { "epoch": 1.566511214745725, "grad_norm": 0.40682805145370315, "learning_rate": 7.624837070383183e-06, "loss": 0.0353, "step": 3527 }, { "epoch": 1.5669553630912725, "grad_norm": 0.4791132302623183, "learning_rate": 7.6231869412394495e-06, "loss": 0.0502, "step": 3528 }, { "epoch": 1.56739951143682, "grad_norm": 0.4343747460657118, "learning_rate": 7.621536417786159e-06, "loss": 0.0395, "step": 3529 }, { "epoch": 1.5678436597823673, "grad_norm": 0.492794572243227, "learning_rate": 7.619885500271413e-06, "loss": 0.0361, "step": 3530 }, { "epoch": 1.5682878081279146, "grad_norm": 0.3645392422465027, "learning_rate": 7.618234188943372e-06, "loss": 0.0389, "step": 3531 }, { "epoch": 1.568731956473462, "grad_norm": 0.43518151556556955, "learning_rate": 7.616582484050256e-06, "loss": 0.037, "step": 3532 }, { "epoch": 1.5691761048190096, "grad_norm": 0.5607869474661537, "learning_rate": 7.614930385840345e-06, "loss": 0.0494, "step": 3533 }, { "epoch": 1.5696202531645569, "grad_norm": 0.3841819768936594, "learning_rate": 7.613277894561978e-06, "loss": 0.0426, "step": 3534 }, { "epoch": 1.5700644015101044, "grad_norm": 0.3817054851480292, "learning_rate": 7.611625010463549e-06, "loss": 0.045, "step": 3535 }, { "epoch": 1.570508549855652, "grad_norm": 0.8877604841062007, "learning_rate": 7.60997173379352e-06, "loss": 0.0511, "step": 3536 }, { "epoch": 1.5709526982011992, "grad_norm": 0.5367152034215165, "learning_rate": 7.608318064800403e-06, "loss": 0.0437, "step": 3537 }, { "epoch": 1.5713968465467465, "grad_norm": 0.37970223029486955, "learning_rate": 7.606664003732771e-06, "loss": 0.0426, "step": 3538 }, { "epoch": 1.571840994892294, "grad_norm": 0.5586015168302153, "learning_rate": 7.605009550839263e-06, "loss": 0.0443, "step": 3539 }, { "epoch": 1.5722851432378415, "grad_norm": 0.5314909266145166, "learning_rate": 7.603354706368567e-06, "loss": 0.0482, "step": 3540 }, { "epoch": 1.5727292915833888, "grad_norm": 0.6999799538645216, "learning_rate": 7.601699470569434e-06, "loss": 0.0481, "step": 3541 }, { "epoch": 1.573173439928936, "grad_norm": 0.6037904990943627, "learning_rate": 7.600043843690677e-06, "loss": 0.0563, "step": 3542 }, { "epoch": 1.5736175882744838, "grad_norm": 0.47456339272048825, "learning_rate": 7.5983878259811625e-06, "loss": 0.069, "step": 3543 }, { "epoch": 1.5740617366200311, "grad_norm": 0.9776402251476092, "learning_rate": 7.59673141768982e-06, "loss": 0.0634, "step": 3544 }, { "epoch": 1.5745058849655784, "grad_norm": 0.430567824394842, "learning_rate": 7.595074619065635e-06, "loss": 0.0389, "step": 3545 }, { "epoch": 1.574950033311126, "grad_norm": 0.592046861388675, "learning_rate": 7.593417430357649e-06, "loss": 0.0569, "step": 3546 }, { "epoch": 1.5753941816566734, "grad_norm": 0.528657961671009, "learning_rate": 7.591759851814972e-06, "loss": 0.0477, "step": 3547 }, { "epoch": 1.5758383300022207, "grad_norm": 0.6683751656639313, "learning_rate": 7.590101883686761e-06, "loss": 0.0567, "step": 3548 }, { "epoch": 1.576282478347768, "grad_norm": 0.6152824586906789, "learning_rate": 7.58844352622224e-06, "loss": 0.0514, "step": 3549 }, { "epoch": 1.5767266266933155, "grad_norm": 0.5175459666893117, "learning_rate": 7.5867847796706865e-06, "loss": 0.0466, "step": 3550 }, { "epoch": 1.577170775038863, "grad_norm": 0.46509447514156393, "learning_rate": 7.585125644281439e-06, "loss": 0.0356, "step": 3551 }, { "epoch": 1.5776149233844103, "grad_norm": 0.42445822473522876, "learning_rate": 7.583466120303893e-06, "loss": 0.0368, "step": 3552 }, { "epoch": 1.5780590717299579, "grad_norm": 0.5278835867879181, "learning_rate": 7.581806207987504e-06, "loss": 0.0412, "step": 3553 }, { "epoch": 1.5785032200755054, "grad_norm": 0.49560125396493776, "learning_rate": 7.5801459075817865e-06, "loss": 0.0435, "step": 3554 }, { "epoch": 1.5789473684210527, "grad_norm": 0.5441296297037692, "learning_rate": 7.578485219336307e-06, "loss": 0.0408, "step": 3555 }, { "epoch": 1.5793915167666, "grad_norm": 0.5569362734290482, "learning_rate": 7.5768241435007e-06, "loss": 0.0508, "step": 3556 }, { "epoch": 1.5798356651121475, "grad_norm": 0.688395754837478, "learning_rate": 7.57516268032465e-06, "loss": 0.0575, "step": 3557 }, { "epoch": 1.580279813457695, "grad_norm": 0.5366509995471049, "learning_rate": 7.573500830057907e-06, "loss": 0.0393, "step": 3558 }, { "epoch": 1.5807239618032423, "grad_norm": 0.5521349728712537, "learning_rate": 7.571838592950271e-06, "loss": 0.0464, "step": 3559 }, { "epoch": 1.5811681101487896, "grad_norm": 0.3794603822110102, "learning_rate": 7.570175969251609e-06, "loss": 0.0271, "step": 3560 }, { "epoch": 1.581612258494337, "grad_norm": 0.954806853873049, "learning_rate": 7.568512959211838e-06, "loss": 0.0798, "step": 3561 }, { "epoch": 1.5820564068398846, "grad_norm": 0.3774549347222219, "learning_rate": 7.566849563080938e-06, "loss": 0.034, "step": 3562 }, { "epoch": 1.5825005551854319, "grad_norm": 0.5717245905532253, "learning_rate": 7.565185781108944e-06, "loss": 0.0499, "step": 3563 }, { "epoch": 1.5829447035309794, "grad_norm": 0.5372548568959261, "learning_rate": 7.563521613545954e-06, "loss": 0.0543, "step": 3564 }, { "epoch": 1.583388851876527, "grad_norm": 0.6132605143548788, "learning_rate": 7.56185706064212e-06, "loss": 0.0513, "step": 3565 }, { "epoch": 1.5838330002220742, "grad_norm": 0.4784620611087414, "learning_rate": 7.560192122647647e-06, "loss": 0.0507, "step": 3566 }, { "epoch": 1.5842771485676215, "grad_norm": 0.46891721716697665, "learning_rate": 7.558526799812812e-06, "loss": 0.0447, "step": 3567 }, { "epoch": 1.584721296913169, "grad_norm": 0.6007245405053189, "learning_rate": 7.556861092387937e-06, "loss": 0.0461, "step": 3568 }, { "epoch": 1.5851654452587165, "grad_norm": 0.6832271146214529, "learning_rate": 7.555195000623404e-06, "loss": 0.0615, "step": 3569 }, { "epoch": 1.5856095936042638, "grad_norm": 0.628507751994035, "learning_rate": 7.553528524769658e-06, "loss": 0.0511, "step": 3570 }, { "epoch": 1.586053741949811, "grad_norm": 0.47038947814194393, "learning_rate": 7.551861665077199e-06, "loss": 0.0503, "step": 3571 }, { "epoch": 1.5864978902953588, "grad_norm": 0.5039990895794063, "learning_rate": 7.550194421796583e-06, "loss": 0.0757, "step": 3572 }, { "epoch": 1.586942038640906, "grad_norm": 0.43304671963904556, "learning_rate": 7.548526795178424e-06, "loss": 0.0435, "step": 3573 }, { "epoch": 1.5873861869864534, "grad_norm": 0.35567796030147675, "learning_rate": 7.546858785473397e-06, "loss": 0.0411, "step": 3574 }, { "epoch": 1.587830335332001, "grad_norm": 0.5082433548982926, "learning_rate": 7.54519039293223e-06, "loss": 0.0476, "step": 3575 }, { "epoch": 1.5882744836775484, "grad_norm": 0.4853723595468299, "learning_rate": 7.543521617805711e-06, "loss": 0.0403, "step": 3576 }, { "epoch": 1.5887186320230957, "grad_norm": 0.4349996299659683, "learning_rate": 7.541852460344687e-06, "loss": 0.0444, "step": 3577 }, { "epoch": 1.589162780368643, "grad_norm": 0.3738858521887595, "learning_rate": 7.540182920800061e-06, "loss": 0.0342, "step": 3578 }, { "epoch": 1.5896069287141905, "grad_norm": 0.3860111205972228, "learning_rate": 7.5385129994227916e-06, "loss": 0.0326, "step": 3579 }, { "epoch": 1.590051077059738, "grad_norm": 0.4607949804916419, "learning_rate": 7.536842696463894e-06, "loss": 0.0406, "step": 3580 }, { "epoch": 1.5904952254052853, "grad_norm": 0.4090438356252611, "learning_rate": 7.535172012174447e-06, "loss": 0.0435, "step": 3581 }, { "epoch": 1.5909393737508328, "grad_norm": 0.44992279283684267, "learning_rate": 7.533500946805583e-06, "loss": 0.0433, "step": 3582 }, { "epoch": 1.5913835220963803, "grad_norm": 0.5608893338456544, "learning_rate": 7.531829500608489e-06, "loss": 0.0461, "step": 3583 }, { "epoch": 1.5918276704419276, "grad_norm": 0.46587297637504893, "learning_rate": 7.530157673834413e-06, "loss": 0.0462, "step": 3584 }, { "epoch": 1.592271818787475, "grad_norm": 0.4650339559246697, "learning_rate": 7.528485466734658e-06, "loss": 0.0463, "step": 3585 }, { "epoch": 1.5927159671330224, "grad_norm": 0.6227086084109071, "learning_rate": 7.526812879560586e-06, "loss": 0.0579, "step": 3586 }, { "epoch": 1.59316011547857, "grad_norm": 0.38020073620262274, "learning_rate": 7.525139912563616e-06, "loss": 0.0356, "step": 3587 }, { "epoch": 1.5936042638241172, "grad_norm": 0.4538280786991795, "learning_rate": 7.523466565995224e-06, "loss": 0.041, "step": 3588 }, { "epoch": 1.5940484121696645, "grad_norm": 0.41382625396603623, "learning_rate": 7.521792840106937e-06, "loss": 0.0309, "step": 3589 }, { "epoch": 1.594492560515212, "grad_norm": 0.6086317820490929, "learning_rate": 7.52011873515035e-06, "loss": 0.055, "step": 3590 }, { "epoch": 1.5949367088607596, "grad_norm": 0.5518315774986915, "learning_rate": 7.518444251377108e-06, "loss": 0.0448, "step": 3591 }, { "epoch": 1.5953808572063068, "grad_norm": 0.46458719141632254, "learning_rate": 7.516769389038915e-06, "loss": 0.0375, "step": 3592 }, { "epoch": 1.5958250055518544, "grad_norm": 0.6946531905139686, "learning_rate": 7.515094148387529e-06, "loss": 0.0637, "step": 3593 }, { "epoch": 1.5962691538974019, "grad_norm": 0.47113343903410826, "learning_rate": 7.51341852967477e-06, "loss": 0.0403, "step": 3594 }, { "epoch": 1.5967133022429492, "grad_norm": 0.34877248157599594, "learning_rate": 7.511742533152509e-06, "loss": 0.035, "step": 3595 }, { "epoch": 1.5971574505884965, "grad_norm": 0.6597113801136277, "learning_rate": 7.51006615907268e-06, "loss": 0.0419, "step": 3596 }, { "epoch": 1.597601598934044, "grad_norm": 0.6377364313042297, "learning_rate": 7.508389407687267e-06, "loss": 0.0503, "step": 3597 }, { "epoch": 1.5980457472795915, "grad_norm": 0.45492565688398423, "learning_rate": 7.506712279248316e-06, "loss": 0.0399, "step": 3598 }, { "epoch": 1.5984898956251388, "grad_norm": 0.4963351358417883, "learning_rate": 7.5050347740079285e-06, "loss": 0.0404, "step": 3599 }, { "epoch": 1.598934043970686, "grad_norm": 0.3821109498168256, "learning_rate": 7.503356892218261e-06, "loss": 0.0341, "step": 3600 }, { "epoch": 1.5993781923162336, "grad_norm": 0.5218817014308058, "learning_rate": 7.501678634131528e-06, "loss": 0.0457, "step": 3601 }, { "epoch": 1.599822340661781, "grad_norm": 0.48140971211980904, "learning_rate": 7.500000000000001e-06, "loss": 0.0416, "step": 3602 }, { "epoch": 1.6002664890073284, "grad_norm": 0.6724090547593772, "learning_rate": 7.498320990076006e-06, "loss": 0.0407, "step": 3603 }, { "epoch": 1.6007106373528759, "grad_norm": 0.4673511339888186, "learning_rate": 7.496641604611926e-06, "loss": 0.0445, "step": 3604 }, { "epoch": 1.6011547856984234, "grad_norm": 0.4736546778146936, "learning_rate": 7.494961843860204e-06, "loss": 0.0457, "step": 3605 }, { "epoch": 1.6015989340439707, "grad_norm": 0.46683144779872127, "learning_rate": 7.4932817080733345e-06, "loss": 0.0534, "step": 3606 }, { "epoch": 1.602043082389518, "grad_norm": 0.38959987746176267, "learning_rate": 7.491601197503871e-06, "loss": 0.037, "step": 3607 }, { "epoch": 1.6024872307350655, "grad_norm": 0.5453813196476248, "learning_rate": 7.489920312404422e-06, "loss": 0.0524, "step": 3608 }, { "epoch": 1.602931379080613, "grad_norm": 0.5800473949841606, "learning_rate": 7.488239053027653e-06, "loss": 0.0575, "step": 3609 }, { "epoch": 1.6033755274261603, "grad_norm": 0.5289683983594908, "learning_rate": 7.486557419626288e-06, "loss": 0.0548, "step": 3610 }, { "epoch": 1.6038196757717076, "grad_norm": 0.6070529095018132, "learning_rate": 7.484875412453102e-06, "loss": 0.0412, "step": 3611 }, { "epoch": 1.6042638241172553, "grad_norm": 0.589551206830852, "learning_rate": 7.483193031760932e-06, "loss": 0.0548, "step": 3612 }, { "epoch": 1.6047079724628026, "grad_norm": 0.4017721125553618, "learning_rate": 7.481510277802667e-06, "loss": 0.0294, "step": 3613 }, { "epoch": 1.60515212080835, "grad_norm": 0.45650083282856746, "learning_rate": 7.479827150831254e-06, "loss": 0.0368, "step": 3614 }, { "epoch": 1.6055962691538974, "grad_norm": 0.3377636070281279, "learning_rate": 7.478143651099694e-06, "loss": 0.0361, "step": 3615 }, { "epoch": 1.606040417499445, "grad_norm": 0.4206873654117964, "learning_rate": 7.4764597788610496e-06, "loss": 0.0373, "step": 3616 }, { "epoch": 1.6064845658449922, "grad_norm": 0.5196617152819853, "learning_rate": 7.47477553436843e-06, "loss": 0.0535, "step": 3617 }, { "epoch": 1.6069287141905395, "grad_norm": 0.499359501911801, "learning_rate": 7.47309091787501e-06, "loss": 0.0414, "step": 3618 }, { "epoch": 1.607372862536087, "grad_norm": 0.5994533032039974, "learning_rate": 7.471405929634014e-06, "loss": 0.0521, "step": 3619 }, { "epoch": 1.6078170108816345, "grad_norm": 0.3309953500833457, "learning_rate": 7.469720569898725e-06, "loss": 0.0331, "step": 3620 }, { "epoch": 1.6082611592271818, "grad_norm": 0.4509086857710446, "learning_rate": 7.468034838922482e-06, "loss": 0.0458, "step": 3621 }, { "epoch": 1.6087053075727293, "grad_norm": 0.45137595767758343, "learning_rate": 7.4663487369586776e-06, "loss": 0.0469, "step": 3622 }, { "epoch": 1.6091494559182768, "grad_norm": 0.4696695504363821, "learning_rate": 7.464662264260761e-06, "loss": 0.0452, "step": 3623 }, { "epoch": 1.6095936042638241, "grad_norm": 0.4596849970347727, "learning_rate": 7.46297542108224e-06, "loss": 0.0438, "step": 3624 }, { "epoch": 1.6100377526093714, "grad_norm": 0.5614676347080366, "learning_rate": 7.4612882076766744e-06, "loss": 0.0599, "step": 3625 }, { "epoch": 1.610481900954919, "grad_norm": 0.3947391918010033, "learning_rate": 7.459600624297681e-06, "loss": 0.0347, "step": 3626 }, { "epoch": 1.6109260493004665, "grad_norm": 0.6133457502317237, "learning_rate": 7.4579126711989326e-06, "loss": 0.053, "step": 3627 }, { "epoch": 1.6113701976460137, "grad_norm": 0.45537205293561567, "learning_rate": 7.456224348634158e-06, "loss": 0.0436, "step": 3628 }, { "epoch": 1.611814345991561, "grad_norm": 0.46644204998070415, "learning_rate": 7.454535656857138e-06, "loss": 0.0492, "step": 3629 }, { "epoch": 1.6122584943371085, "grad_norm": 0.5228600296837344, "learning_rate": 7.4528465961217145e-06, "loss": 0.0521, "step": 3630 }, { "epoch": 1.612702642682656, "grad_norm": 0.5557514684322911, "learning_rate": 7.451157166681781e-06, "loss": 0.0526, "step": 3631 }, { "epoch": 1.6131467910282034, "grad_norm": 0.4758338953909275, "learning_rate": 7.449467368791287e-06, "loss": 0.0417, "step": 3632 }, { "epoch": 1.6135909393737509, "grad_norm": 0.5619608884775461, "learning_rate": 7.4477772027042395e-06, "loss": 0.0582, "step": 3633 }, { "epoch": 1.6140350877192984, "grad_norm": 0.46236286351850703, "learning_rate": 7.4460866686746966e-06, "loss": 0.0384, "step": 3634 }, { "epoch": 1.6144792360648457, "grad_norm": 0.44941172754104425, "learning_rate": 7.444395766956776e-06, "loss": 0.0451, "step": 3635 }, { "epoch": 1.614923384410393, "grad_norm": 0.5343367958828964, "learning_rate": 7.4427044978046496e-06, "loss": 0.0539, "step": 3636 }, { "epoch": 1.6153675327559405, "grad_norm": 0.7114764542018213, "learning_rate": 7.4410128614725406e-06, "loss": 0.0632, "step": 3637 }, { "epoch": 1.615811681101488, "grad_norm": 0.33754698523568316, "learning_rate": 7.439320858214736e-06, "loss": 0.0432, "step": 3638 }, { "epoch": 1.6162558294470353, "grad_norm": 0.43643183273515707, "learning_rate": 7.437628488285568e-06, "loss": 0.0379, "step": 3639 }, { "epoch": 1.6166999777925826, "grad_norm": 0.44390629309704815, "learning_rate": 7.435935751939429e-06, "loss": 0.0446, "step": 3640 }, { "epoch": 1.6171441261381303, "grad_norm": 0.4354396645312695, "learning_rate": 7.4342426494307695e-06, "loss": 0.043, "step": 3641 }, { "epoch": 1.6175882744836776, "grad_norm": 0.44873911764176055, "learning_rate": 7.432549181014088e-06, "loss": 0.0302, "step": 3642 }, { "epoch": 1.6180324228292249, "grad_norm": 0.41154412795322054, "learning_rate": 7.430855346943942e-06, "loss": 0.0395, "step": 3643 }, { "epoch": 1.6184765711747724, "grad_norm": 0.5287832714190103, "learning_rate": 7.4291611474749455e-06, "loss": 0.0473, "step": 3644 }, { "epoch": 1.61892071952032, "grad_norm": 0.4007407767223763, "learning_rate": 7.427466582861765e-06, "loss": 0.0378, "step": 3645 }, { "epoch": 1.6193648678658672, "grad_norm": 0.58929177581027, "learning_rate": 7.42577165335912e-06, "loss": 0.0551, "step": 3646 }, { "epoch": 1.6198090162114145, "grad_norm": 0.3940542333112033, "learning_rate": 7.42407635922179e-06, "loss": 0.0294, "step": 3647 }, { "epoch": 1.620253164556962, "grad_norm": 0.3862139275428533, "learning_rate": 7.4223807007046045e-06, "loss": 0.0318, "step": 3648 }, { "epoch": 1.6206973129025095, "grad_norm": 0.4233634000349271, "learning_rate": 7.4206846780624505e-06, "loss": 0.0351, "step": 3649 }, { "epoch": 1.6211414612480568, "grad_norm": 0.595696689600085, "learning_rate": 7.418988291550271e-06, "loss": 0.05, "step": 3650 }, { "epoch": 1.6215856095936043, "grad_norm": 0.45676187183693634, "learning_rate": 7.417291541423057e-06, "loss": 0.0408, "step": 3651 }, { "epoch": 1.6220297579391518, "grad_norm": 0.5662085190370286, "learning_rate": 7.415594427935864e-06, "loss": 0.0449, "step": 3652 }, { "epoch": 1.6224739062846991, "grad_norm": 0.5893188179834489, "learning_rate": 7.4138969513437945e-06, "loss": 0.0514, "step": 3653 }, { "epoch": 1.6229180546302464, "grad_norm": 1.1475511733222958, "learning_rate": 7.412199111902007e-06, "loss": 0.0539, "step": 3654 }, { "epoch": 1.623362202975794, "grad_norm": 0.4595814551148835, "learning_rate": 7.410500909865718e-06, "loss": 0.0422, "step": 3655 }, { "epoch": 1.6238063513213414, "grad_norm": 0.4905543614968632, "learning_rate": 7.408802345490194e-06, "loss": 0.0426, "step": 3656 }, { "epoch": 1.6242504996668887, "grad_norm": 0.44546697092666765, "learning_rate": 7.407103419030759e-06, "loss": 0.0543, "step": 3657 }, { "epoch": 1.624694648012436, "grad_norm": 0.4740804743160003, "learning_rate": 7.405404130742793e-06, "loss": 0.0493, "step": 3658 }, { "epoch": 1.6251387963579835, "grad_norm": 1.5473311509032195, "learning_rate": 7.4037044808817224e-06, "loss": 0.0513, "step": 3659 }, { "epoch": 1.625582944703531, "grad_norm": 0.6098239500874039, "learning_rate": 7.402004469703038e-06, "loss": 0.0367, "step": 3660 }, { "epoch": 1.6260270930490783, "grad_norm": 0.6586494701114967, "learning_rate": 7.4003040974622784e-06, "loss": 0.0585, "step": 3661 }, { "epoch": 1.6264712413946258, "grad_norm": 0.45627803806709566, "learning_rate": 7.39860336441504e-06, "loss": 0.048, "step": 3662 }, { "epoch": 1.6269153897401734, "grad_norm": 0.5771583864203911, "learning_rate": 7.3969022708169695e-06, "loss": 0.0378, "step": 3663 }, { "epoch": 1.6273595380857206, "grad_norm": 0.4931171394680339, "learning_rate": 7.395200816923774e-06, "loss": 0.033, "step": 3664 }, { "epoch": 1.627803686431268, "grad_norm": 0.7206307962976604, "learning_rate": 7.393499002991206e-06, "loss": 0.0541, "step": 3665 }, { "epoch": 1.6282478347768155, "grad_norm": 0.4622361606059328, "learning_rate": 7.3917968292750785e-06, "loss": 0.0415, "step": 3666 }, { "epoch": 1.628691983122363, "grad_norm": 0.7660149085068831, "learning_rate": 7.390094296031259e-06, "loss": 0.0627, "step": 3667 }, { "epoch": 1.6291361314679103, "grad_norm": 0.7776923661917167, "learning_rate": 7.3883914035156666e-06, "loss": 0.0396, "step": 3668 }, { "epoch": 1.6295802798134575, "grad_norm": 0.6556574717804856, "learning_rate": 7.386688151984275e-06, "loss": 0.0449, "step": 3669 }, { "epoch": 1.630024428159005, "grad_norm": 0.6486302576198958, "learning_rate": 7.384984541693111e-06, "loss": 0.0456, "step": 3670 }, { "epoch": 1.6304685765045526, "grad_norm": 0.38753606074247743, "learning_rate": 7.383280572898256e-06, "loss": 0.0374, "step": 3671 }, { "epoch": 1.6309127248500999, "grad_norm": 0.4446665040074296, "learning_rate": 7.381576245855847e-06, "loss": 0.0399, "step": 3672 }, { "epoch": 1.6313568731956474, "grad_norm": 0.6808941373857945, "learning_rate": 7.379871560822071e-06, "loss": 0.0523, "step": 3673 }, { "epoch": 1.6318010215411949, "grad_norm": 0.6878067318911092, "learning_rate": 7.378166518053174e-06, "loss": 0.0557, "step": 3674 }, { "epoch": 1.6322451698867422, "grad_norm": 0.4460005003836504, "learning_rate": 7.37646111780545e-06, "loss": 0.0338, "step": 3675 }, { "epoch": 1.6326893182322895, "grad_norm": 0.392173745509544, "learning_rate": 7.374755360335253e-06, "loss": 0.0358, "step": 3676 }, { "epoch": 1.633133466577837, "grad_norm": 0.4942681170970006, "learning_rate": 7.3730492458989825e-06, "loss": 0.0451, "step": 3677 }, { "epoch": 1.6335776149233845, "grad_norm": 0.35125391907425363, "learning_rate": 7.371342774753101e-06, "loss": 0.0376, "step": 3678 }, { "epoch": 1.6340217632689318, "grad_norm": 0.5143098431682592, "learning_rate": 7.369635947154119e-06, "loss": 0.0469, "step": 3679 }, { "epoch": 1.634465911614479, "grad_norm": 0.5196302676867172, "learning_rate": 7.3679287633585995e-06, "loss": 0.0436, "step": 3680 }, { "epoch": 1.6349100599600268, "grad_norm": 0.5148856987270073, "learning_rate": 7.366221223623163e-06, "loss": 0.0429, "step": 3681 }, { "epoch": 1.635354208305574, "grad_norm": 0.42764696599583774, "learning_rate": 7.3645133282044835e-06, "loss": 0.0333, "step": 3682 }, { "epoch": 1.6357983566511214, "grad_norm": 0.3662627068077827, "learning_rate": 7.362805077359283e-06, "loss": 0.0312, "step": 3683 }, { "epoch": 1.636242504996669, "grad_norm": 0.5318925951291891, "learning_rate": 7.361096471344341e-06, "loss": 0.0392, "step": 3684 }, { "epoch": 1.6366866533422164, "grad_norm": 0.5101772714710058, "learning_rate": 7.359387510416494e-06, "loss": 0.0444, "step": 3685 }, { "epoch": 1.6371308016877637, "grad_norm": 0.6838175180628748, "learning_rate": 7.357678194832623e-06, "loss": 0.0443, "step": 3686 }, { "epoch": 1.637574950033311, "grad_norm": 0.4298262192631913, "learning_rate": 7.355968524849671e-06, "loss": 0.0397, "step": 3687 }, { "epoch": 1.6380190983788585, "grad_norm": 0.5227260046796232, "learning_rate": 7.354258500724627e-06, "loss": 0.0428, "step": 3688 }, { "epoch": 1.638463246724406, "grad_norm": 0.3848693505144518, "learning_rate": 7.352548122714541e-06, "loss": 0.038, "step": 3689 }, { "epoch": 1.6389073950699533, "grad_norm": 0.35605605527782813, "learning_rate": 7.350837391076509e-06, "loss": 0.0385, "step": 3690 }, { "epoch": 1.6393515434155008, "grad_norm": 0.5496746740887447, "learning_rate": 7.349126306067681e-06, "loss": 0.0399, "step": 3691 }, { "epoch": 1.6397956917610483, "grad_norm": 0.3577570105037349, "learning_rate": 7.347414867945266e-06, "loss": 0.035, "step": 3692 }, { "epoch": 1.6402398401065956, "grad_norm": 0.4964615292337297, "learning_rate": 7.345703076966522e-06, "loss": 0.0517, "step": 3693 }, { "epoch": 1.640683988452143, "grad_norm": 0.7390592067058361, "learning_rate": 7.343990933388757e-06, "loss": 0.0367, "step": 3694 }, { "epoch": 1.6411281367976904, "grad_norm": 0.3720816990200118, "learning_rate": 7.342278437469338e-06, "loss": 0.0385, "step": 3695 }, { "epoch": 1.641572285143238, "grad_norm": 0.4417091992699088, "learning_rate": 7.340565589465681e-06, "loss": 0.0496, "step": 3696 }, { "epoch": 1.6420164334887852, "grad_norm": 0.4387119629881648, "learning_rate": 7.338852389635258e-06, "loss": 0.0482, "step": 3697 }, { "epoch": 1.6424605818343325, "grad_norm": 0.5237427797642873, "learning_rate": 7.337138838235589e-06, "loss": 0.0397, "step": 3698 }, { "epoch": 1.64290473017988, "grad_norm": 0.5126897468237226, "learning_rate": 7.335424935524254e-06, "loss": 0.0354, "step": 3699 }, { "epoch": 1.6433488785254275, "grad_norm": 0.5632224593772709, "learning_rate": 7.333710681758876e-06, "loss": 0.0515, "step": 3700 }, { "epoch": 1.6437930268709748, "grad_norm": 0.37651906930453466, "learning_rate": 7.331996077197141e-06, "loss": 0.0263, "step": 3701 }, { "epoch": 1.6442371752165224, "grad_norm": 0.403180672879644, "learning_rate": 7.330281122096783e-06, "loss": 0.0361, "step": 3702 }, { "epoch": 1.6446813235620699, "grad_norm": 0.3870458503286661, "learning_rate": 7.328565816715587e-06, "loss": 0.0391, "step": 3703 }, { "epoch": 1.6451254719076172, "grad_norm": 0.46910403259693756, "learning_rate": 7.326850161311394e-06, "loss": 0.0539, "step": 3704 }, { "epoch": 1.6455696202531644, "grad_norm": 0.6277176707566176, "learning_rate": 7.325134156142093e-06, "loss": 0.0434, "step": 3705 }, { "epoch": 1.646013768598712, "grad_norm": 0.5311345861511363, "learning_rate": 7.323417801465633e-06, "loss": 0.0528, "step": 3706 }, { "epoch": 1.6464579169442595, "grad_norm": 0.49558316497497373, "learning_rate": 7.32170109754001e-06, "loss": 0.0518, "step": 3707 }, { "epoch": 1.6469020652898068, "grad_norm": 0.4215235777284794, "learning_rate": 7.319984044623274e-06, "loss": 0.0374, "step": 3708 }, { "epoch": 1.647346213635354, "grad_norm": 0.7018323227064814, "learning_rate": 7.3182666429735236e-06, "loss": 0.0589, "step": 3709 }, { "epoch": 1.6477903619809018, "grad_norm": 0.3811938009083553, "learning_rate": 7.316548892848919e-06, "loss": 0.0336, "step": 3710 }, { "epoch": 1.648234510326449, "grad_norm": 0.48162066324285185, "learning_rate": 7.314830794507664e-06, "loss": 0.0392, "step": 3711 }, { "epoch": 1.6486786586719964, "grad_norm": 0.7241447798938966, "learning_rate": 7.313112348208017e-06, "loss": 0.0705, "step": 3712 }, { "epoch": 1.6491228070175439, "grad_norm": 0.4040868157055559, "learning_rate": 7.311393554208292e-06, "loss": 0.0417, "step": 3713 }, { "epoch": 1.6495669553630914, "grad_norm": 0.43212705890325137, "learning_rate": 7.3096744127668515e-06, "loss": 0.0516, "step": 3714 }, { "epoch": 1.6500111037086387, "grad_norm": 0.7227761525435585, "learning_rate": 7.307954924142113e-06, "loss": 0.0427, "step": 3715 }, { "epoch": 1.650455252054186, "grad_norm": 0.6454421002922318, "learning_rate": 7.306235088592545e-06, "loss": 0.0709, "step": 3716 }, { "epoch": 1.6508994003997335, "grad_norm": 0.5263009072366842, "learning_rate": 7.304514906376665e-06, "loss": 0.0354, "step": 3717 }, { "epoch": 1.651343548745281, "grad_norm": 0.4475608856046002, "learning_rate": 7.3027943777530504e-06, "loss": 0.0376, "step": 3718 }, { "epoch": 1.6517876970908283, "grad_norm": 0.4187029855449, "learning_rate": 7.301073502980321e-06, "loss": 0.032, "step": 3719 }, { "epoch": 1.6522318454363758, "grad_norm": 0.4688452297403804, "learning_rate": 7.299352282317156e-06, "loss": 0.0328, "step": 3720 }, { "epoch": 1.6526759937819233, "grad_norm": 0.4905958290493516, "learning_rate": 7.297630716022285e-06, "loss": 0.0408, "step": 3721 }, { "epoch": 1.6531201421274706, "grad_norm": 0.5196371163361964, "learning_rate": 7.295908804354486e-06, "loss": 0.0518, "step": 3722 }, { "epoch": 1.653564290473018, "grad_norm": 0.5745303077003567, "learning_rate": 7.294186547572593e-06, "loss": 0.0525, "step": 3723 }, { "epoch": 1.6540084388185654, "grad_norm": 0.4232397957508058, "learning_rate": 7.292463945935492e-06, "loss": 0.046, "step": 3724 }, { "epoch": 1.654452587164113, "grad_norm": 0.5715502169683124, "learning_rate": 7.290740999702117e-06, "loss": 0.0502, "step": 3725 }, { "epoch": 1.6548967355096602, "grad_norm": 0.45041445198482544, "learning_rate": 7.289017709131456e-06, "loss": 0.0385, "step": 3726 }, { "epoch": 1.6553408838552075, "grad_norm": 0.5898309113830352, "learning_rate": 7.287294074482551e-06, "loss": 0.0586, "step": 3727 }, { "epoch": 1.655785032200755, "grad_norm": 0.42537829603624505, "learning_rate": 7.285570096014491e-06, "loss": 0.0382, "step": 3728 }, { "epoch": 1.6562291805463025, "grad_norm": 0.5068748041039001, "learning_rate": 7.283845773986421e-06, "loss": 0.0424, "step": 3729 }, { "epoch": 1.6566733288918498, "grad_norm": 0.49158429307244256, "learning_rate": 7.2821211086575365e-06, "loss": 0.0413, "step": 3730 }, { "epoch": 1.6571174772373973, "grad_norm": 0.5434741115681563, "learning_rate": 7.280396100287082e-06, "loss": 0.0415, "step": 3731 }, { "epoch": 1.6575616255829448, "grad_norm": 0.54047842476067, "learning_rate": 7.278670749134356e-06, "loss": 0.0337, "step": 3732 }, { "epoch": 1.6580057739284921, "grad_norm": 0.7610613678272631, "learning_rate": 7.276945055458709e-06, "loss": 0.0513, "step": 3733 }, { "epoch": 1.6584499222740394, "grad_norm": 0.43103079664602795, "learning_rate": 7.275219019519542e-06, "loss": 0.0365, "step": 3734 }, { "epoch": 1.658894070619587, "grad_norm": 0.6707297267218786, "learning_rate": 7.2734926415763074e-06, "loss": 0.0425, "step": 3735 }, { "epoch": 1.6593382189651344, "grad_norm": 0.3906372094311339, "learning_rate": 7.271765921888507e-06, "loss": 0.0379, "step": 3736 }, { "epoch": 1.6597823673106817, "grad_norm": 0.8497034117571121, "learning_rate": 7.2700388607157e-06, "loss": 0.0592, "step": 3737 }, { "epoch": 1.660226515656229, "grad_norm": 0.4906869751133543, "learning_rate": 7.268311458317491e-06, "loss": 0.0448, "step": 3738 }, { "epoch": 1.6606706640017765, "grad_norm": 0.46268372537996755, "learning_rate": 7.266583714953536e-06, "loss": 0.0363, "step": 3739 }, { "epoch": 1.661114812347324, "grad_norm": 0.3974000929871205, "learning_rate": 7.2648556308835476e-06, "loss": 0.0375, "step": 3740 }, { "epoch": 1.6615589606928713, "grad_norm": 0.3782320844424881, "learning_rate": 7.263127206367285e-06, "loss": 0.0327, "step": 3741 }, { "epoch": 1.6620031090384189, "grad_norm": 0.7073636631807192, "learning_rate": 7.2613984416645586e-06, "loss": 0.0549, "step": 3742 }, { "epoch": 1.6624472573839664, "grad_norm": 0.48125971182332594, "learning_rate": 7.2596693370352325e-06, "loss": 0.0342, "step": 3743 }, { "epoch": 1.6628914057295137, "grad_norm": 0.4622696042568772, "learning_rate": 7.257939892739221e-06, "loss": 0.0366, "step": 3744 }, { "epoch": 1.663335554075061, "grad_norm": 0.4506482837249478, "learning_rate": 7.256210109036485e-06, "loss": 0.0399, "step": 3745 }, { "epoch": 1.6637797024206085, "grad_norm": 0.468794971821978, "learning_rate": 7.254479986187045e-06, "loss": 0.0486, "step": 3746 }, { "epoch": 1.664223850766156, "grad_norm": 0.461349555479564, "learning_rate": 7.252749524450967e-06, "loss": 0.0399, "step": 3747 }, { "epoch": 1.6646679991117033, "grad_norm": 0.35487507978196975, "learning_rate": 7.251018724088367e-06, "loss": 0.0399, "step": 3748 }, { "epoch": 1.6651121474572506, "grad_norm": 0.5665408950529203, "learning_rate": 7.249287585359416e-06, "loss": 0.0515, "step": 3749 }, { "epoch": 1.6655562958027983, "grad_norm": 0.47765729176525834, "learning_rate": 7.24755610852433e-06, "loss": 0.0524, "step": 3750 }, { "epoch": 1.6660004441483456, "grad_norm": 0.49560492659442207, "learning_rate": 7.245824293843382e-06, "loss": 0.0474, "step": 3751 }, { "epoch": 1.6664445924938929, "grad_norm": 0.47858236763124845, "learning_rate": 7.244092141576895e-06, "loss": 0.0404, "step": 3752 }, { "epoch": 1.6668887408394404, "grad_norm": 0.5242443798051569, "learning_rate": 7.2423596519852354e-06, "loss": 0.0564, "step": 3753 }, { "epoch": 1.667332889184988, "grad_norm": 0.4276118939925525, "learning_rate": 7.240626825328832e-06, "loss": 0.0396, "step": 3754 }, { "epoch": 1.6677770375305352, "grad_norm": 0.365991499711862, "learning_rate": 7.238893661868154e-06, "loss": 0.0373, "step": 3755 }, { "epoch": 1.6682211858760825, "grad_norm": 0.614415895237667, "learning_rate": 7.237160161863725e-06, "loss": 0.0528, "step": 3756 }, { "epoch": 1.66866533422163, "grad_norm": 0.40303741888400607, "learning_rate": 7.235426325576123e-06, "loss": 0.0407, "step": 3757 }, { "epoch": 1.6691094825671775, "grad_norm": 0.5735579153246894, "learning_rate": 7.23369215326597e-06, "loss": 0.047, "step": 3758 }, { "epoch": 1.6695536309127248, "grad_norm": 0.6025767202418845, "learning_rate": 7.231957645193943e-06, "loss": 0.0375, "step": 3759 }, { "epoch": 1.6699977792582723, "grad_norm": 0.4473170515938267, "learning_rate": 7.2302228016207666e-06, "loss": 0.0483, "step": 3760 }, { "epoch": 1.6704419276038198, "grad_norm": 0.677052589914415, "learning_rate": 7.2284876228072195e-06, "loss": 0.0635, "step": 3761 }, { "epoch": 1.6708860759493671, "grad_norm": 0.5710811815695489, "learning_rate": 7.226752109014127e-06, "loss": 0.0464, "step": 3762 }, { "epoch": 1.6713302242949144, "grad_norm": 0.36528232704433305, "learning_rate": 7.225016260502366e-06, "loss": 0.0372, "step": 3763 }, { "epoch": 1.671774372640462, "grad_norm": 0.5011404570603847, "learning_rate": 7.223280077532866e-06, "loss": 0.0361, "step": 3764 }, { "epoch": 1.6722185209860094, "grad_norm": 0.9792675536360905, "learning_rate": 7.221543560366602e-06, "loss": 0.0695, "step": 3765 }, { "epoch": 1.6726626693315567, "grad_norm": 0.40690296889128674, "learning_rate": 7.219806709264605e-06, "loss": 0.0434, "step": 3766 }, { "epoch": 1.673106817677104, "grad_norm": 0.454187985377748, "learning_rate": 7.21806952448795e-06, "loss": 0.0353, "step": 3767 }, { "epoch": 1.6735509660226515, "grad_norm": 0.41286435056484855, "learning_rate": 7.216332006297769e-06, "loss": 0.0379, "step": 3768 }, { "epoch": 1.673995114368199, "grad_norm": 0.6022194735287347, "learning_rate": 7.2145941549552364e-06, "loss": 0.0491, "step": 3769 }, { "epoch": 1.6744392627137463, "grad_norm": 0.368180906342434, "learning_rate": 7.212855970721584e-06, "loss": 0.0302, "step": 3770 }, { "epoch": 1.6748834110592938, "grad_norm": 0.3923410824291727, "learning_rate": 7.211117453858088e-06, "loss": 0.0434, "step": 3771 }, { "epoch": 1.6753275594048413, "grad_norm": 0.5009695065668602, "learning_rate": 7.209378604626081e-06, "loss": 0.042, "step": 3772 }, { "epoch": 1.6757717077503886, "grad_norm": 0.41349339679074837, "learning_rate": 7.207639423286938e-06, "loss": 0.0363, "step": 3773 }, { "epoch": 1.676215856095936, "grad_norm": 0.363113317588028, "learning_rate": 7.205899910102087e-06, "loss": 0.0502, "step": 3774 }, { "epoch": 1.6766600044414834, "grad_norm": 0.5667120483543412, "learning_rate": 7.204160065333009e-06, "loss": 0.047, "step": 3775 }, { "epoch": 1.677104152787031, "grad_norm": 0.6591252875236339, "learning_rate": 7.202419889241231e-06, "loss": 0.0622, "step": 3776 }, { "epoch": 1.6775483011325782, "grad_norm": 0.4627599954413904, "learning_rate": 7.2006793820883315e-06, "loss": 0.0294, "step": 3777 }, { "epoch": 1.6779924494781255, "grad_norm": 0.35617793994157315, "learning_rate": 7.198938544135936e-06, "loss": 0.0302, "step": 3778 }, { "epoch": 1.6784365978236733, "grad_norm": 0.4097862631071463, "learning_rate": 7.197197375645724e-06, "loss": 0.0414, "step": 3779 }, { "epoch": 1.6788807461692206, "grad_norm": 0.675818069401739, "learning_rate": 7.195455876879425e-06, "loss": 0.0386, "step": 3780 }, { "epoch": 1.6793248945147679, "grad_norm": 0.5308299724023986, "learning_rate": 7.193714048098812e-06, "loss": 0.0425, "step": 3781 }, { "epoch": 1.6797690428603154, "grad_norm": 0.5374963949654195, "learning_rate": 7.191971889565713e-06, "loss": 0.0381, "step": 3782 }, { "epoch": 1.6802131912058629, "grad_norm": 0.5783394947551381, "learning_rate": 7.190229401542004e-06, "loss": 0.0568, "step": 3783 }, { "epoch": 1.6806573395514102, "grad_norm": 0.5082898930645872, "learning_rate": 7.18848658428961e-06, "loss": 0.0421, "step": 3784 }, { "epoch": 1.6811014878969575, "grad_norm": 0.4025027461987578, "learning_rate": 7.186743438070507e-06, "loss": 0.0311, "step": 3785 }, { "epoch": 1.681545636242505, "grad_norm": 0.3763513501474927, "learning_rate": 7.1849999631467194e-06, "loss": 0.0334, "step": 3786 }, { "epoch": 1.6819897845880525, "grad_norm": 0.4272449780002536, "learning_rate": 7.183256159780321e-06, "loss": 0.0461, "step": 3787 }, { "epoch": 1.6824339329335998, "grad_norm": 0.6764426293492224, "learning_rate": 7.181512028233433e-06, "loss": 0.0494, "step": 3788 }, { "epoch": 1.6828780812791473, "grad_norm": 0.4493107523320909, "learning_rate": 7.17976756876823e-06, "loss": 0.0407, "step": 3789 }, { "epoch": 1.6833222296246948, "grad_norm": 0.5311611910450604, "learning_rate": 7.178022781646936e-06, "loss": 0.0446, "step": 3790 }, { "epoch": 1.683766377970242, "grad_norm": 0.49447252037240147, "learning_rate": 7.176277667131817e-06, "loss": 0.0418, "step": 3791 }, { "epoch": 1.6842105263157894, "grad_norm": 0.5692208839762621, "learning_rate": 7.1745322254851966e-06, "loss": 0.0492, "step": 3792 }, { "epoch": 1.684654674661337, "grad_norm": 0.5169615968797148, "learning_rate": 7.172786456969445e-06, "loss": 0.0529, "step": 3793 }, { "epoch": 1.6850988230068844, "grad_norm": 0.393440341612259, "learning_rate": 7.171040361846979e-06, "loss": 0.0467, "step": 3794 }, { "epoch": 1.6855429713524317, "grad_norm": 0.4049011862637367, "learning_rate": 7.1692939403802676e-06, "loss": 0.0395, "step": 3795 }, { "epoch": 1.685987119697979, "grad_norm": 0.33756471256165815, "learning_rate": 7.167547192831827e-06, "loss": 0.0321, "step": 3796 }, { "epoch": 1.6864312680435265, "grad_norm": 0.4670323249605969, "learning_rate": 7.1658001194642225e-06, "loss": 0.0433, "step": 3797 }, { "epoch": 1.686875416389074, "grad_norm": 0.47348531343201694, "learning_rate": 7.16405272054007e-06, "loss": 0.0391, "step": 3798 }, { "epoch": 1.6873195647346213, "grad_norm": 0.5297792409513463, "learning_rate": 7.1623049963220325e-06, "loss": 0.0396, "step": 3799 }, { "epoch": 1.6877637130801688, "grad_norm": 0.6022176716427052, "learning_rate": 7.160556947072823e-06, "loss": 0.0457, "step": 3800 }, { "epoch": 1.6882078614257163, "grad_norm": 0.4342556651872498, "learning_rate": 7.158808573055205e-06, "loss": 0.0406, "step": 3801 }, { "epoch": 1.6886520097712636, "grad_norm": 0.571370547347438, "learning_rate": 7.157059874531982e-06, "loss": 0.039, "step": 3802 }, { "epoch": 1.689096158116811, "grad_norm": 0.6752816385308873, "learning_rate": 7.155310851766022e-06, "loss": 0.0518, "step": 3803 }, { "epoch": 1.6895403064623584, "grad_norm": 0.4681922042318178, "learning_rate": 7.153561505020228e-06, "loss": 0.0487, "step": 3804 }, { "epoch": 1.689984454807906, "grad_norm": 0.4722352367968629, "learning_rate": 7.151811834557556e-06, "loss": 0.0506, "step": 3805 }, { "epoch": 1.6904286031534532, "grad_norm": 0.46550029686119654, "learning_rate": 7.150061840641012e-06, "loss": 0.042, "step": 3806 }, { "epoch": 1.6908727514990005, "grad_norm": 0.46380378977431014, "learning_rate": 7.148311523533652e-06, "loss": 0.051, "step": 3807 }, { "epoch": 1.691316899844548, "grad_norm": 0.40338431598829894, "learning_rate": 7.146560883498575e-06, "loss": 0.0358, "step": 3808 }, { "epoch": 1.6917610481900955, "grad_norm": 0.5997068568205779, "learning_rate": 7.144809920798934e-06, "loss": 0.0549, "step": 3809 }, { "epoch": 1.6922051965356428, "grad_norm": 0.746314445865342, "learning_rate": 7.143058635697928e-06, "loss": 0.0572, "step": 3810 }, { "epoch": 1.6926493448811903, "grad_norm": 0.6126953828556693, "learning_rate": 7.141307028458805e-06, "loss": 0.0581, "step": 3811 }, { "epoch": 1.6930934932267379, "grad_norm": 0.45183518843270887, "learning_rate": 7.13955509934486e-06, "loss": 0.0527, "step": 3812 }, { "epoch": 1.6935376415722851, "grad_norm": 0.40741342750683424, "learning_rate": 7.137802848619442e-06, "loss": 0.0448, "step": 3813 }, { "epoch": 1.6939817899178324, "grad_norm": 0.409139232347241, "learning_rate": 7.136050276545937e-06, "loss": 0.0381, "step": 3814 }, { "epoch": 1.69442593826338, "grad_norm": 0.532572512466246, "learning_rate": 7.134297383387794e-06, "loss": 0.0541, "step": 3815 }, { "epoch": 1.6948700866089275, "grad_norm": 0.38463725645897007, "learning_rate": 7.1325441694084955e-06, "loss": 0.0413, "step": 3816 }, { "epoch": 1.6953142349544748, "grad_norm": 1.088465520533885, "learning_rate": 7.130790634871585e-06, "loss": 0.052, "step": 3817 }, { "epoch": 1.695758383300022, "grad_norm": 0.629929584219581, "learning_rate": 7.129036780040646e-06, "loss": 0.0567, "step": 3818 }, { "epoch": 1.6962025316455698, "grad_norm": 0.63053218226347, "learning_rate": 7.127282605179311e-06, "loss": 0.0532, "step": 3819 }, { "epoch": 1.696646679991117, "grad_norm": 0.4433559445013171, "learning_rate": 7.125528110551266e-06, "loss": 0.0463, "step": 3820 }, { "epoch": 1.6970908283366644, "grad_norm": 0.42229460674035696, "learning_rate": 7.12377329642024e-06, "loss": 0.0397, "step": 3821 }, { "epoch": 1.6975349766822119, "grad_norm": 0.49873647457140674, "learning_rate": 7.122018163050011e-06, "loss": 0.0545, "step": 3822 }, { "epoch": 1.6979791250277594, "grad_norm": 0.6060966583878594, "learning_rate": 7.1202627107044035e-06, "loss": 0.0457, "step": 3823 }, { "epoch": 1.6984232733733067, "grad_norm": 0.603057650967115, "learning_rate": 7.118506939647295e-06, "loss": 0.0464, "step": 3824 }, { "epoch": 1.698867421718854, "grad_norm": 0.6194476035587843, "learning_rate": 7.116750850142606e-06, "loss": 0.0457, "step": 3825 }, { "epoch": 1.6993115700644015, "grad_norm": 0.5096734124949287, "learning_rate": 7.114994442454306e-06, "loss": 0.045, "step": 3826 }, { "epoch": 1.699755718409949, "grad_norm": 0.5365035460325245, "learning_rate": 7.113237716846416e-06, "loss": 0.0623, "step": 3827 }, { "epoch": 1.7001998667554963, "grad_norm": 0.4418379177592074, "learning_rate": 7.111480673582998e-06, "loss": 0.0434, "step": 3828 }, { "epoch": 1.7006440151010438, "grad_norm": 0.5486324781269635, "learning_rate": 7.1097233129281674e-06, "loss": 0.0515, "step": 3829 }, { "epoch": 1.7010881634465913, "grad_norm": 0.46832429223654365, "learning_rate": 7.107965635146085e-06, "loss": 0.036, "step": 3830 }, { "epoch": 1.7015323117921386, "grad_norm": 0.4428576565727785, "learning_rate": 7.106207640500959e-06, "loss": 0.0431, "step": 3831 }, { "epoch": 1.7019764601376859, "grad_norm": 0.44260932357845195, "learning_rate": 7.104449329257047e-06, "loss": 0.0488, "step": 3832 }, { "epoch": 1.7024206084832334, "grad_norm": 0.6480909417028274, "learning_rate": 7.10269070167865e-06, "loss": 0.0538, "step": 3833 }, { "epoch": 1.702864756828781, "grad_norm": 0.5907643196584621, "learning_rate": 7.100931758030126e-06, "loss": 0.0667, "step": 3834 }, { "epoch": 1.7033089051743282, "grad_norm": 0.530721353895675, "learning_rate": 7.0991724985758694e-06, "loss": 0.0355, "step": 3835 }, { "epoch": 1.7037530535198755, "grad_norm": 0.536328694373341, "learning_rate": 7.0974129235803256e-06, "loss": 0.0492, "step": 3836 }, { "epoch": 1.704197201865423, "grad_norm": 0.47794780473543447, "learning_rate": 7.095653033307992e-06, "loss": 0.0394, "step": 3837 }, { "epoch": 1.7046413502109705, "grad_norm": 0.5788739767339988, "learning_rate": 7.093892828023408e-06, "loss": 0.0684, "step": 3838 }, { "epoch": 1.7050854985565178, "grad_norm": 0.4591656439375814, "learning_rate": 7.092132307991163e-06, "loss": 0.0426, "step": 3839 }, { "epoch": 1.7055296469020653, "grad_norm": 0.4376969955300339, "learning_rate": 7.090371473475894e-06, "loss": 0.0424, "step": 3840 }, { "epoch": 1.7059737952476128, "grad_norm": 0.43964430921337727, "learning_rate": 7.088610324742282e-06, "loss": 0.0348, "step": 3841 }, { "epoch": 1.7064179435931601, "grad_norm": 0.40903365943544095, "learning_rate": 7.086848862055059e-06, "loss": 0.0317, "step": 3842 }, { "epoch": 1.7068620919387074, "grad_norm": 0.5320572070600705, "learning_rate": 7.085087085679003e-06, "loss": 0.0493, "step": 3843 }, { "epoch": 1.707306240284255, "grad_norm": 0.4182055882812196, "learning_rate": 7.0833249958789396e-06, "loss": 0.0413, "step": 3844 }, { "epoch": 1.7077503886298024, "grad_norm": 0.5174573980801822, "learning_rate": 7.081562592919737e-06, "loss": 0.0468, "step": 3845 }, { "epoch": 1.7081945369753497, "grad_norm": 0.36708254507339966, "learning_rate": 7.07979987706632e-06, "loss": 0.0392, "step": 3846 }, { "epoch": 1.708638685320897, "grad_norm": 0.42988860867301804, "learning_rate": 7.078036848583651e-06, "loss": 0.0438, "step": 3847 }, { "epoch": 1.7090828336664448, "grad_norm": 0.8624108371914289, "learning_rate": 7.076273507736744e-06, "loss": 0.0564, "step": 3848 }, { "epoch": 1.709526982011992, "grad_norm": 0.3825294027975058, "learning_rate": 7.074509854790659e-06, "loss": 0.042, "step": 3849 }, { "epoch": 1.7099711303575393, "grad_norm": 0.7845280836992656, "learning_rate": 7.072745890010502e-06, "loss": 0.0513, "step": 3850 }, { "epoch": 1.7104152787030868, "grad_norm": 0.511895797137735, "learning_rate": 7.070981613661429e-06, "loss": 0.042, "step": 3851 }, { "epoch": 1.7108594270486344, "grad_norm": 0.39864960740423, "learning_rate": 7.06921702600864e-06, "loss": 0.0435, "step": 3852 }, { "epoch": 1.7113035753941817, "grad_norm": 0.3728345248612773, "learning_rate": 7.067452127317381e-06, "loss": 0.0414, "step": 3853 }, { "epoch": 1.711747723739729, "grad_norm": 0.4146176183523404, "learning_rate": 7.065686917852948e-06, "loss": 0.0394, "step": 3854 }, { "epoch": 1.7121918720852765, "grad_norm": 0.4550263965386846, "learning_rate": 7.063921397880682e-06, "loss": 0.0353, "step": 3855 }, { "epoch": 1.712636020430824, "grad_norm": 0.6273234491307014, "learning_rate": 7.062155567665969e-06, "loss": 0.0336, "step": 3856 }, { "epoch": 1.7130801687763713, "grad_norm": 0.5739885387932523, "learning_rate": 7.0603894274742445e-06, "loss": 0.0463, "step": 3857 }, { "epoch": 1.7135243171219188, "grad_norm": 0.46562620789463843, "learning_rate": 7.05862297757099e-06, "loss": 0.0453, "step": 3858 }, { "epoch": 1.7139684654674663, "grad_norm": 0.5078681314633765, "learning_rate": 7.056856218221731e-06, "loss": 0.0472, "step": 3859 }, { "epoch": 1.7144126138130136, "grad_norm": 0.44379024742354006, "learning_rate": 7.055089149692044e-06, "loss": 0.0362, "step": 3860 }, { "epoch": 1.7148567621585609, "grad_norm": 0.6765029010883838, "learning_rate": 7.053321772247546e-06, "loss": 0.0605, "step": 3861 }, { "epoch": 1.7153009105041084, "grad_norm": 0.5667840551717872, "learning_rate": 7.051554086153907e-06, "loss": 0.052, "step": 3862 }, { "epoch": 1.715745058849656, "grad_norm": 0.4505940404872499, "learning_rate": 7.049786091676838e-06, "loss": 0.0437, "step": 3863 }, { "epoch": 1.7161892071952032, "grad_norm": 0.3740985912782501, "learning_rate": 7.0480177890821e-06, "loss": 0.034, "step": 3864 }, { "epoch": 1.7166333555407505, "grad_norm": 0.5770251755986612, "learning_rate": 7.046249178635499e-06, "loss": 0.0502, "step": 3865 }, { "epoch": 1.717077503886298, "grad_norm": 0.5237695494745064, "learning_rate": 7.044480260602888e-06, "loss": 0.0565, "step": 3866 }, { "epoch": 1.7175216522318455, "grad_norm": 0.49695229712892924, "learning_rate": 7.042711035250162e-06, "loss": 0.0526, "step": 3867 }, { "epoch": 1.7179658005773928, "grad_norm": 0.7710570644630359, "learning_rate": 7.0409415028432685e-06, "loss": 0.0353, "step": 3868 }, { "epoch": 1.7184099489229403, "grad_norm": 0.5569345846072523, "learning_rate": 7.0391716636481976e-06, "loss": 0.0463, "step": 3869 }, { "epoch": 1.7188540972684878, "grad_norm": 0.46227610895953286, "learning_rate": 7.037401517930986e-06, "loss": 0.0518, "step": 3870 }, { "epoch": 1.719298245614035, "grad_norm": 0.3855965081893482, "learning_rate": 7.035631065957718e-06, "loss": 0.0345, "step": 3871 }, { "epoch": 1.7197423939595824, "grad_norm": 0.4363418780122806, "learning_rate": 7.03386030799452e-06, "loss": 0.0492, "step": 3872 }, { "epoch": 1.72018654230513, "grad_norm": 0.4427385688478732, "learning_rate": 7.03208924430757e-06, "loss": 0.0463, "step": 3873 }, { "epoch": 1.7206306906506774, "grad_norm": 0.5897859219840801, "learning_rate": 7.030317875163086e-06, "loss": 0.0472, "step": 3874 }, { "epoch": 1.7210748389962247, "grad_norm": 0.44136482686712686, "learning_rate": 7.0285462008273365e-06, "loss": 0.0339, "step": 3875 }, { "epoch": 1.721518987341772, "grad_norm": 0.4341786806553805, "learning_rate": 7.026774221566634e-06, "loss": 0.0479, "step": 3876 }, { "epoch": 1.7219631356873195, "grad_norm": 0.4859029892336248, "learning_rate": 7.0250019376473375e-06, "loss": 0.0481, "step": 3877 }, { "epoch": 1.722407284032867, "grad_norm": 0.554393064292429, "learning_rate": 7.0232293493358515e-06, "loss": 0.0423, "step": 3878 }, { "epoch": 1.7228514323784143, "grad_norm": 0.4954355417965089, "learning_rate": 7.021456456898624e-06, "loss": 0.0529, "step": 3879 }, { "epoch": 1.7232955807239618, "grad_norm": 0.39348052805418005, "learning_rate": 7.019683260602155e-06, "loss": 0.04, "step": 3880 }, { "epoch": 1.7237397290695093, "grad_norm": 0.4876831116193254, "learning_rate": 7.017909760712982e-06, "loss": 0.0416, "step": 3881 }, { "epoch": 1.7241838774150566, "grad_norm": 0.5390273774730024, "learning_rate": 7.016135957497693e-06, "loss": 0.0685, "step": 3882 }, { "epoch": 1.724628025760604, "grad_norm": 0.553831689747292, "learning_rate": 7.014361851222923e-06, "loss": 0.0575, "step": 3883 }, { "epoch": 1.7250721741061514, "grad_norm": 0.5211453167042258, "learning_rate": 7.012587442155349e-06, "loss": 0.0508, "step": 3884 }, { "epoch": 1.725516322451699, "grad_norm": 0.5052516566700862, "learning_rate": 7.010812730561691e-06, "loss": 0.0411, "step": 3885 }, { "epoch": 1.7259604707972462, "grad_norm": 0.5073428394585604, "learning_rate": 7.009037716708725e-06, "loss": 0.0426, "step": 3886 }, { "epoch": 1.7264046191427935, "grad_norm": 0.48746152511684293, "learning_rate": 7.007262400863262e-06, "loss": 0.043, "step": 3887 }, { "epoch": 1.7268487674883413, "grad_norm": 0.48800631902973207, "learning_rate": 7.005486783292164e-06, "loss": 0.0416, "step": 3888 }, { "epoch": 1.7272929158338886, "grad_norm": 0.5824704615616569, "learning_rate": 7.003710864262333e-06, "loss": 0.0485, "step": 3889 }, { "epoch": 1.7277370641794358, "grad_norm": 0.4562507795280803, "learning_rate": 7.0019346440407225e-06, "loss": 0.0392, "step": 3890 }, { "epoch": 1.7281812125249834, "grad_norm": 0.5660300341993364, "learning_rate": 7.000158122894329e-06, "loss": 0.0443, "step": 3891 }, { "epoch": 1.7286253608705309, "grad_norm": 1.2237037452955262, "learning_rate": 6.9983813010901925e-06, "loss": 0.0446, "step": 3892 }, { "epoch": 1.7290695092160782, "grad_norm": 0.3834611348340635, "learning_rate": 6.996604178895398e-06, "loss": 0.0321, "step": 3893 }, { "epoch": 1.7295136575616255, "grad_norm": 0.500508115884937, "learning_rate": 6.994826756577082e-06, "loss": 0.0547, "step": 3894 }, { "epoch": 1.729957805907173, "grad_norm": 0.41016179375129264, "learning_rate": 6.993049034402417e-06, "loss": 0.0299, "step": 3895 }, { "epoch": 1.7304019542527205, "grad_norm": 0.5423253130879615, "learning_rate": 6.991271012638626e-06, "loss": 0.0616, "step": 3896 }, { "epoch": 1.7308461025982678, "grad_norm": 0.5282418101161326, "learning_rate": 6.9894926915529774e-06, "loss": 0.0457, "step": 3897 }, { "epoch": 1.7312902509438153, "grad_norm": 0.4989861140225304, "learning_rate": 6.987714071412781e-06, "loss": 0.0408, "step": 3898 }, { "epoch": 1.7317343992893628, "grad_norm": 0.48460144374543296, "learning_rate": 6.985935152485392e-06, "loss": 0.0508, "step": 3899 }, { "epoch": 1.73217854763491, "grad_norm": 0.37082720341713776, "learning_rate": 6.984155935038217e-06, "loss": 0.0376, "step": 3900 }, { "epoch": 1.7326226959804574, "grad_norm": 0.6794555649441637, "learning_rate": 6.9823764193387e-06, "loss": 0.0568, "step": 3901 }, { "epoch": 1.7330668443260049, "grad_norm": 0.5551808325017253, "learning_rate": 6.980596605654332e-06, "loss": 0.0502, "step": 3902 }, { "epoch": 1.7335109926715524, "grad_norm": 0.5306956016396489, "learning_rate": 6.9788164942526495e-06, "loss": 0.0432, "step": 3903 }, { "epoch": 1.7339551410170997, "grad_norm": 0.6362458412949452, "learning_rate": 6.977036085401234e-06, "loss": 0.0486, "step": 3904 }, { "epoch": 1.734399289362647, "grad_norm": 0.29485239085618, "learning_rate": 6.9752553793677105e-06, "loss": 0.0245, "step": 3905 }, { "epoch": 1.7348434377081945, "grad_norm": 0.5487150575741024, "learning_rate": 6.9734743764197485e-06, "loss": 0.0463, "step": 3906 }, { "epoch": 1.735287586053742, "grad_norm": 0.7871819167162935, "learning_rate": 6.9716930768250655e-06, "loss": 0.0524, "step": 3907 }, { "epoch": 1.7357317343992893, "grad_norm": 0.5219429848305476, "learning_rate": 6.9699114808514215e-06, "loss": 0.0549, "step": 3908 }, { "epoch": 1.7361758827448368, "grad_norm": 0.4635849498371789, "learning_rate": 6.968129588766617e-06, "loss": 0.0604, "step": 3909 }, { "epoch": 1.7366200310903843, "grad_norm": 0.43616273854041554, "learning_rate": 6.966347400838502e-06, "loss": 0.0373, "step": 3910 }, { "epoch": 1.7370641794359316, "grad_norm": 0.593451901083156, "learning_rate": 6.964564917334973e-06, "loss": 0.0527, "step": 3911 }, { "epoch": 1.737508327781479, "grad_norm": 0.6448036909846457, "learning_rate": 6.962782138523963e-06, "loss": 0.048, "step": 3912 }, { "epoch": 1.7379524761270264, "grad_norm": 0.6507383892986597, "learning_rate": 6.960999064673455e-06, "loss": 0.0405, "step": 3913 }, { "epoch": 1.738396624472574, "grad_norm": 0.3700619002880566, "learning_rate": 6.959215696051478e-06, "loss": 0.0375, "step": 3914 }, { "epoch": 1.7388407728181212, "grad_norm": 0.5636095366670266, "learning_rate": 6.957432032926099e-06, "loss": 0.0473, "step": 3915 }, { "epoch": 1.7392849211636685, "grad_norm": 0.31284491700710365, "learning_rate": 6.955648075565435e-06, "loss": 0.0262, "step": 3916 }, { "epoch": 1.7397290695092162, "grad_norm": 0.88892656741503, "learning_rate": 6.953863824237644e-06, "loss": 0.0439, "step": 3917 }, { "epoch": 1.7401732178547635, "grad_norm": 0.3940650039553802, "learning_rate": 6.952079279210931e-06, "loss": 0.0336, "step": 3918 }, { "epoch": 1.7406173662003108, "grad_norm": 0.43415099426990134, "learning_rate": 6.950294440753542e-06, "loss": 0.0372, "step": 3919 }, { "epoch": 1.7410615145458583, "grad_norm": 0.43498781141215753, "learning_rate": 6.948509309133769e-06, "loss": 0.0471, "step": 3920 }, { "epoch": 1.7415056628914058, "grad_norm": 0.3673155534955715, "learning_rate": 6.9467238846199465e-06, "loss": 0.0329, "step": 3921 }, { "epoch": 1.7419498112369531, "grad_norm": 0.3522859182784973, "learning_rate": 6.944938167480456e-06, "loss": 0.0337, "step": 3922 }, { "epoch": 1.7423939595825004, "grad_norm": 0.40635886553673484, "learning_rate": 6.943152157983719e-06, "loss": 0.0429, "step": 3923 }, { "epoch": 1.742838107928048, "grad_norm": 0.8451883765559832, "learning_rate": 6.941365856398205e-06, "loss": 0.0668, "step": 3924 }, { "epoch": 1.7432822562735955, "grad_norm": 0.5600822140198348, "learning_rate": 6.939579262992426e-06, "loss": 0.0451, "step": 3925 }, { "epoch": 1.7437264046191427, "grad_norm": 1.2545344075254676, "learning_rate": 6.937792378034936e-06, "loss": 0.0743, "step": 3926 }, { "epoch": 1.7441705529646903, "grad_norm": 1.1071420667016871, "learning_rate": 6.936005201794331e-06, "loss": 0.073, "step": 3927 }, { "epoch": 1.7446147013102378, "grad_norm": 0.4227881516723295, "learning_rate": 6.93421773453926e-06, "loss": 0.043, "step": 3928 }, { "epoch": 1.745058849655785, "grad_norm": 0.46137549848480547, "learning_rate": 6.932429976538407e-06, "loss": 0.0413, "step": 3929 }, { "epoch": 1.7455029980013324, "grad_norm": 0.5679012650124624, "learning_rate": 6.930641928060501e-06, "loss": 0.0331, "step": 3930 }, { "epoch": 1.7459471463468799, "grad_norm": 0.5321986376581693, "learning_rate": 6.928853589374318e-06, "loss": 0.0467, "step": 3931 }, { "epoch": 1.7463912946924274, "grad_norm": 0.6219676953962676, "learning_rate": 6.927064960748675e-06, "loss": 0.0479, "step": 3932 }, { "epoch": 1.7468354430379747, "grad_norm": 0.5001191355293207, "learning_rate": 6.925276042452433e-06, "loss": 0.0535, "step": 3933 }, { "epoch": 1.747279591383522, "grad_norm": 0.4563948076269455, "learning_rate": 6.923486834754498e-06, "loss": 0.0427, "step": 3934 }, { "epoch": 1.7477237397290695, "grad_norm": 0.6974643048935755, "learning_rate": 6.9216973379238175e-06, "loss": 0.0485, "step": 3935 }, { "epoch": 1.748167888074617, "grad_norm": 0.9428504067985796, "learning_rate": 6.9199075522293815e-06, "loss": 0.0612, "step": 3936 }, { "epoch": 1.7486120364201643, "grad_norm": 0.8097896480889225, "learning_rate": 6.918117477940227e-06, "loss": 0.0467, "step": 3937 }, { "epoch": 1.7490561847657118, "grad_norm": 0.5120773769465646, "learning_rate": 6.916327115325434e-06, "loss": 0.0554, "step": 3938 }, { "epoch": 1.7495003331112593, "grad_norm": 0.49704552104919647, "learning_rate": 6.914536464654123e-06, "loss": 0.0429, "step": 3939 }, { "epoch": 1.7499444814568066, "grad_norm": 0.4966530008001101, "learning_rate": 6.912745526195457e-06, "loss": 0.0416, "step": 3940 }, { "epoch": 1.7503886298023539, "grad_norm": 0.6069162585490371, "learning_rate": 6.910954300218648e-06, "loss": 0.0407, "step": 3941 }, { "epoch": 1.7508327781479014, "grad_norm": 0.5065891450556588, "learning_rate": 6.9091627869929456e-06, "loss": 0.0522, "step": 3942 }, { "epoch": 1.751276926493449, "grad_norm": 0.5821716098410316, "learning_rate": 6.907370986787647e-06, "loss": 0.0588, "step": 3943 }, { "epoch": 1.7517210748389962, "grad_norm": 0.4854527206747191, "learning_rate": 6.905578899872085e-06, "loss": 0.0347, "step": 3944 }, { "epoch": 1.7521652231845435, "grad_norm": 0.6099586438595753, "learning_rate": 6.903786526515648e-06, "loss": 0.057, "step": 3945 }, { "epoch": 1.752609371530091, "grad_norm": 0.39621484939213947, "learning_rate": 6.901993866987755e-06, "loss": 0.036, "step": 3946 }, { "epoch": 1.7530535198756385, "grad_norm": 0.5825955749635414, "learning_rate": 6.9002009215578736e-06, "loss": 0.0371, "step": 3947 }, { "epoch": 1.7534976682211858, "grad_norm": 0.5518034617506287, "learning_rate": 6.898407690495516e-06, "loss": 0.0535, "step": 3948 }, { "epoch": 1.7539418165667333, "grad_norm": 0.7169225712774631, "learning_rate": 6.896614174070234e-06, "loss": 0.065, "step": 3949 }, { "epoch": 1.7543859649122808, "grad_norm": 0.487246075580528, "learning_rate": 6.894820372551624e-06, "loss": 0.0425, "step": 3950 }, { "epoch": 1.7548301132578281, "grad_norm": 0.5762490980340829, "learning_rate": 6.893026286209324e-06, "loss": 0.0579, "step": 3951 }, { "epoch": 1.7552742616033754, "grad_norm": 0.40568305353842643, "learning_rate": 6.891231915313017e-06, "loss": 0.0447, "step": 3952 }, { "epoch": 1.755718409948923, "grad_norm": 1.1277438855550295, "learning_rate": 6.889437260132426e-06, "loss": 0.0646, "step": 3953 }, { "epoch": 1.7561625582944704, "grad_norm": 1.0348468689646162, "learning_rate": 6.887642320937319e-06, "loss": 0.0477, "step": 3954 }, { "epoch": 1.7566067066400177, "grad_norm": 0.4516096937991291, "learning_rate": 6.885847097997507e-06, "loss": 0.0441, "step": 3955 }, { "epoch": 1.757050854985565, "grad_norm": 0.5215834296906894, "learning_rate": 6.884051591582838e-06, "loss": 0.0413, "step": 3956 }, { "epoch": 1.7574950033311127, "grad_norm": 0.4942925713506997, "learning_rate": 6.882255801963215e-06, "loss": 0.0313, "step": 3957 }, { "epoch": 1.75793915167666, "grad_norm": 0.5511920080731652, "learning_rate": 6.8804597294085676e-06, "loss": 0.0468, "step": 3958 }, { "epoch": 1.7583833000222073, "grad_norm": 0.8005773203813454, "learning_rate": 6.87866337418888e-06, "loss": 0.0723, "step": 3959 }, { "epoch": 1.7588274483677548, "grad_norm": 0.6193224773136193, "learning_rate": 6.876866736574175e-06, "loss": 0.0458, "step": 3960 }, { "epoch": 1.7592715967133024, "grad_norm": 0.7463153621358136, "learning_rate": 6.875069816834517e-06, "loss": 0.0452, "step": 3961 }, { "epoch": 1.7597157450588496, "grad_norm": 0.6653689016758167, "learning_rate": 6.873272615240013e-06, "loss": 0.0379, "step": 3962 }, { "epoch": 1.760159893404397, "grad_norm": 0.46506318034990934, "learning_rate": 6.871475132060814e-06, "loss": 0.0405, "step": 3963 }, { "epoch": 1.7606040417499444, "grad_norm": 0.4803172632231074, "learning_rate": 6.8696773675671125e-06, "loss": 0.0494, "step": 3964 }, { "epoch": 1.761048190095492, "grad_norm": 0.5683228177486831, "learning_rate": 6.8678793220291406e-06, "loss": 0.0474, "step": 3965 }, { "epoch": 1.7614923384410393, "grad_norm": 0.6405917034428388, "learning_rate": 6.866080995717179e-06, "loss": 0.0444, "step": 3966 }, { "epoch": 1.7619364867865868, "grad_norm": 0.469723345316827, "learning_rate": 6.864282388901544e-06, "loss": 0.0456, "step": 3967 }, { "epoch": 1.7623806351321343, "grad_norm": 0.3596729988674669, "learning_rate": 6.862483501852597e-06, "loss": 0.0273, "step": 3968 }, { "epoch": 1.7628247834776816, "grad_norm": 0.4095733039990315, "learning_rate": 6.8606843348407416e-06, "loss": 0.0423, "step": 3969 }, { "epoch": 1.7632689318232289, "grad_norm": 0.5817103130183384, "learning_rate": 6.858884888136423e-06, "loss": 0.061, "step": 3970 }, { "epoch": 1.7637130801687764, "grad_norm": 0.5515400819468078, "learning_rate": 6.85708516201013e-06, "loss": 0.0493, "step": 3971 }, { "epoch": 1.7641572285143239, "grad_norm": 0.4010509115861224, "learning_rate": 6.855285156732389e-06, "loss": 0.0384, "step": 3972 }, { "epoch": 1.7646013768598712, "grad_norm": 0.48651107038259084, "learning_rate": 6.853484872573773e-06, "loss": 0.0417, "step": 3973 }, { "epoch": 1.7650455252054185, "grad_norm": 0.6804998963023164, "learning_rate": 6.851684309804898e-06, "loss": 0.0511, "step": 3974 }, { "epoch": 1.765489673550966, "grad_norm": 0.5378576658955843, "learning_rate": 6.849883468696414e-06, "loss": 0.0466, "step": 3975 }, { "epoch": 1.7659338218965135, "grad_norm": 0.6080913375351253, "learning_rate": 6.848082349519021e-06, "loss": 0.0504, "step": 3976 }, { "epoch": 1.7663779702420608, "grad_norm": 0.4317853459587004, "learning_rate": 6.846280952543459e-06, "loss": 0.0419, "step": 3977 }, { "epoch": 1.7668221185876083, "grad_norm": 0.44881873306480574, "learning_rate": 6.844479278040506e-06, "loss": 0.0373, "step": 3978 }, { "epoch": 1.7672662669331558, "grad_norm": 0.38885972043532946, "learning_rate": 6.842677326280984e-06, "loss": 0.0396, "step": 3979 }, { "epoch": 1.767710415278703, "grad_norm": 0.4524735899424509, "learning_rate": 6.840875097535761e-06, "loss": 0.0448, "step": 3980 }, { "epoch": 1.7681545636242504, "grad_norm": 0.455210687429983, "learning_rate": 6.8390725920757374e-06, "loss": 0.0382, "step": 3981 }, { "epoch": 1.768598711969798, "grad_norm": 0.398331712609469, "learning_rate": 6.837269810171864e-06, "loss": 0.0414, "step": 3982 }, { "epoch": 1.7690428603153454, "grad_norm": 0.37522143210177494, "learning_rate": 6.835466752095129e-06, "loss": 0.0386, "step": 3983 }, { "epoch": 1.7694870086608927, "grad_norm": 0.6174104941670406, "learning_rate": 6.833663418116561e-06, "loss": 0.0389, "step": 3984 }, { "epoch": 1.76993115700644, "grad_norm": 0.6659454494298849, "learning_rate": 6.831859808507233e-06, "loss": 0.0447, "step": 3985 }, { "epoch": 1.7703753053519877, "grad_norm": 0.47498410446306544, "learning_rate": 6.830055923538258e-06, "loss": 0.0417, "step": 3986 }, { "epoch": 1.770819453697535, "grad_norm": 0.4396131477174768, "learning_rate": 6.82825176348079e-06, "loss": 0.0372, "step": 3987 }, { "epoch": 1.7712636020430823, "grad_norm": 0.576388987980822, "learning_rate": 6.826447328606026e-06, "loss": 0.0432, "step": 3988 }, { "epoch": 1.7717077503886298, "grad_norm": 0.5115559440177404, "learning_rate": 6.8246426191852025e-06, "loss": 0.046, "step": 3989 }, { "epoch": 1.7721518987341773, "grad_norm": 0.3933392068642798, "learning_rate": 6.822837635489597e-06, "loss": 0.0424, "step": 3990 }, { "epoch": 1.7725960470797246, "grad_norm": 0.48762159715898146, "learning_rate": 6.821032377790533e-06, "loss": 0.0428, "step": 3991 }, { "epoch": 1.773040195425272, "grad_norm": 0.5275360991096351, "learning_rate": 6.819226846359366e-06, "loss": 0.0437, "step": 3992 }, { "epoch": 1.7734843437708194, "grad_norm": 0.5998959208861094, "learning_rate": 6.817421041467501e-06, "loss": 0.0493, "step": 3993 }, { "epoch": 1.773928492116367, "grad_norm": 0.558883606638067, "learning_rate": 6.815614963386383e-06, "loss": 0.0443, "step": 3994 }, { "epoch": 1.7743726404619142, "grad_norm": 0.530805377008147, "learning_rate": 6.813808612387493e-06, "loss": 0.0493, "step": 3995 }, { "epoch": 1.7748167888074617, "grad_norm": 0.44577841042032773, "learning_rate": 6.812001988742356e-06, "loss": 0.0423, "step": 3996 }, { "epoch": 1.7752609371530093, "grad_norm": 0.4208112565964139, "learning_rate": 6.81019509272254e-06, "loss": 0.0383, "step": 3997 }, { "epoch": 1.7757050854985565, "grad_norm": 0.5652594178574358, "learning_rate": 6.808387924599653e-06, "loss": 0.0469, "step": 3998 }, { "epoch": 1.7761492338441038, "grad_norm": 0.47567339447317186, "learning_rate": 6.806580484645342e-06, "loss": 0.0536, "step": 3999 }, { "epoch": 1.7765933821896513, "grad_norm": 0.5061651547732329, "learning_rate": 6.804772773131294e-06, "loss": 0.0477, "step": 4000 }, { "epoch": 1.7770375305351989, "grad_norm": 0.3302628327205031, "learning_rate": 6.80296479032924e-06, "loss": 0.0287, "step": 4001 }, { "epoch": 1.7774816788807462, "grad_norm": 0.5657117208796698, "learning_rate": 6.801156536510953e-06, "loss": 0.063, "step": 4002 }, { "epoch": 1.7779258272262934, "grad_norm": 0.40290409532344185, "learning_rate": 6.799348011948242e-06, "loss": 0.0328, "step": 4003 }, { "epoch": 1.778369975571841, "grad_norm": 0.4303302869190356, "learning_rate": 6.797539216912958e-06, "loss": 0.0389, "step": 4004 }, { "epoch": 1.7788141239173885, "grad_norm": 0.4915190826163001, "learning_rate": 6.795730151676996e-06, "loss": 0.0506, "step": 4005 }, { "epoch": 1.7792582722629358, "grad_norm": 0.4472370244553756, "learning_rate": 6.793920816512287e-06, "loss": 0.0479, "step": 4006 }, { "epoch": 1.7797024206084833, "grad_norm": 0.46823865267198, "learning_rate": 6.792111211690807e-06, "loss": 0.04, "step": 4007 }, { "epoch": 1.7801465689540308, "grad_norm": 0.713302627768962, "learning_rate": 6.790301337484569e-06, "loss": 0.0467, "step": 4008 }, { "epoch": 1.780590717299578, "grad_norm": 0.544988693204241, "learning_rate": 6.788491194165629e-06, "loss": 0.0597, "step": 4009 }, { "epoch": 1.7810348656451254, "grad_norm": 0.5488358962167847, "learning_rate": 6.786680782006079e-06, "loss": 0.0454, "step": 4010 }, { "epoch": 1.7814790139906729, "grad_norm": 0.45588667933120275, "learning_rate": 6.784870101278058e-06, "loss": 0.0458, "step": 4011 }, { "epoch": 1.7819231623362204, "grad_norm": 0.41134367023706303, "learning_rate": 6.783059152253743e-06, "loss": 0.0423, "step": 4012 }, { "epoch": 1.7823673106817677, "grad_norm": 0.5379695606986187, "learning_rate": 6.7812479352053465e-06, "loss": 0.0434, "step": 4013 }, { "epoch": 1.782811459027315, "grad_norm": 0.390627468228112, "learning_rate": 6.779436450405127e-06, "loss": 0.0459, "step": 4014 }, { "epoch": 1.7832556073728625, "grad_norm": 0.7523391862616232, "learning_rate": 6.7776246981253835e-06, "loss": 0.0578, "step": 4015 }, { "epoch": 1.78369975571841, "grad_norm": 0.4670417748350829, "learning_rate": 6.775812678638449e-06, "loss": 0.0391, "step": 4016 }, { "epoch": 1.7841439040639573, "grad_norm": 0.6629585111671369, "learning_rate": 6.7740003922167045e-06, "loss": 0.0525, "step": 4017 }, { "epoch": 1.7845880524095048, "grad_norm": 0.5468251468649823, "learning_rate": 6.7721878391325655e-06, "loss": 0.048, "step": 4018 }, { "epoch": 1.7850322007550523, "grad_norm": 0.6407194911078166, "learning_rate": 6.770375019658491e-06, "loss": 0.0342, "step": 4019 }, { "epoch": 1.7854763491005996, "grad_norm": 0.3555576600475007, "learning_rate": 6.7685619340669775e-06, "loss": 0.0424, "step": 4020 }, { "epoch": 1.785920497446147, "grad_norm": 0.5655823535701915, "learning_rate": 6.766748582630561e-06, "loss": 0.0498, "step": 4021 }, { "epoch": 1.7863646457916944, "grad_norm": 0.5446372824826297, "learning_rate": 6.764934965621823e-06, "loss": 0.0326, "step": 4022 }, { "epoch": 1.786808794137242, "grad_norm": 0.7635864213363933, "learning_rate": 6.763121083313378e-06, "loss": 0.0469, "step": 4023 }, { "epoch": 1.7872529424827892, "grad_norm": 0.5831230436540045, "learning_rate": 6.761306935977883e-06, "loss": 0.0437, "step": 4024 }, { "epoch": 1.7876970908283365, "grad_norm": 0.4755466748257561, "learning_rate": 6.759492523888036e-06, "loss": 0.0349, "step": 4025 }, { "epoch": 1.7881412391738842, "grad_norm": 0.3810710893901648, "learning_rate": 6.757677847316576e-06, "loss": 0.0304, "step": 4026 }, { "epoch": 1.7885853875194315, "grad_norm": 0.5635909990676548, "learning_rate": 6.755862906536276e-06, "loss": 0.0509, "step": 4027 }, { "epoch": 1.7890295358649788, "grad_norm": 0.44071811791526233, "learning_rate": 6.754047701819954e-06, "loss": 0.0409, "step": 4028 }, { "epoch": 1.7894736842105263, "grad_norm": 0.6047548580207962, "learning_rate": 6.752232233440469e-06, "loss": 0.0472, "step": 4029 }, { "epoch": 1.7899178325560738, "grad_norm": 0.4265056649643106, "learning_rate": 6.750416501670712e-06, "loss": 0.0341, "step": 4030 }, { "epoch": 1.7903619809016211, "grad_norm": 0.4750323545017985, "learning_rate": 6.74860050678362e-06, "loss": 0.0399, "step": 4031 }, { "epoch": 1.7908061292471684, "grad_norm": 0.43275311316068676, "learning_rate": 6.74678424905217e-06, "loss": 0.0381, "step": 4032 }, { "epoch": 1.791250277592716, "grad_norm": 0.4591066133415168, "learning_rate": 6.744967728749374e-06, "loss": 0.049, "step": 4033 }, { "epoch": 1.7916944259382634, "grad_norm": 0.46035832875677174, "learning_rate": 6.743150946148286e-06, "loss": 0.037, "step": 4034 }, { "epoch": 1.7921385742838107, "grad_norm": 0.44089681593883423, "learning_rate": 6.7413339015219995e-06, "loss": 0.0489, "step": 4035 }, { "epoch": 1.7925827226293582, "grad_norm": 0.3967713328470784, "learning_rate": 6.739516595143649e-06, "loss": 0.0348, "step": 4036 }, { "epoch": 1.7930268709749058, "grad_norm": 0.47307408803909806, "learning_rate": 6.737699027286404e-06, "loss": 0.0488, "step": 4037 }, { "epoch": 1.793471019320453, "grad_norm": 0.6980297064306341, "learning_rate": 6.735881198223476e-06, "loss": 0.0603, "step": 4038 }, { "epoch": 1.7939151676660003, "grad_norm": 0.641654879554228, "learning_rate": 6.734063108228118e-06, "loss": 0.046, "step": 4039 }, { "epoch": 1.7943593160115479, "grad_norm": 0.43889131829448813, "learning_rate": 6.732244757573619e-06, "loss": 0.0417, "step": 4040 }, { "epoch": 1.7948034643570954, "grad_norm": 0.5791145146187624, "learning_rate": 6.730426146533304e-06, "loss": 0.0629, "step": 4041 }, { "epoch": 1.7952476127026427, "grad_norm": 0.3191065387044477, "learning_rate": 6.728607275380548e-06, "loss": 0.034, "step": 4042 }, { "epoch": 1.79569176104819, "grad_norm": 0.5785674908181686, "learning_rate": 6.726788144388754e-06, "loss": 0.0495, "step": 4043 }, { "epoch": 1.7961359093937375, "grad_norm": 0.5420845224176172, "learning_rate": 6.724968753831367e-06, "loss": 0.0568, "step": 4044 }, { "epoch": 1.796580057739285, "grad_norm": 0.4088444696114397, "learning_rate": 6.723149103981874e-06, "loss": 0.045, "step": 4045 }, { "epoch": 1.7970242060848323, "grad_norm": 0.6392237685246868, "learning_rate": 6.721329195113802e-06, "loss": 0.0554, "step": 4046 }, { "epoch": 1.7974683544303798, "grad_norm": 0.3977912647423139, "learning_rate": 6.7195090275007104e-06, "loss": 0.0361, "step": 4047 }, { "epoch": 1.7979125027759273, "grad_norm": 0.36038011427697747, "learning_rate": 6.717688601416201e-06, "loss": 0.0406, "step": 4048 }, { "epoch": 1.7983566511214746, "grad_norm": 0.5212415789499063, "learning_rate": 6.715867917133919e-06, "loss": 0.0422, "step": 4049 }, { "epoch": 1.7988007994670219, "grad_norm": 0.49377763847355416, "learning_rate": 6.714046974927539e-06, "loss": 0.043, "step": 4050 }, { "epoch": 1.7992449478125694, "grad_norm": 0.47840704302357145, "learning_rate": 6.712225775070784e-06, "loss": 0.044, "step": 4051 }, { "epoch": 1.799689096158117, "grad_norm": 0.4112067824225565, "learning_rate": 6.71040431783741e-06, "loss": 0.0354, "step": 4052 }, { "epoch": 1.8001332445036642, "grad_norm": 0.3668140961966663, "learning_rate": 6.70858260350121e-06, "loss": 0.0369, "step": 4053 }, { "epoch": 1.8005773928492115, "grad_norm": 0.6061145451464093, "learning_rate": 6.706760632336023e-06, "loss": 0.0529, "step": 4054 }, { "epoch": 1.8010215411947592, "grad_norm": 0.804408929113384, "learning_rate": 6.704938404615718e-06, "loss": 0.0695, "step": 4055 }, { "epoch": 1.8014656895403065, "grad_norm": 0.44516753707686785, "learning_rate": 6.703115920614212e-06, "loss": 0.041, "step": 4056 }, { "epoch": 1.8019098378858538, "grad_norm": 0.6944007446846528, "learning_rate": 6.701293180605451e-06, "loss": 0.063, "step": 4057 }, { "epoch": 1.8023539862314013, "grad_norm": 0.6199411354460107, "learning_rate": 6.699470184863423e-06, "loss": 0.0478, "step": 4058 }, { "epoch": 1.8027981345769488, "grad_norm": 0.6947222002075307, "learning_rate": 6.6976469336621595e-06, "loss": 0.0425, "step": 4059 }, { "epoch": 1.803242282922496, "grad_norm": 0.5292009929059116, "learning_rate": 6.6958234272757235e-06, "loss": 0.0575, "step": 4060 }, { "epoch": 1.8036864312680434, "grad_norm": 0.438201912001024, "learning_rate": 6.6939996659782194e-06, "loss": 0.0395, "step": 4061 }, { "epoch": 1.804130579613591, "grad_norm": 0.5407542517674611, "learning_rate": 6.692175650043789e-06, "loss": 0.0403, "step": 4062 }, { "epoch": 1.8045747279591384, "grad_norm": 0.5631536015470302, "learning_rate": 6.690351379746613e-06, "loss": 0.0421, "step": 4063 }, { "epoch": 1.8050188763046857, "grad_norm": 0.5913174820930391, "learning_rate": 6.6885268553609115e-06, "loss": 0.0265, "step": 4064 }, { "epoch": 1.8054630246502332, "grad_norm": 0.576629329530581, "learning_rate": 6.68670207716094e-06, "loss": 0.0393, "step": 4065 }, { "epoch": 1.8059071729957807, "grad_norm": 0.47244671586292697, "learning_rate": 6.6848770454209955e-06, "loss": 0.043, "step": 4066 }, { "epoch": 1.806351321341328, "grad_norm": 0.42376459190231364, "learning_rate": 6.683051760415409e-06, "loss": 0.0349, "step": 4067 }, { "epoch": 1.8067954696868753, "grad_norm": 0.5089448244604045, "learning_rate": 6.681226222418553e-06, "loss": 0.0367, "step": 4068 }, { "epoch": 1.8072396180324228, "grad_norm": 0.7287865087611148, "learning_rate": 6.679400431704837e-06, "loss": 0.0474, "step": 4069 }, { "epoch": 1.8076837663779703, "grad_norm": 0.4089819648317366, "learning_rate": 6.677574388548706e-06, "loss": 0.0273, "step": 4070 }, { "epoch": 1.8081279147235176, "grad_norm": 0.3655870554565193, "learning_rate": 6.67574809322465e-06, "loss": 0.0366, "step": 4071 }, { "epoch": 1.808572063069065, "grad_norm": 0.4524590952965907, "learning_rate": 6.6739215460071885e-06, "loss": 0.0388, "step": 4072 }, { "epoch": 1.8090162114146124, "grad_norm": 0.8966024978124474, "learning_rate": 6.672094747170883e-06, "loss": 0.0566, "step": 4073 }, { "epoch": 1.80946035976016, "grad_norm": 0.3591914341706326, "learning_rate": 6.670267696990335e-06, "loss": 0.0318, "step": 4074 }, { "epoch": 1.8099045081057072, "grad_norm": 0.44339585259060615, "learning_rate": 6.668440395740178e-06, "loss": 0.0363, "step": 4075 }, { "epoch": 1.8103486564512548, "grad_norm": 0.3901591376438945, "learning_rate": 6.666612843695087e-06, "loss": 0.0485, "step": 4076 }, { "epoch": 1.8107928047968023, "grad_norm": 0.4015830587763988, "learning_rate": 6.664785041129777e-06, "loss": 0.0383, "step": 4077 }, { "epoch": 1.8112369531423496, "grad_norm": 0.5367818518459079, "learning_rate": 6.662956988318994e-06, "loss": 0.0456, "step": 4078 }, { "epoch": 1.8116811014878968, "grad_norm": 0.40853437415084287, "learning_rate": 6.661128685537526e-06, "loss": 0.0322, "step": 4079 }, { "epoch": 1.8121252498334444, "grad_norm": 0.6527784454525124, "learning_rate": 6.659300133060201e-06, "loss": 0.0507, "step": 4080 }, { "epoch": 1.8125693981789919, "grad_norm": 0.5074533806315775, "learning_rate": 6.657471331161878e-06, "loss": 0.0432, "step": 4081 }, { "epoch": 1.8130135465245392, "grad_norm": 0.4472089329933828, "learning_rate": 6.65564228011746e-06, "loss": 0.0599, "step": 4082 }, { "epoch": 1.8134576948700865, "grad_norm": 0.47876435411805834, "learning_rate": 6.653812980201882e-06, "loss": 0.0478, "step": 4083 }, { "epoch": 1.813901843215634, "grad_norm": 0.44056500378250846, "learning_rate": 6.651983431690119e-06, "loss": 0.0434, "step": 4084 }, { "epoch": 1.8143459915611815, "grad_norm": 0.42532030681624505, "learning_rate": 6.650153634857183e-06, "loss": 0.0372, "step": 4085 }, { "epoch": 1.8147901399067288, "grad_norm": 0.38789115474665614, "learning_rate": 6.648323589978128e-06, "loss": 0.0332, "step": 4086 }, { "epoch": 1.8152342882522763, "grad_norm": 0.5287105455383381, "learning_rate": 6.646493297328034e-06, "loss": 0.0483, "step": 4087 }, { "epoch": 1.8156784365978238, "grad_norm": 2.7184134061804475, "learning_rate": 6.6446627571820295e-06, "loss": 0.05, "step": 4088 }, { "epoch": 1.816122584943371, "grad_norm": 0.48923408476238167, "learning_rate": 6.642831969815275e-06, "loss": 0.0524, "step": 4089 }, { "epoch": 1.8165667332889184, "grad_norm": 0.4086306503799156, "learning_rate": 6.641000935502968e-06, "loss": 0.0369, "step": 4090 }, { "epoch": 1.817010881634466, "grad_norm": 0.42279116752419027, "learning_rate": 6.639169654520345e-06, "loss": 0.0384, "step": 4091 }, { "epoch": 1.8174550299800134, "grad_norm": 0.47809993483091023, "learning_rate": 6.637338127142678e-06, "loss": 0.0557, "step": 4092 }, { "epoch": 1.8178991783255607, "grad_norm": 0.43894142864567726, "learning_rate": 6.635506353645277e-06, "loss": 0.0442, "step": 4093 }, { "epoch": 1.818343326671108, "grad_norm": 0.4258410799351809, "learning_rate": 6.633674334303489e-06, "loss": 0.0453, "step": 4094 }, { "epoch": 1.8187874750166557, "grad_norm": 0.4223073933724936, "learning_rate": 6.631842069392698e-06, "loss": 0.0425, "step": 4095 }, { "epoch": 1.819231623362203, "grad_norm": 0.379351123137964, "learning_rate": 6.630009559188323e-06, "loss": 0.0333, "step": 4096 }, { "epoch": 1.8196757717077503, "grad_norm": 0.38768367796199465, "learning_rate": 6.628176803965823e-06, "loss": 0.0391, "step": 4097 }, { "epoch": 1.8201199200532978, "grad_norm": 0.34224780916225656, "learning_rate": 6.62634380400069e-06, "loss": 0.0341, "step": 4098 }, { "epoch": 1.8205640683988453, "grad_norm": 0.42049665765180966, "learning_rate": 6.624510559568458e-06, "loss": 0.0332, "step": 4099 }, { "epoch": 1.8210082167443926, "grad_norm": 0.4650798805753388, "learning_rate": 6.622677070944692e-06, "loss": 0.0522, "step": 4100 }, { "epoch": 1.82145236508994, "grad_norm": 0.7403412237132078, "learning_rate": 6.6208433384049974e-06, "loss": 0.0468, "step": 4101 }, { "epoch": 1.8218965134354874, "grad_norm": 0.5003084175625752, "learning_rate": 6.619009362225017e-06, "loss": 0.0369, "step": 4102 }, { "epoch": 1.822340661781035, "grad_norm": 0.4709276604503224, "learning_rate": 6.617175142680426e-06, "loss": 0.0373, "step": 4103 }, { "epoch": 1.8227848101265822, "grad_norm": 0.405634995248283, "learning_rate": 6.615340680046941e-06, "loss": 0.0402, "step": 4104 }, { "epoch": 1.8232289584721297, "grad_norm": 0.48277401824042093, "learning_rate": 6.613505974600313e-06, "loss": 0.0427, "step": 4105 }, { "epoch": 1.8236731068176772, "grad_norm": 0.5146304089304894, "learning_rate": 6.611671026616328e-06, "loss": 0.0449, "step": 4106 }, { "epoch": 1.8241172551632245, "grad_norm": 0.5014137439041658, "learning_rate": 6.609835836370808e-06, "loss": 0.0462, "step": 4107 }, { "epoch": 1.8245614035087718, "grad_norm": 0.5653319356301473, "learning_rate": 6.6080004041396176e-06, "loss": 0.0385, "step": 4108 }, { "epoch": 1.8250055518543193, "grad_norm": 0.43607877589233934, "learning_rate": 6.60616473019865e-06, "loss": 0.0366, "step": 4109 }, { "epoch": 1.8254497001998669, "grad_norm": 0.34512361350227927, "learning_rate": 6.6043288148238405e-06, "loss": 0.0327, "step": 4110 }, { "epoch": 1.8258938485454141, "grad_norm": 0.46278297105670835, "learning_rate": 6.6024926582911576e-06, "loss": 0.0412, "step": 4111 }, { "epoch": 1.8263379968909614, "grad_norm": 0.4906820801194115, "learning_rate": 6.600656260876605e-06, "loss": 0.0381, "step": 4112 }, { "epoch": 1.826782145236509, "grad_norm": 0.48071884316968305, "learning_rate": 6.598819622856227e-06, "loss": 0.0409, "step": 4113 }, { "epoch": 1.8272262935820565, "grad_norm": 0.47730730807604993, "learning_rate": 6.596982744506101e-06, "loss": 0.0557, "step": 4114 }, { "epoch": 1.8276704419276038, "grad_norm": 0.3790889364580573, "learning_rate": 6.595145626102339e-06, "loss": 0.0404, "step": 4115 }, { "epoch": 1.8281145902731513, "grad_norm": 0.463399073158891, "learning_rate": 6.593308267921095e-06, "loss": 0.0411, "step": 4116 }, { "epoch": 1.8285587386186988, "grad_norm": 0.501176246654533, "learning_rate": 6.59147067023855e-06, "loss": 0.0506, "step": 4117 }, { "epoch": 1.829002886964246, "grad_norm": 0.46578549229661426, "learning_rate": 6.58963283333093e-06, "loss": 0.0454, "step": 4118 }, { "epoch": 1.8294470353097934, "grad_norm": 0.4499464900379103, "learning_rate": 6.587794757474493e-06, "loss": 0.0328, "step": 4119 }, { "epoch": 1.8298911836553409, "grad_norm": 0.4379709545832753, "learning_rate": 6.585956442945531e-06, "loss": 0.0342, "step": 4120 }, { "epoch": 1.8303353320008884, "grad_norm": 0.5260425346926577, "learning_rate": 6.584117890020374e-06, "loss": 0.0439, "step": 4121 }, { "epoch": 1.8307794803464357, "grad_norm": 0.426360295901579, "learning_rate": 6.5822790989753905e-06, "loss": 0.0411, "step": 4122 }, { "epoch": 1.831223628691983, "grad_norm": 0.45007999975919516, "learning_rate": 6.5804400700869806e-06, "loss": 0.0443, "step": 4123 }, { "epoch": 1.8316677770375307, "grad_norm": 0.59501919115864, "learning_rate": 6.578600803631579e-06, "loss": 0.0454, "step": 4124 }, { "epoch": 1.832111925383078, "grad_norm": 0.3671794598732458, "learning_rate": 6.5767612998856625e-06, "loss": 0.0393, "step": 4125 }, { "epoch": 1.8325560737286253, "grad_norm": 0.36069972183791427, "learning_rate": 6.574921559125737e-06, "loss": 0.0375, "step": 4126 }, { "epoch": 1.8330002220741728, "grad_norm": 0.47931723820550676, "learning_rate": 6.573081581628349e-06, "loss": 0.0398, "step": 4127 }, { "epoch": 1.8334443704197203, "grad_norm": 0.4393540376137144, "learning_rate": 6.571241367670077e-06, "loss": 0.048, "step": 4128 }, { "epoch": 1.8338885187652676, "grad_norm": 0.5502773042166398, "learning_rate": 6.569400917527536e-06, "loss": 0.0391, "step": 4129 }, { "epoch": 1.8343326671108149, "grad_norm": 0.42922588665741507, "learning_rate": 6.567560231477379e-06, "loss": 0.0371, "step": 4130 }, { "epoch": 1.8347768154563624, "grad_norm": 0.4243115655078441, "learning_rate": 6.56571930979629e-06, "loss": 0.0464, "step": 4131 }, { "epoch": 1.83522096380191, "grad_norm": 0.5637304292180152, "learning_rate": 6.563878152760992e-06, "loss": 0.0587, "step": 4132 }, { "epoch": 1.8356651121474572, "grad_norm": 0.5080427318268489, "learning_rate": 6.562036760648242e-06, "loss": 0.0445, "step": 4133 }, { "epoch": 1.8361092604930047, "grad_norm": 0.5069784829710062, "learning_rate": 6.560195133734833e-06, "loss": 0.0329, "step": 4134 }, { "epoch": 1.8365534088385522, "grad_norm": 0.3415574151612511, "learning_rate": 6.55835327229759e-06, "loss": 0.0292, "step": 4135 }, { "epoch": 1.8369975571840995, "grad_norm": 0.4077726290203496, "learning_rate": 6.556511176613381e-06, "loss": 0.0347, "step": 4136 }, { "epoch": 1.8374417055296468, "grad_norm": 0.5240761531044744, "learning_rate": 6.554668846959102e-06, "loss": 0.0454, "step": 4137 }, { "epoch": 1.8378858538751943, "grad_norm": 0.5670009106024317, "learning_rate": 6.552826283611684e-06, "loss": 0.0488, "step": 4138 }, { "epoch": 1.8383300022207418, "grad_norm": 0.5071828765865913, "learning_rate": 6.5509834868480994e-06, "loss": 0.0464, "step": 4139 }, { "epoch": 1.8387741505662891, "grad_norm": 0.6942163546904666, "learning_rate": 6.54914045694535e-06, "loss": 0.0558, "step": 4140 }, { "epoch": 1.8392182989118364, "grad_norm": 0.42462008853388156, "learning_rate": 6.547297194180473e-06, "loss": 0.028, "step": 4141 }, { "epoch": 1.839662447257384, "grad_norm": 0.3935408254645297, "learning_rate": 6.545453698830545e-06, "loss": 0.028, "step": 4142 }, { "epoch": 1.8401065956029314, "grad_norm": 0.43222089529601715, "learning_rate": 6.543609971172673e-06, "loss": 0.0439, "step": 4143 }, { "epoch": 1.8405507439484787, "grad_norm": 0.8970440945065747, "learning_rate": 6.541766011484001e-06, "loss": 0.0569, "step": 4144 }, { "epoch": 1.8409948922940262, "grad_norm": 0.43018474254953293, "learning_rate": 6.539921820041708e-06, "loss": 0.0429, "step": 4145 }, { "epoch": 1.8414390406395738, "grad_norm": 0.5272129102858879, "learning_rate": 6.538077397123006e-06, "loss": 0.042, "step": 4146 }, { "epoch": 1.841883188985121, "grad_norm": 0.5767831742144044, "learning_rate": 6.536232743005144e-06, "loss": 0.0505, "step": 4147 }, { "epoch": 1.8423273373306683, "grad_norm": 0.5455867077197153, "learning_rate": 6.534387857965405e-06, "loss": 0.0499, "step": 4148 }, { "epoch": 1.8427714856762158, "grad_norm": 0.4392562319330063, "learning_rate": 6.532542742281105e-06, "loss": 0.0404, "step": 4149 }, { "epoch": 1.8432156340217634, "grad_norm": 0.41150977239123576, "learning_rate": 6.5306973962296e-06, "loss": 0.0388, "step": 4150 }, { "epoch": 1.8436597823673107, "grad_norm": 1.313911582342841, "learning_rate": 6.528851820088273e-06, "loss": 0.0595, "step": 4151 }, { "epoch": 1.844103930712858, "grad_norm": 0.3322454559510484, "learning_rate": 6.527006014134546e-06, "loss": 0.0274, "step": 4152 }, { "epoch": 1.8445480790584055, "grad_norm": 0.44783811513765753, "learning_rate": 6.525159978645876e-06, "loss": 0.0391, "step": 4153 }, { "epoch": 1.844992227403953, "grad_norm": 0.5214447787650061, "learning_rate": 6.523313713899755e-06, "loss": 0.0396, "step": 4154 }, { "epoch": 1.8454363757495003, "grad_norm": 0.45172874563459914, "learning_rate": 6.521467220173705e-06, "loss": 0.0356, "step": 4155 }, { "epoch": 1.8458805240950478, "grad_norm": 0.3652608177780825, "learning_rate": 6.519620497745286e-06, "loss": 0.0413, "step": 4156 }, { "epoch": 1.8463246724405953, "grad_norm": 0.435616251575995, "learning_rate": 6.5177735468920935e-06, "loss": 0.0444, "step": 4157 }, { "epoch": 1.8467688207861426, "grad_norm": 0.7590654646546372, "learning_rate": 6.515926367891754e-06, "loss": 0.049, "step": 4158 }, { "epoch": 1.8472129691316899, "grad_norm": 0.45565692087369364, "learning_rate": 6.51407896102193e-06, "loss": 0.0377, "step": 4159 }, { "epoch": 1.8476571174772374, "grad_norm": 1.8021394181856782, "learning_rate": 6.512231326560319e-06, "loss": 0.0632, "step": 4160 }, { "epoch": 1.8481012658227849, "grad_norm": 0.37814866423712995, "learning_rate": 6.510383464784651e-06, "loss": 0.0357, "step": 4161 }, { "epoch": 1.8485454141683322, "grad_norm": 0.5013395214860348, "learning_rate": 6.508535375972691e-06, "loss": 0.0359, "step": 4162 }, { "epoch": 1.8489895625138795, "grad_norm": 0.6460169691477774, "learning_rate": 6.506687060402238e-06, "loss": 0.055, "step": 4163 }, { "epoch": 1.8494337108594272, "grad_norm": 0.731410974029027, "learning_rate": 6.504838518351127e-06, "loss": 0.0418, "step": 4164 }, { "epoch": 1.8498778592049745, "grad_norm": 0.4318554461256705, "learning_rate": 6.502989750097224e-06, "loss": 0.0423, "step": 4165 }, { "epoch": 1.8503220075505218, "grad_norm": 0.4065429352215067, "learning_rate": 6.501140755918428e-06, "loss": 0.0325, "step": 4166 }, { "epoch": 1.8507661558960693, "grad_norm": 0.5268522126273875, "learning_rate": 6.499291536092679e-06, "loss": 0.052, "step": 4167 }, { "epoch": 1.8512103042416168, "grad_norm": 0.4963236453076724, "learning_rate": 6.497442090897943e-06, "loss": 0.0507, "step": 4168 }, { "epoch": 1.851654452587164, "grad_norm": 0.5716468049405105, "learning_rate": 6.495592420612224e-06, "loss": 0.0557, "step": 4169 }, { "epoch": 1.8520986009327114, "grad_norm": 0.36399318359325455, "learning_rate": 6.493742525513556e-06, "loss": 0.0318, "step": 4170 }, { "epoch": 1.852542749278259, "grad_norm": 0.4368593597512668, "learning_rate": 6.491892405880015e-06, "loss": 0.036, "step": 4171 }, { "epoch": 1.8529868976238064, "grad_norm": 0.490968844938355, "learning_rate": 6.490042061989701e-06, "loss": 0.0384, "step": 4172 }, { "epoch": 1.8534310459693537, "grad_norm": 0.4895671851812533, "learning_rate": 6.4881914941207545e-06, "loss": 0.0526, "step": 4173 }, { "epoch": 1.8538751943149012, "grad_norm": 0.4582350620997356, "learning_rate": 6.486340702551347e-06, "loss": 0.044, "step": 4174 }, { "epoch": 1.8543193426604487, "grad_norm": 0.4289059463290211, "learning_rate": 6.484489687559682e-06, "loss": 0.0379, "step": 4175 }, { "epoch": 1.854763491005996, "grad_norm": 0.3956879154243665, "learning_rate": 6.4826384494240006e-06, "loss": 0.0324, "step": 4176 }, { "epoch": 1.8552076393515433, "grad_norm": 0.4900266012746407, "learning_rate": 6.480786988422575e-06, "loss": 0.0472, "step": 4177 }, { "epoch": 1.8556517876970908, "grad_norm": 0.4376815416787356, "learning_rate": 6.47893530483371e-06, "loss": 0.0631, "step": 4178 }, { "epoch": 1.8560959360426383, "grad_norm": 0.3452667188092541, "learning_rate": 6.4770833989357464e-06, "loss": 0.031, "step": 4179 }, { "epoch": 1.8565400843881856, "grad_norm": 0.4575028834398297, "learning_rate": 6.4752312710070565e-06, "loss": 0.0357, "step": 4180 }, { "epoch": 1.856984232733733, "grad_norm": 0.35802922814056004, "learning_rate": 6.4733789213260465e-06, "loss": 0.0363, "step": 4181 }, { "epoch": 1.8574283810792804, "grad_norm": 0.4610516053299678, "learning_rate": 6.471526350171158e-06, "loss": 0.0421, "step": 4182 }, { "epoch": 1.857872529424828, "grad_norm": 0.41994717663617964, "learning_rate": 6.46967355782086e-06, "loss": 0.0427, "step": 4183 }, { "epoch": 1.8583166777703752, "grad_norm": 0.41511781298154615, "learning_rate": 6.4678205445536615e-06, "loss": 0.0388, "step": 4184 }, { "epoch": 1.8587608261159227, "grad_norm": 0.4226703337206278, "learning_rate": 6.465967310648103e-06, "loss": 0.0363, "step": 4185 }, { "epoch": 1.8592049744614703, "grad_norm": 0.3756285974893726, "learning_rate": 6.464113856382752e-06, "loss": 0.0431, "step": 4186 }, { "epoch": 1.8596491228070176, "grad_norm": 0.7660609289210849, "learning_rate": 6.46226018203622e-06, "loss": 0.0548, "step": 4187 }, { "epoch": 1.8600932711525648, "grad_norm": 0.44778181358321073, "learning_rate": 6.460406287887142e-06, "loss": 0.0402, "step": 4188 }, { "epoch": 1.8605374194981124, "grad_norm": 0.4653083552128945, "learning_rate": 6.4585521742141924e-06, "loss": 0.0429, "step": 4189 }, { "epoch": 1.8609815678436599, "grad_norm": 0.4092361015012088, "learning_rate": 6.456697841296072e-06, "loss": 0.0538, "step": 4190 }, { "epoch": 1.8614257161892072, "grad_norm": 0.3943066511337006, "learning_rate": 6.4548432894115236e-06, "loss": 0.0422, "step": 4191 }, { "epoch": 1.8618698645347544, "grad_norm": 0.574897072037143, "learning_rate": 6.452988518839314e-06, "loss": 0.0638, "step": 4192 }, { "epoch": 1.8623140128803022, "grad_norm": 0.3525774963677566, "learning_rate": 6.451133529858249e-06, "loss": 0.0308, "step": 4193 }, { "epoch": 1.8627581612258495, "grad_norm": 0.47268074934306253, "learning_rate": 6.449278322747164e-06, "loss": 0.0443, "step": 4194 }, { "epoch": 1.8632023095713968, "grad_norm": 0.49332043887118054, "learning_rate": 6.447422897784927e-06, "loss": 0.0486, "step": 4195 }, { "epoch": 1.8636464579169443, "grad_norm": 0.4599615865532288, "learning_rate": 6.445567255250442e-06, "loss": 0.0374, "step": 4196 }, { "epoch": 1.8640906062624918, "grad_norm": 0.6216566278556651, "learning_rate": 6.443711395422641e-06, "loss": 0.0458, "step": 4197 }, { "epoch": 1.864534754608039, "grad_norm": 0.45476603291439793, "learning_rate": 6.4418553185804946e-06, "loss": 0.0411, "step": 4198 }, { "epoch": 1.8649789029535864, "grad_norm": 0.5568340998927974, "learning_rate": 6.4399990250030005e-06, "loss": 0.0469, "step": 4199 }, { "epoch": 1.8654230512991339, "grad_norm": 0.5792755550634929, "learning_rate": 6.438142514969192e-06, "loss": 0.0486, "step": 4200 }, { "epoch": 1.8658671996446814, "grad_norm": 0.5484197721468562, "learning_rate": 6.436285788758133e-06, "loss": 0.0373, "step": 4201 }, { "epoch": 1.8663113479902287, "grad_norm": 0.408467206743211, "learning_rate": 6.434428846648923e-06, "loss": 0.0345, "step": 4202 }, { "epoch": 1.8667554963357762, "grad_norm": 0.4273190617477224, "learning_rate": 6.43257168892069e-06, "loss": 0.0521, "step": 4203 }, { "epoch": 1.8671996446813237, "grad_norm": 0.913822616436895, "learning_rate": 6.430714315852595e-06, "loss": 0.0731, "step": 4204 }, { "epoch": 1.867643793026871, "grad_norm": 0.3654209870999856, "learning_rate": 6.428856727723838e-06, "loss": 0.0426, "step": 4205 }, { "epoch": 1.8680879413724183, "grad_norm": 0.35677094964790335, "learning_rate": 6.426998924813641e-06, "loss": 0.0354, "step": 4206 }, { "epoch": 1.8685320897179658, "grad_norm": 0.7442434568098797, "learning_rate": 6.425140907401266e-06, "loss": 0.057, "step": 4207 }, { "epoch": 1.8689762380635133, "grad_norm": 0.5792617957834036, "learning_rate": 6.423282675766002e-06, "loss": 0.0409, "step": 4208 }, { "epoch": 1.8694203864090606, "grad_norm": 0.7180220670929347, "learning_rate": 6.4214242301871766e-06, "loss": 0.055, "step": 4209 }, { "epoch": 1.869864534754608, "grad_norm": 0.369340852755107, "learning_rate": 6.4195655709441425e-06, "loss": 0.0365, "step": 4210 }, { "epoch": 1.8703086831001554, "grad_norm": 0.5907561592268787, "learning_rate": 6.41770669831629e-06, "loss": 0.05, "step": 4211 }, { "epoch": 1.870752831445703, "grad_norm": 0.4737879055532281, "learning_rate": 6.415847612583036e-06, "loss": 0.0387, "step": 4212 }, { "epoch": 1.8711969797912502, "grad_norm": 0.45358931580739376, "learning_rate": 6.413988314023837e-06, "loss": 0.0455, "step": 4213 }, { "epoch": 1.8716411281367977, "grad_norm": 0.396008499582763, "learning_rate": 6.412128802918174e-06, "loss": 0.0428, "step": 4214 }, { "epoch": 1.8720852764823452, "grad_norm": 0.446819083411039, "learning_rate": 6.410269079545563e-06, "loss": 0.0353, "step": 4215 }, { "epoch": 1.8725294248278925, "grad_norm": 0.5360694926018593, "learning_rate": 6.408409144185555e-06, "loss": 0.0394, "step": 4216 }, { "epoch": 1.8729735731734398, "grad_norm": 0.419696086205574, "learning_rate": 6.406548997117728e-06, "loss": 0.0442, "step": 4217 }, { "epoch": 1.8734177215189873, "grad_norm": 0.5541434243753425, "learning_rate": 6.404688638621691e-06, "loss": 0.0362, "step": 4218 }, { "epoch": 1.8738618698645348, "grad_norm": 0.5424748341749609, "learning_rate": 6.402828068977092e-06, "loss": 0.0385, "step": 4219 }, { "epoch": 1.8743060182100821, "grad_norm": 0.5755082545471814, "learning_rate": 6.400967288463604e-06, "loss": 0.0408, "step": 4220 }, { "epoch": 1.8747501665556294, "grad_norm": 0.5534620278515894, "learning_rate": 6.399106297360934e-06, "loss": 0.0394, "step": 4221 }, { "epoch": 1.875194314901177, "grad_norm": 0.43339133128599205, "learning_rate": 6.397245095948822e-06, "loss": 0.0447, "step": 4222 }, { "epoch": 1.8756384632467245, "grad_norm": 0.4242062003649521, "learning_rate": 6.395383684507036e-06, "loss": 0.0367, "step": 4223 }, { "epoch": 1.8760826115922717, "grad_norm": 0.429738984394203, "learning_rate": 6.393522063315379e-06, "loss": 0.0434, "step": 4224 }, { "epoch": 1.8765267599378193, "grad_norm": 0.5905398248805794, "learning_rate": 6.391660232653685e-06, "loss": 0.0387, "step": 4225 }, { "epoch": 1.8769709082833668, "grad_norm": 0.46069485833459556, "learning_rate": 6.389798192801816e-06, "loss": 0.042, "step": 4226 }, { "epoch": 1.877415056628914, "grad_norm": 0.3636972167781559, "learning_rate": 6.387935944039672e-06, "loss": 0.0355, "step": 4227 }, { "epoch": 1.8778592049744613, "grad_norm": 0.4783149493763749, "learning_rate": 6.3860734866471775e-06, "loss": 0.0404, "step": 4228 }, { "epoch": 1.8783033533200089, "grad_norm": 0.3543783240099781, "learning_rate": 6.384210820904292e-06, "loss": 0.0291, "step": 4229 }, { "epoch": 1.8787475016655564, "grad_norm": 0.4823964461174852, "learning_rate": 6.382347947091008e-06, "loss": 0.0454, "step": 4230 }, { "epoch": 1.8791916500111037, "grad_norm": 0.7938854928835402, "learning_rate": 6.380484865487346e-06, "loss": 0.0651, "step": 4231 }, { "epoch": 1.879635798356651, "grad_norm": 0.43202860148998995, "learning_rate": 6.378621576373356e-06, "loss": 0.0373, "step": 4232 }, { "epoch": 1.8800799467021987, "grad_norm": 0.40297846856972874, "learning_rate": 6.376758080029126e-06, "loss": 0.0399, "step": 4233 }, { "epoch": 1.880524095047746, "grad_norm": 0.6777187231862498, "learning_rate": 6.37489437673477e-06, "loss": 0.0451, "step": 4234 }, { "epoch": 1.8809682433932933, "grad_norm": 0.449724408023701, "learning_rate": 6.3730304667704315e-06, "loss": 0.0364, "step": 4235 }, { "epoch": 1.8814123917388408, "grad_norm": 0.508560023808873, "learning_rate": 6.371166350416293e-06, "loss": 0.0357, "step": 4236 }, { "epoch": 1.8818565400843883, "grad_norm": 0.45097496833863476, "learning_rate": 6.369302027952559e-06, "loss": 0.0314, "step": 4237 }, { "epoch": 1.8823006884299356, "grad_norm": 0.3904261571231129, "learning_rate": 6.36743749965947e-06, "loss": 0.0417, "step": 4238 }, { "epoch": 1.8827448367754829, "grad_norm": 0.3804238828895874, "learning_rate": 6.365572765817295e-06, "loss": 0.039, "step": 4239 }, { "epoch": 1.8831889851210304, "grad_norm": 0.5067555487119486, "learning_rate": 6.363707826706336e-06, "loss": 0.0382, "step": 4240 }, { "epoch": 1.883633133466578, "grad_norm": 0.4732566932753566, "learning_rate": 6.3618426826069265e-06, "loss": 0.0471, "step": 4241 }, { "epoch": 1.8840772818121252, "grad_norm": 0.4900514966702779, "learning_rate": 6.359977333799429e-06, "loss": 0.0441, "step": 4242 }, { "epoch": 1.8845214301576727, "grad_norm": 0.5196575689420362, "learning_rate": 6.358111780564233e-06, "loss": 0.0411, "step": 4243 }, { "epoch": 1.8849655785032202, "grad_norm": 0.43773037282352034, "learning_rate": 6.35624602318177e-06, "loss": 0.0402, "step": 4244 }, { "epoch": 1.8854097268487675, "grad_norm": 0.589548376792247, "learning_rate": 6.354380061932489e-06, "loss": 0.0381, "step": 4245 }, { "epoch": 1.8858538751943148, "grad_norm": 0.5624692048798179, "learning_rate": 6.352513897096878e-06, "loss": 0.0394, "step": 4246 }, { "epoch": 1.8862980235398623, "grad_norm": 0.48689251252375637, "learning_rate": 6.3506475289554534e-06, "loss": 0.0524, "step": 4247 }, { "epoch": 1.8867421718854098, "grad_norm": 0.8189635814115437, "learning_rate": 6.3487809577887625e-06, "loss": 0.0587, "step": 4248 }, { "epoch": 1.8871863202309571, "grad_norm": 0.43746947984188156, "learning_rate": 6.346914183877379e-06, "loss": 0.0316, "step": 4249 }, { "epoch": 1.8876304685765044, "grad_norm": 0.5538154749300077, "learning_rate": 6.345047207501916e-06, "loss": 0.0471, "step": 4250 }, { "epoch": 1.888074616922052, "grad_norm": 0.4360679802124053, "learning_rate": 6.34318002894301e-06, "loss": 0.0371, "step": 4251 }, { "epoch": 1.8885187652675994, "grad_norm": 0.5075276918535925, "learning_rate": 6.341312648481328e-06, "loss": 0.0417, "step": 4252 }, { "epoch": 1.8889629136131467, "grad_norm": 0.49156392064774324, "learning_rate": 6.339445066397569e-06, "loss": 0.0423, "step": 4253 }, { "epoch": 1.8894070619586942, "grad_norm": 0.852671422019365, "learning_rate": 6.337577282972465e-06, "loss": 0.0524, "step": 4254 }, { "epoch": 1.8898512103042417, "grad_norm": 0.37799180602912125, "learning_rate": 6.335709298486773e-06, "loss": 0.0303, "step": 4255 }, { "epoch": 1.890295358649789, "grad_norm": 0.4549492374420902, "learning_rate": 6.333841113221283e-06, "loss": 0.0376, "step": 4256 }, { "epoch": 1.8907395069953363, "grad_norm": 0.4682663299692366, "learning_rate": 6.331972727456816e-06, "loss": 0.0264, "step": 4257 }, { "epoch": 1.8911836553408838, "grad_norm": 0.3907535099519968, "learning_rate": 6.330104141474223e-06, "loss": 0.0382, "step": 4258 }, { "epoch": 1.8916278036864314, "grad_norm": 0.43935294276918296, "learning_rate": 6.328235355554382e-06, "loss": 0.0362, "step": 4259 }, { "epoch": 1.8920719520319786, "grad_norm": 0.576541252998893, "learning_rate": 6.326366369978204e-06, "loss": 0.0481, "step": 4260 }, { "epoch": 1.892516100377526, "grad_norm": 0.43778261815885045, "learning_rate": 6.324497185026631e-06, "loss": 0.034, "step": 4261 }, { "epoch": 1.8929602487230737, "grad_norm": 0.44466828672780784, "learning_rate": 6.3226278009806315e-06, "loss": 0.0307, "step": 4262 }, { "epoch": 1.893404397068621, "grad_norm": 0.41339084625105765, "learning_rate": 6.320758218121205e-06, "loss": 0.0443, "step": 4263 }, { "epoch": 1.8938485454141682, "grad_norm": 0.4481199734504317, "learning_rate": 6.318888436729382e-06, "loss": 0.0354, "step": 4264 }, { "epoch": 1.8942926937597158, "grad_norm": 0.6150629213528874, "learning_rate": 6.317018457086226e-06, "loss": 0.046, "step": 4265 }, { "epoch": 1.8947368421052633, "grad_norm": 0.8066271182355852, "learning_rate": 6.31514827947282e-06, "loss": 0.0425, "step": 4266 }, { "epoch": 1.8951809904508106, "grad_norm": 0.5827323877974867, "learning_rate": 6.31327790417029e-06, "loss": 0.0457, "step": 4267 }, { "epoch": 1.8956251387963579, "grad_norm": 0.4984272032963798, "learning_rate": 6.311407331459781e-06, "loss": 0.03, "step": 4268 }, { "epoch": 1.8960692871419054, "grad_norm": 0.5018245857439331, "learning_rate": 6.309536561622474e-06, "loss": 0.0399, "step": 4269 }, { "epoch": 1.8965134354874529, "grad_norm": 0.46552046167173733, "learning_rate": 6.307665594939575e-06, "loss": 0.0379, "step": 4270 }, { "epoch": 1.8969575838330002, "grad_norm": 0.48037243813958513, "learning_rate": 6.3057944316923246e-06, "loss": 0.0293, "step": 4271 }, { "epoch": 1.8974017321785477, "grad_norm": 0.4527119332615495, "learning_rate": 6.30392307216199e-06, "loss": 0.0386, "step": 4272 }, { "epoch": 1.8978458805240952, "grad_norm": 0.419684782365093, "learning_rate": 6.3020515166298665e-06, "loss": 0.0317, "step": 4273 }, { "epoch": 1.8982900288696425, "grad_norm": 0.418987789446061, "learning_rate": 6.300179765377283e-06, "loss": 0.0361, "step": 4274 }, { "epoch": 1.8987341772151898, "grad_norm": 0.58612758503126, "learning_rate": 6.298307818685595e-06, "loss": 0.0416, "step": 4275 }, { "epoch": 1.8991783255607373, "grad_norm": 0.501397444674289, "learning_rate": 6.296435676836188e-06, "loss": 0.0443, "step": 4276 }, { "epoch": 1.8996224739062848, "grad_norm": 0.4267458008276168, "learning_rate": 6.294563340110474e-06, "loss": 0.0465, "step": 4277 }, { "epoch": 1.900066622251832, "grad_norm": 0.5455194479999508, "learning_rate": 6.292690808789901e-06, "loss": 0.055, "step": 4278 }, { "epoch": 1.9005107705973794, "grad_norm": 0.678438654630706, "learning_rate": 6.290818083155941e-06, "loss": 0.0559, "step": 4279 }, { "epoch": 1.900954918942927, "grad_norm": 0.5367006488443361, "learning_rate": 6.288945163490093e-06, "loss": 0.045, "step": 4280 }, { "epoch": 1.9013990672884744, "grad_norm": 0.41451518482226474, "learning_rate": 6.287072050073894e-06, "loss": 0.0384, "step": 4281 }, { "epoch": 1.9018432156340217, "grad_norm": 0.594043909692931, "learning_rate": 6.2851987431889025e-06, "loss": 0.0414, "step": 4282 }, { "epoch": 1.9022873639795692, "grad_norm": 0.4491119239025248, "learning_rate": 6.2833252431167066e-06, "loss": 0.0393, "step": 4283 }, { "epoch": 1.9027315123251167, "grad_norm": 0.39577109884937645, "learning_rate": 6.2814515501389275e-06, "loss": 0.035, "step": 4284 }, { "epoch": 1.903175660670664, "grad_norm": 0.3708469471452283, "learning_rate": 6.279577664537213e-06, "loss": 0.026, "step": 4285 }, { "epoch": 1.9036198090162113, "grad_norm": 0.4178266369288517, "learning_rate": 6.2777035865932375e-06, "loss": 0.0334, "step": 4286 }, { "epoch": 1.9040639573617588, "grad_norm": 0.5196821555520535, "learning_rate": 6.275829316588711e-06, "loss": 0.0454, "step": 4287 }, { "epoch": 1.9045081057073063, "grad_norm": 0.4709231790248879, "learning_rate": 6.273954854805364e-06, "loss": 0.0395, "step": 4288 }, { "epoch": 1.9049522540528536, "grad_norm": 0.47040632233388197, "learning_rate": 6.2720802015249615e-06, "loss": 0.0459, "step": 4289 }, { "epoch": 1.905396402398401, "grad_norm": 0.5132518236758158, "learning_rate": 6.2702053570292976e-06, "loss": 0.0453, "step": 4290 }, { "epoch": 1.9058405507439484, "grad_norm": 0.4975445699296196, "learning_rate": 6.26833032160019e-06, "loss": 0.0384, "step": 4291 }, { "epoch": 1.906284699089496, "grad_norm": 0.42703967829123596, "learning_rate": 6.26645509551949e-06, "loss": 0.036, "step": 4292 }, { "epoch": 1.9067288474350432, "grad_norm": 0.4865047041410074, "learning_rate": 6.264579679069077e-06, "loss": 0.0371, "step": 4293 }, { "epoch": 1.9071729957805907, "grad_norm": 0.5254303510911112, "learning_rate": 6.262704072530856e-06, "loss": 0.0501, "step": 4294 }, { "epoch": 1.9076171441261383, "grad_norm": 0.36509386500252816, "learning_rate": 6.260828276186762e-06, "loss": 0.0366, "step": 4295 }, { "epoch": 1.9080612924716855, "grad_norm": 0.4317077297037696, "learning_rate": 6.258952290318763e-06, "loss": 0.0367, "step": 4296 }, { "epoch": 1.9085054408172328, "grad_norm": 0.4444631290624624, "learning_rate": 6.257076115208847e-06, "loss": 0.0366, "step": 4297 }, { "epoch": 1.9089495891627803, "grad_norm": 0.3303398456720475, "learning_rate": 6.255199751139036e-06, "loss": 0.03, "step": 4298 }, { "epoch": 1.9093937375083279, "grad_norm": 0.33258553726398404, "learning_rate": 6.253323198391383e-06, "loss": 0.0345, "step": 4299 }, { "epoch": 1.9098378858538752, "grad_norm": 0.6445178538911288, "learning_rate": 6.251446457247961e-06, "loss": 0.0444, "step": 4300 }, { "epoch": 1.9102820341994224, "grad_norm": 0.503486774604937, "learning_rate": 6.249569527990878e-06, "loss": 0.0508, "step": 4301 }, { "epoch": 1.9107261825449702, "grad_norm": 0.532598812939261, "learning_rate": 6.247692410902271e-06, "loss": 0.0366, "step": 4302 }, { "epoch": 1.9111703308905175, "grad_norm": 0.5689719802618134, "learning_rate": 6.245815106264297e-06, "loss": 0.0508, "step": 4303 }, { "epoch": 1.9116144792360648, "grad_norm": 0.44795775453400327, "learning_rate": 6.243937614359152e-06, "loss": 0.0425, "step": 4304 }, { "epoch": 1.9120586275816123, "grad_norm": 0.63806779442234, "learning_rate": 6.242059935469051e-06, "loss": 0.0409, "step": 4305 }, { "epoch": 1.9125027759271598, "grad_norm": 0.41950353971072635, "learning_rate": 6.240182069876244e-06, "loss": 0.0344, "step": 4306 }, { "epoch": 1.912946924272707, "grad_norm": 0.599261124567421, "learning_rate": 6.238304017863005e-06, "loss": 0.0415, "step": 4307 }, { "epoch": 1.9133910726182544, "grad_norm": 0.364585724464603, "learning_rate": 6.236425779711637e-06, "loss": 0.0346, "step": 4308 }, { "epoch": 1.9138352209638019, "grad_norm": 0.6254535972915019, "learning_rate": 6.23454735570447e-06, "loss": 0.0509, "step": 4309 }, { "epoch": 1.9142793693093494, "grad_norm": 0.41487059504531737, "learning_rate": 6.232668746123865e-06, "loss": 0.0331, "step": 4310 }, { "epoch": 1.9147235176548967, "grad_norm": 0.5104472659733067, "learning_rate": 6.230789951252208e-06, "loss": 0.0349, "step": 4311 }, { "epoch": 1.9151676660004442, "grad_norm": 0.40794899664296186, "learning_rate": 6.228910971371913e-06, "loss": 0.0408, "step": 4312 }, { "epoch": 1.9156118143459917, "grad_norm": 0.5437919242408334, "learning_rate": 6.227031806765424e-06, "loss": 0.0387, "step": 4313 }, { "epoch": 1.916055962691539, "grad_norm": 0.4898004885001543, "learning_rate": 6.225152457715211e-06, "loss": 0.0501, "step": 4314 }, { "epoch": 1.9165001110370863, "grad_norm": 0.32765369475691036, "learning_rate": 6.223272924503773e-06, "loss": 0.0313, "step": 4315 }, { "epoch": 1.9169442593826338, "grad_norm": 1.1036222737179877, "learning_rate": 6.221393207413634e-06, "loss": 0.0555, "step": 4316 }, { "epoch": 1.9173884077281813, "grad_norm": 0.5313402238818737, "learning_rate": 6.219513306727347e-06, "loss": 0.0435, "step": 4317 }, { "epoch": 1.9178325560737286, "grad_norm": 0.44083561782231834, "learning_rate": 6.217633222727495e-06, "loss": 0.0375, "step": 4318 }, { "epoch": 1.918276704419276, "grad_norm": 0.9040287143232594, "learning_rate": 6.215752955696686e-06, "loss": 0.0741, "step": 4319 }, { "epoch": 1.9187208527648234, "grad_norm": 0.406508099143207, "learning_rate": 6.213872505917554e-06, "loss": 0.0309, "step": 4320 }, { "epoch": 1.919165001110371, "grad_norm": 0.4001138707894935, "learning_rate": 6.2119918736727666e-06, "loss": 0.0292, "step": 4321 }, { "epoch": 1.9196091494559182, "grad_norm": 0.37528739888488877, "learning_rate": 6.210111059245011e-06, "loss": 0.0323, "step": 4322 }, { "epoch": 1.9200532978014657, "grad_norm": 0.4611567927734023, "learning_rate": 6.2082300629170065e-06, "loss": 0.0369, "step": 4323 }, { "epoch": 1.9204974461470132, "grad_norm": 0.4615724924310186, "learning_rate": 6.2063488849715e-06, "loss": 0.0468, "step": 4324 }, { "epoch": 1.9209415944925605, "grad_norm": 0.36847012924343564, "learning_rate": 6.204467525691265e-06, "loss": 0.0403, "step": 4325 }, { "epoch": 1.9213857428381078, "grad_norm": 0.7462560129733484, "learning_rate": 6.202585985359099e-06, "loss": 0.0463, "step": 4326 }, { "epoch": 1.9218298911836553, "grad_norm": 0.4111595385170657, "learning_rate": 6.200704264257832e-06, "loss": 0.0402, "step": 4327 }, { "epoch": 1.9222740395292028, "grad_norm": 0.5886660338070746, "learning_rate": 6.198822362670316e-06, "loss": 0.0409, "step": 4328 }, { "epoch": 1.9227181878747501, "grad_norm": 0.551351371253542, "learning_rate": 6.196940280879436e-06, "loss": 0.0371, "step": 4329 }, { "epoch": 1.9231623362202974, "grad_norm": 0.5211858751187458, "learning_rate": 6.1950580191681e-06, "loss": 0.043, "step": 4330 }, { "epoch": 1.9236064845658452, "grad_norm": 0.4258998039644192, "learning_rate": 6.193175577819242e-06, "loss": 0.0353, "step": 4331 }, { "epoch": 1.9240506329113924, "grad_norm": 0.4043175914509727, "learning_rate": 6.191292957115825e-06, "loss": 0.0285, "step": 4332 }, { "epoch": 1.9244947812569397, "grad_norm": 0.4563276066272647, "learning_rate": 6.1894101573408425e-06, "loss": 0.0371, "step": 4333 }, { "epoch": 1.9249389296024872, "grad_norm": 0.54090150421601, "learning_rate": 6.1875271787773075e-06, "loss": 0.0519, "step": 4334 }, { "epoch": 1.9253830779480348, "grad_norm": 0.3562700559697815, "learning_rate": 6.185644021708266e-06, "loss": 0.0267, "step": 4335 }, { "epoch": 1.925827226293582, "grad_norm": 1.2138310918842605, "learning_rate": 6.183760686416785e-06, "loss": 0.0483, "step": 4336 }, { "epoch": 1.9262713746391293, "grad_norm": 0.4024294145624342, "learning_rate": 6.181877173185966e-06, "loss": 0.0311, "step": 4337 }, { "epoch": 1.9267155229846769, "grad_norm": 0.6790926459631366, "learning_rate": 6.1799934822989315e-06, "loss": 0.0494, "step": 4338 }, { "epoch": 1.9271596713302244, "grad_norm": 0.48577305067511145, "learning_rate": 6.178109614038832e-06, "loss": 0.0412, "step": 4339 }, { "epoch": 1.9276038196757717, "grad_norm": 0.4461768395787564, "learning_rate": 6.176225568688844e-06, "loss": 0.0337, "step": 4340 }, { "epoch": 1.9280479680213192, "grad_norm": 0.38001980047585654, "learning_rate": 6.174341346532173e-06, "loss": 0.0404, "step": 4341 }, { "epoch": 1.9284921163668667, "grad_norm": 0.4252479347353827, "learning_rate": 6.1724569478520495e-06, "loss": 0.0452, "step": 4342 }, { "epoch": 1.928936264712414, "grad_norm": 0.3799614809822163, "learning_rate": 6.1705723729317295e-06, "loss": 0.0401, "step": 4343 }, { "epoch": 1.9293804130579613, "grad_norm": 0.8795175854169174, "learning_rate": 6.168687622054497e-06, "loss": 0.0419, "step": 4344 }, { "epoch": 1.9298245614035088, "grad_norm": 0.35580314936274277, "learning_rate": 6.1668026955036645e-06, "loss": 0.0275, "step": 4345 }, { "epoch": 1.9302687097490563, "grad_norm": 0.5402722431604882, "learning_rate": 6.1649175935625635e-06, "loss": 0.0459, "step": 4346 }, { "epoch": 1.9307128580946036, "grad_norm": 0.6557570062151068, "learning_rate": 6.1630323165145615e-06, "loss": 0.0463, "step": 4347 }, { "epoch": 1.9311570064401509, "grad_norm": 0.45047259263477546, "learning_rate": 6.161146864643045e-06, "loss": 0.0408, "step": 4348 }, { "epoch": 1.9316011547856984, "grad_norm": 0.4701310559005135, "learning_rate": 6.159261238231431e-06, "loss": 0.0379, "step": 4349 }, { "epoch": 1.932045303131246, "grad_norm": 0.4022795195574756, "learning_rate": 6.15737543756316e-06, "loss": 0.0372, "step": 4350 }, { "epoch": 1.9324894514767932, "grad_norm": 0.4838913493812182, "learning_rate": 6.1554894629217e-06, "loss": 0.0512, "step": 4351 }, { "epoch": 1.9329335998223407, "grad_norm": 1.2278585350906228, "learning_rate": 6.153603314590547e-06, "loss": 0.0864, "step": 4352 }, { "epoch": 1.9333777481678882, "grad_norm": 0.354715018531944, "learning_rate": 6.1517169928532185e-06, "loss": 0.0324, "step": 4353 }, { "epoch": 1.9338218965134355, "grad_norm": 0.3643440217338777, "learning_rate": 6.149830497993261e-06, "loss": 0.0346, "step": 4354 }, { "epoch": 1.9342660448589828, "grad_norm": 0.5368481909179409, "learning_rate": 6.147943830294248e-06, "loss": 0.0406, "step": 4355 }, { "epoch": 1.9347101932045303, "grad_norm": 0.3847206541326022, "learning_rate": 6.146056990039777e-06, "loss": 0.0291, "step": 4356 }, { "epoch": 1.9351543415500778, "grad_norm": 0.4195486386731195, "learning_rate": 6.1441699775134724e-06, "loss": 0.037, "step": 4357 }, { "epoch": 1.935598489895625, "grad_norm": 0.4387846462865948, "learning_rate": 6.142282792998985e-06, "loss": 0.0336, "step": 4358 }, { "epoch": 1.9360426382411724, "grad_norm": 0.33683241345246556, "learning_rate": 6.14039543677999e-06, "loss": 0.0357, "step": 4359 }, { "epoch": 1.93648678658672, "grad_norm": 0.39849510173786257, "learning_rate": 6.138507909140187e-06, "loss": 0.0384, "step": 4360 }, { "epoch": 1.9369309349322674, "grad_norm": 0.4846553810738608, "learning_rate": 6.136620210363307e-06, "loss": 0.0488, "step": 4361 }, { "epoch": 1.9373750832778147, "grad_norm": 0.44711186795351665, "learning_rate": 6.1347323407331e-06, "loss": 0.0403, "step": 4362 }, { "epoch": 1.9378192316233622, "grad_norm": 0.546459442186708, "learning_rate": 6.132844300533348e-06, "loss": 0.0442, "step": 4363 }, { "epoch": 1.9382633799689097, "grad_norm": 0.6727853387902581, "learning_rate": 6.130956090047852e-06, "loss": 0.0542, "step": 4364 }, { "epoch": 1.938707528314457, "grad_norm": 0.5269547149952615, "learning_rate": 6.129067709560445e-06, "loss": 0.0553, "step": 4365 }, { "epoch": 1.9391516766600043, "grad_norm": 1.3382161278121019, "learning_rate": 6.127179159354985e-06, "loss": 0.0426, "step": 4366 }, { "epoch": 1.9395958250055518, "grad_norm": 0.4304150369100395, "learning_rate": 6.125290439715346e-06, "loss": 0.0373, "step": 4367 }, { "epoch": 1.9400399733510993, "grad_norm": 0.43883673281798163, "learning_rate": 6.12340155092544e-06, "loss": 0.0377, "step": 4368 }, { "epoch": 1.9404841216966466, "grad_norm": 0.5188193338070988, "learning_rate": 6.121512493269197e-06, "loss": 0.0371, "step": 4369 }, { "epoch": 1.9409282700421941, "grad_norm": 0.505484234164325, "learning_rate": 6.119623267030576e-06, "loss": 0.0402, "step": 4370 }, { "epoch": 1.9413724183877417, "grad_norm": 0.5176850701235032, "learning_rate": 6.1177338724935576e-06, "loss": 0.0466, "step": 4371 }, { "epoch": 1.941816566733289, "grad_norm": 0.4703785260705156, "learning_rate": 6.115844309942153e-06, "loss": 0.04, "step": 4372 }, { "epoch": 1.9422607150788362, "grad_norm": 0.43937571329979475, "learning_rate": 6.1139545796603925e-06, "loss": 0.0395, "step": 4373 }, { "epoch": 1.9427048634243838, "grad_norm": 0.5671825068350916, "learning_rate": 6.112064681932335e-06, "loss": 0.0445, "step": 4374 }, { "epoch": 1.9431490117699313, "grad_norm": 0.38466158842057985, "learning_rate": 6.110174617042066e-06, "loss": 0.033, "step": 4375 }, { "epoch": 1.9435931601154786, "grad_norm": 0.46940056697212595, "learning_rate": 6.108284385273695e-06, "loss": 0.0448, "step": 4376 }, { "epoch": 1.9440373084610258, "grad_norm": 0.4747699143093753, "learning_rate": 6.106393986911353e-06, "loss": 0.0459, "step": 4377 }, { "epoch": 1.9444814568065734, "grad_norm": 0.4922037733238623, "learning_rate": 6.1045034222392e-06, "loss": 0.0462, "step": 4378 }, { "epoch": 1.9449256051521209, "grad_norm": 0.48074571927831655, "learning_rate": 6.102612691541422e-06, "loss": 0.0433, "step": 4379 }, { "epoch": 1.9453697534976682, "grad_norm": 0.45838348922641425, "learning_rate": 6.1007217951022244e-06, "loss": 0.0567, "step": 4380 }, { "epoch": 1.9458139018432157, "grad_norm": 0.39557432428184086, "learning_rate": 6.098830733205844e-06, "loss": 0.0318, "step": 4381 }, { "epoch": 1.9462580501887632, "grad_norm": 0.4177848324601465, "learning_rate": 6.096939506136539e-06, "loss": 0.0475, "step": 4382 }, { "epoch": 1.9467021985343105, "grad_norm": 0.4875377634328118, "learning_rate": 6.095048114178591e-06, "loss": 0.0385, "step": 4383 }, { "epoch": 1.9471463468798578, "grad_norm": 0.3458943442785933, "learning_rate": 6.093156557616311e-06, "loss": 0.0259, "step": 4384 }, { "epoch": 1.9475904952254053, "grad_norm": 0.48182367581929, "learning_rate": 6.09126483673403e-06, "loss": 0.0516, "step": 4385 }, { "epoch": 1.9480346435709528, "grad_norm": 0.4073510855592602, "learning_rate": 6.089372951816108e-06, "loss": 0.0313, "step": 4386 }, { "epoch": 1.9484787919165, "grad_norm": 0.48547952748532, "learning_rate": 6.087480903146926e-06, "loss": 0.0445, "step": 4387 }, { "epoch": 1.9489229402620474, "grad_norm": 0.5136489604538488, "learning_rate": 6.085588691010888e-06, "loss": 0.0452, "step": 4388 }, { "epoch": 1.9493670886075949, "grad_norm": 0.37826853557517365, "learning_rate": 6.0836963156924335e-06, "loss": 0.0356, "step": 4389 }, { "epoch": 1.9498112369531424, "grad_norm": 0.4912572577888434, "learning_rate": 6.081803777476012e-06, "loss": 0.0462, "step": 4390 }, { "epoch": 1.9502553852986897, "grad_norm": 0.4860207069904635, "learning_rate": 6.079911076646106e-06, "loss": 0.0425, "step": 4391 }, { "epoch": 1.9506995336442372, "grad_norm": 0.3704904721214982, "learning_rate": 6.07801821348722e-06, "loss": 0.032, "step": 4392 }, { "epoch": 1.9511436819897847, "grad_norm": 0.4740957603042776, "learning_rate": 6.076125188283885e-06, "loss": 0.0447, "step": 4393 }, { "epoch": 1.951587830335332, "grad_norm": 0.4597613542477622, "learning_rate": 6.074232001320654e-06, "loss": 0.0508, "step": 4394 }, { "epoch": 1.9520319786808793, "grad_norm": 0.35666990194120035, "learning_rate": 6.072338652882105e-06, "loss": 0.036, "step": 4395 }, { "epoch": 1.9524761270264268, "grad_norm": 0.5521794806193095, "learning_rate": 6.070445143252842e-06, "loss": 0.0555, "step": 4396 }, { "epoch": 1.9529202753719743, "grad_norm": 0.4402940395866301, "learning_rate": 6.0685514727174885e-06, "loss": 0.0365, "step": 4397 }, { "epoch": 1.9533644237175216, "grad_norm": 0.45905507047472427, "learning_rate": 6.066657641560697e-06, "loss": 0.0371, "step": 4398 }, { "epoch": 1.953808572063069, "grad_norm": 0.5661699028159852, "learning_rate": 6.064763650067145e-06, "loss": 0.0382, "step": 4399 }, { "epoch": 1.9542527204086166, "grad_norm": 0.35700483978034786, "learning_rate": 6.062869498521527e-06, "loss": 0.0366, "step": 4400 }, { "epoch": 1.954696868754164, "grad_norm": 0.43424661857402086, "learning_rate": 6.060975187208569e-06, "loss": 0.0489, "step": 4401 }, { "epoch": 1.9551410170997112, "grad_norm": 0.4129343066796859, "learning_rate": 6.059080716413016e-06, "loss": 0.0389, "step": 4402 }, { "epoch": 1.9555851654452587, "grad_norm": 0.5034728724416178, "learning_rate": 6.057186086419643e-06, "loss": 0.043, "step": 4403 }, { "epoch": 1.9560293137908062, "grad_norm": 0.36382177248066033, "learning_rate": 6.055291297513243e-06, "loss": 0.0305, "step": 4404 }, { "epoch": 1.9564734621363535, "grad_norm": 0.42171053505933703, "learning_rate": 6.053396349978632e-06, "loss": 0.0537, "step": 4405 }, { "epoch": 1.9569176104819008, "grad_norm": 0.42351346045820926, "learning_rate": 6.0515012441006574e-06, "loss": 0.0444, "step": 4406 }, { "epoch": 1.9573617588274483, "grad_norm": 0.3951815066981233, "learning_rate": 6.0496059801641835e-06, "loss": 0.0329, "step": 4407 }, { "epoch": 1.9578059071729959, "grad_norm": 0.6939750072874196, "learning_rate": 6.047710558454102e-06, "loss": 0.0495, "step": 4408 }, { "epoch": 1.9582500555185431, "grad_norm": 0.47139234544641123, "learning_rate": 6.0458149792553245e-06, "loss": 0.0338, "step": 4409 }, { "epoch": 1.9586942038640907, "grad_norm": 0.4743764736759579, "learning_rate": 6.043919242852792e-06, "loss": 0.0401, "step": 4410 }, { "epoch": 1.9591383522096382, "grad_norm": 0.7527833386589416, "learning_rate": 6.042023349531463e-06, "loss": 0.046, "step": 4411 }, { "epoch": 1.9595825005551855, "grad_norm": 0.4434725389118798, "learning_rate": 6.040127299576324e-06, "loss": 0.0366, "step": 4412 }, { "epoch": 1.9600266489007327, "grad_norm": 0.5310028240539822, "learning_rate": 6.038231093272383e-06, "loss": 0.0475, "step": 4413 }, { "epoch": 1.9604707972462803, "grad_norm": 0.6870302944712058, "learning_rate": 6.036334730904672e-06, "loss": 0.0401, "step": 4414 }, { "epoch": 1.9609149455918278, "grad_norm": 0.3754707520324698, "learning_rate": 6.034438212758249e-06, "loss": 0.0344, "step": 4415 }, { "epoch": 1.961359093937375, "grad_norm": 0.7273648929204632, "learning_rate": 6.032541539118188e-06, "loss": 0.059, "step": 4416 }, { "epoch": 1.9618032422829224, "grad_norm": 0.5563803052374947, "learning_rate": 6.030644710269595e-06, "loss": 0.0337, "step": 4417 }, { "epoch": 1.9622473906284699, "grad_norm": 0.5196612260430283, "learning_rate": 6.028747726497594e-06, "loss": 0.043, "step": 4418 }, { "epoch": 1.9626915389740174, "grad_norm": 0.37874941124980105, "learning_rate": 6.026850588087334e-06, "loss": 0.0438, "step": 4419 }, { "epoch": 1.9631356873195647, "grad_norm": 0.3791041958907497, "learning_rate": 6.024953295323987e-06, "loss": 0.0318, "step": 4420 }, { "epoch": 1.9635798356651122, "grad_norm": 0.7223736573889502, "learning_rate": 6.02305584849275e-06, "loss": 0.0406, "step": 4421 }, { "epoch": 1.9640239840106597, "grad_norm": 0.5182650699628792, "learning_rate": 6.02115824787884e-06, "loss": 0.0292, "step": 4422 }, { "epoch": 1.964468132356207, "grad_norm": 0.42934775755171317, "learning_rate": 6.019260493767499e-06, "loss": 0.0328, "step": 4423 }, { "epoch": 1.9649122807017543, "grad_norm": 0.5027721140555967, "learning_rate": 6.0173625864439924e-06, "loss": 0.0328, "step": 4424 }, { "epoch": 1.9653564290473018, "grad_norm": 0.4239046281164633, "learning_rate": 6.015464526193605e-06, "loss": 0.0402, "step": 4425 }, { "epoch": 1.9658005773928493, "grad_norm": 0.3685503956667686, "learning_rate": 6.013566313301651e-06, "loss": 0.0356, "step": 4426 }, { "epoch": 1.9662447257383966, "grad_norm": 0.5335432976601984, "learning_rate": 6.011667948053462e-06, "loss": 0.0488, "step": 4427 }, { "epoch": 1.9666888740839439, "grad_norm": 0.41452010755189883, "learning_rate": 6.009769430734395e-06, "loss": 0.0458, "step": 4428 }, { "epoch": 1.9671330224294914, "grad_norm": 0.4362710704789863, "learning_rate": 6.007870761629831e-06, "loss": 0.0399, "step": 4429 }, { "epoch": 1.967577170775039, "grad_norm": 0.40405492276821936, "learning_rate": 6.005971941025171e-06, "loss": 0.0338, "step": 4430 }, { "epoch": 1.9680213191205862, "grad_norm": 0.602695490290635, "learning_rate": 6.004072969205838e-06, "loss": 0.0684, "step": 4431 }, { "epoch": 1.9684654674661337, "grad_norm": 0.47106411491289435, "learning_rate": 6.002173846457282e-06, "loss": 0.0402, "step": 4432 }, { "epoch": 1.9689096158116812, "grad_norm": 0.5941226728121359, "learning_rate": 6.0002745730649725e-06, "loss": 0.0376, "step": 4433 }, { "epoch": 1.9693537641572285, "grad_norm": 0.3952404366832551, "learning_rate": 5.998375149314404e-06, "loss": 0.0375, "step": 4434 }, { "epoch": 1.9697979125027758, "grad_norm": 0.5414636177351856, "learning_rate": 5.996475575491091e-06, "loss": 0.036, "step": 4435 }, { "epoch": 1.9702420608483233, "grad_norm": 0.35141134241433086, "learning_rate": 5.994575851880571e-06, "loss": 0.0299, "step": 4436 }, { "epoch": 1.9706862091938708, "grad_norm": 0.5469528061005677, "learning_rate": 5.992675978768406e-06, "loss": 0.0312, "step": 4437 }, { "epoch": 1.9711303575394181, "grad_norm": 0.4485140591176533, "learning_rate": 5.99077595644018e-06, "loss": 0.0397, "step": 4438 }, { "epoch": 1.9715745058849656, "grad_norm": 0.4559475041999291, "learning_rate": 5.988875785181496e-06, "loss": 0.0399, "step": 4439 }, { "epoch": 1.9720186542305131, "grad_norm": 0.5184191786649656, "learning_rate": 5.986975465277983e-06, "loss": 0.042, "step": 4440 }, { "epoch": 1.9724628025760604, "grad_norm": 0.5018306786878316, "learning_rate": 5.9850749970152935e-06, "loss": 0.0433, "step": 4441 }, { "epoch": 1.9729069509216077, "grad_norm": 0.41853899287083135, "learning_rate": 5.983174380679096e-06, "loss": 0.0311, "step": 4442 }, { "epoch": 1.9733510992671552, "grad_norm": 0.3979151331311135, "learning_rate": 5.98127361655509e-06, "loss": 0.0393, "step": 4443 }, { "epoch": 1.9737952476127028, "grad_norm": 0.4773579735193138, "learning_rate": 5.979372704928991e-06, "loss": 0.0404, "step": 4444 }, { "epoch": 1.97423939595825, "grad_norm": 0.5742352016540867, "learning_rate": 5.977471646086535e-06, "loss": 0.0371, "step": 4445 }, { "epoch": 1.9746835443037973, "grad_norm": 0.4462263069511724, "learning_rate": 5.97557044031349e-06, "loss": 0.0327, "step": 4446 }, { "epoch": 1.9751276926493448, "grad_norm": 0.6732019962659918, "learning_rate": 5.973669087895633e-06, "loss": 0.0596, "step": 4447 }, { "epoch": 1.9755718409948924, "grad_norm": 0.485770657972991, "learning_rate": 5.971767589118772e-06, "loss": 0.0337, "step": 4448 }, { "epoch": 1.9760159893404396, "grad_norm": 0.342670431757156, "learning_rate": 5.969865944268737e-06, "loss": 0.0303, "step": 4449 }, { "epoch": 1.9764601376859872, "grad_norm": 0.4112414603189444, "learning_rate": 5.9679641536313734e-06, "loss": 0.038, "step": 4450 }, { "epoch": 1.9769042860315347, "grad_norm": 0.6127929425160972, "learning_rate": 5.9660622174925564e-06, "loss": 0.0537, "step": 4451 }, { "epoch": 1.977348434377082, "grad_norm": 0.33024844150086474, "learning_rate": 5.964160136138177e-06, "loss": 0.0353, "step": 4452 }, { "epoch": 1.9777925827226293, "grad_norm": 0.5529535095882723, "learning_rate": 5.96225790985415e-06, "loss": 0.0442, "step": 4453 }, { "epoch": 1.9782367310681768, "grad_norm": 0.4225770450854345, "learning_rate": 5.960355538926414e-06, "loss": 0.0494, "step": 4454 }, { "epoch": 1.9786808794137243, "grad_norm": 0.5282007627303459, "learning_rate": 5.958453023640928e-06, "loss": 0.0416, "step": 4455 }, { "epoch": 1.9791250277592716, "grad_norm": 0.3898204372093461, "learning_rate": 5.956550364283671e-06, "loss": 0.0373, "step": 4456 }, { "epoch": 1.9795691761048189, "grad_norm": 0.6543021886189491, "learning_rate": 5.954647561140643e-06, "loss": 0.0406, "step": 4457 }, { "epoch": 1.9800133244503664, "grad_norm": 0.7003740987141756, "learning_rate": 5.952744614497872e-06, "loss": 0.0437, "step": 4458 }, { "epoch": 1.9804574727959139, "grad_norm": 0.4937974164168222, "learning_rate": 5.9508415246414e-06, "loss": 0.0539, "step": 4459 }, { "epoch": 1.9809016211414612, "grad_norm": 0.46863912061469276, "learning_rate": 5.948938291857296e-06, "loss": 0.0503, "step": 4460 }, { "epoch": 1.9813457694870087, "grad_norm": 0.5235494039593215, "learning_rate": 5.947034916431646e-06, "loss": 0.0462, "step": 4461 }, { "epoch": 1.9817899178325562, "grad_norm": 0.4097032398279417, "learning_rate": 5.945131398650561e-06, "loss": 0.038, "step": 4462 }, { "epoch": 1.9822340661781035, "grad_norm": 0.4528672743260559, "learning_rate": 5.943227738800172e-06, "loss": 0.0558, "step": 4463 }, { "epoch": 1.9826782145236508, "grad_norm": 0.5204018448947473, "learning_rate": 5.941323937166632e-06, "loss": 0.0522, "step": 4464 }, { "epoch": 1.9831223628691983, "grad_norm": 0.3540421279343773, "learning_rate": 5.939419994036113e-06, "loss": 0.038, "step": 4465 }, { "epoch": 1.9835665112147458, "grad_norm": 0.6263363230917521, "learning_rate": 5.937515909694811e-06, "loss": 0.0569, "step": 4466 }, { "epoch": 1.984010659560293, "grad_norm": 0.4116502316940924, "learning_rate": 5.9356116844289426e-06, "loss": 0.0413, "step": 4467 }, { "epoch": 1.9844548079058404, "grad_norm": 0.44077017956645004, "learning_rate": 5.933707318524744e-06, "loss": 0.0333, "step": 4468 }, { "epoch": 1.9848989562513881, "grad_norm": 0.3729944832338479, "learning_rate": 5.931802812268476e-06, "loss": 0.0327, "step": 4469 }, { "epoch": 1.9853431045969354, "grad_norm": 0.40626287554685864, "learning_rate": 5.929898165946416e-06, "loss": 0.0363, "step": 4470 }, { "epoch": 1.9857872529424827, "grad_norm": 0.5950032125541065, "learning_rate": 5.927993379844864e-06, "loss": 0.0426, "step": 4471 }, { "epoch": 1.9862314012880302, "grad_norm": 0.40799770820647496, "learning_rate": 5.9260884542501455e-06, "loss": 0.044, "step": 4472 }, { "epoch": 1.9866755496335777, "grad_norm": 0.41140378050861237, "learning_rate": 5.9241833894486e-06, "loss": 0.0482, "step": 4473 }, { "epoch": 1.987119697979125, "grad_norm": 0.40378379008973436, "learning_rate": 5.922278185726591e-06, "loss": 0.0327, "step": 4474 }, { "epoch": 1.9875638463246723, "grad_norm": 0.41453258443290436, "learning_rate": 5.920372843370504e-06, "loss": 0.0416, "step": 4475 }, { "epoch": 1.9880079946702198, "grad_norm": 0.5109670847768875, "learning_rate": 5.9184673626667455e-06, "loss": 0.039, "step": 4476 }, { "epoch": 1.9884521430157673, "grad_norm": 0.5686294228064248, "learning_rate": 5.9165617439017395e-06, "loss": 0.0534, "step": 4477 }, { "epoch": 1.9888962913613146, "grad_norm": 0.4064180941136422, "learning_rate": 5.914655987361934e-06, "loss": 0.0328, "step": 4478 }, { "epoch": 1.9893404397068621, "grad_norm": 0.5070558821658653, "learning_rate": 5.912750093333796e-06, "loss": 0.0542, "step": 4479 }, { "epoch": 1.9897845880524097, "grad_norm": 0.3530913079623509, "learning_rate": 5.910844062103814e-06, "loss": 0.0395, "step": 4480 }, { "epoch": 1.990228736397957, "grad_norm": 0.43523429839148015, "learning_rate": 5.908937893958497e-06, "loss": 0.0366, "step": 4481 }, { "epoch": 1.9906728847435042, "grad_norm": 0.4229033433291945, "learning_rate": 5.907031589184374e-06, "loss": 0.0383, "step": 4482 }, { "epoch": 1.9911170330890517, "grad_norm": 0.39083478145330547, "learning_rate": 5.905125148067997e-06, "loss": 0.0371, "step": 4483 }, { "epoch": 1.9915611814345993, "grad_norm": 0.5025271192866929, "learning_rate": 5.9032185708959354e-06, "loss": 0.0395, "step": 4484 }, { "epoch": 1.9920053297801465, "grad_norm": 0.349650340955071, "learning_rate": 5.901311857954777e-06, "loss": 0.0266, "step": 4485 }, { "epoch": 1.9924494781256938, "grad_norm": 0.5279876104400976, "learning_rate": 5.899405009531136e-06, "loss": 0.0402, "step": 4486 }, { "epoch": 1.9928936264712414, "grad_norm": 0.6233845443742896, "learning_rate": 5.897498025911645e-06, "loss": 0.048, "step": 4487 }, { "epoch": 1.9933377748167889, "grad_norm": 0.49026655627892524, "learning_rate": 5.8955909073829555e-06, "loss": 0.0412, "step": 4488 }, { "epoch": 1.9937819231623362, "grad_norm": 0.5000915892039338, "learning_rate": 5.893683654231737e-06, "loss": 0.0394, "step": 4489 }, { "epoch": 1.9942260715078837, "grad_norm": 0.5000990386217609, "learning_rate": 5.891776266744686e-06, "loss": 0.0355, "step": 4490 }, { "epoch": 1.9946702198534312, "grad_norm": 0.39359957737650036, "learning_rate": 5.889868745208514e-06, "loss": 0.0304, "step": 4491 }, { "epoch": 1.9951143681989785, "grad_norm": 0.9651263441707514, "learning_rate": 5.8879610899099505e-06, "loss": 0.0529, "step": 4492 }, { "epoch": 1.9955585165445258, "grad_norm": 0.32907807731556993, "learning_rate": 5.886053301135755e-06, "loss": 0.0322, "step": 4493 }, { "epoch": 1.9960026648900733, "grad_norm": 0.4231846959633054, "learning_rate": 5.8841453791726944e-06, "loss": 0.0362, "step": 4494 }, { "epoch": 1.9964468132356208, "grad_norm": 0.4958581720016726, "learning_rate": 5.882237324307564e-06, "loss": 0.0304, "step": 4495 }, { "epoch": 1.996890961581168, "grad_norm": 0.44600019470214586, "learning_rate": 5.880329136827178e-06, "loss": 0.0405, "step": 4496 }, { "epoch": 1.9973351099267154, "grad_norm": 0.5659602027375302, "learning_rate": 5.878420817018369e-06, "loss": 0.0472, "step": 4497 }, { "epoch": 1.9977792582722629, "grad_norm": 1.0132374448664145, "learning_rate": 5.87651236516799e-06, "loss": 0.0706, "step": 4498 }, { "epoch": 1.9982234066178104, "grad_norm": 0.4914853042492375, "learning_rate": 5.874603781562911e-06, "loss": 0.0422, "step": 4499 }, { "epoch": 1.9986675549633577, "grad_norm": 0.40024702771181153, "learning_rate": 5.872695066490028e-06, "loss": 0.0344, "step": 4500 }, { "epoch": 1.9991117033089052, "grad_norm": 0.49874140037011355, "learning_rate": 5.870786220236253e-06, "loss": 0.0417, "step": 4501 }, { "epoch": 1.9995558516544527, "grad_norm": 0.47988916644310126, "learning_rate": 5.868877243088515e-06, "loss": 0.0441, "step": 4502 }, { "epoch": 2.0, "grad_norm": 0.904943869460001, "learning_rate": 5.866968135333769e-06, "loss": 0.0502, "step": 4503 }, { "epoch": 2.0, "eval_loss": 0.04465880244970322, "eval_runtime": 403.5178, "eval_samples_per_second": 37.584, "eval_steps_per_second": 1.175, "step": 4503 }, { "epoch": 2.0004441483455473, "grad_norm": 0.4375728669928517, "learning_rate": 5.8650588972589865e-06, "loss": 0.0263, "step": 4504 }, { "epoch": 2.000888296691095, "grad_norm": 0.9215657711078544, "learning_rate": 5.863149529151154e-06, "loss": 0.0492, "step": 4505 }, { "epoch": 2.0013324450366423, "grad_norm": 0.5271072063506441, "learning_rate": 5.8612400312972865e-06, "loss": 0.0508, "step": 4506 }, { "epoch": 2.0017765933821896, "grad_norm": 0.48411386914362037, "learning_rate": 5.859330403984413e-06, "loss": 0.0371, "step": 4507 }, { "epoch": 2.002220741727737, "grad_norm": 0.4017233429265068, "learning_rate": 5.85742064749958e-06, "loss": 0.0418, "step": 4508 }, { "epoch": 2.0026648900732846, "grad_norm": 0.4120119737365717, "learning_rate": 5.85551076212986e-06, "loss": 0.046, "step": 4509 }, { "epoch": 2.003109038418832, "grad_norm": 0.4366707287416164, "learning_rate": 5.8536007481623406e-06, "loss": 0.0443, "step": 4510 }, { "epoch": 2.003553186764379, "grad_norm": 0.7145508695971371, "learning_rate": 5.851690605884127e-06, "loss": 0.0532, "step": 4511 }, { "epoch": 2.0039973351099265, "grad_norm": 0.5011936156881229, "learning_rate": 5.84978033558235e-06, "loss": 0.0354, "step": 4512 }, { "epoch": 2.0044414834554742, "grad_norm": 0.36581304940892057, "learning_rate": 5.847869937544151e-06, "loss": 0.0294, "step": 4513 }, { "epoch": 2.0048856318010215, "grad_norm": 0.37996248718965503, "learning_rate": 5.845959412056699e-06, "loss": 0.0286, "step": 4514 }, { "epoch": 2.005329780146569, "grad_norm": 0.5097034078222489, "learning_rate": 5.844048759407177e-06, "loss": 0.0414, "step": 4515 }, { "epoch": 2.0057739284921166, "grad_norm": 0.4963020471009325, "learning_rate": 5.842137979882786e-06, "loss": 0.0459, "step": 4516 }, { "epoch": 2.006218076837664, "grad_norm": 0.6161319347902735, "learning_rate": 5.840227073770754e-06, "loss": 0.0629, "step": 4517 }, { "epoch": 2.006662225183211, "grad_norm": 0.3511248203446856, "learning_rate": 5.838316041358319e-06, "loss": 0.0295, "step": 4518 }, { "epoch": 2.0071063735287584, "grad_norm": 0.37830736549229443, "learning_rate": 5.836404882932744e-06, "loss": 0.0319, "step": 4519 }, { "epoch": 2.007550521874306, "grad_norm": 0.6333845141434062, "learning_rate": 5.8344935987813045e-06, "loss": 0.0379, "step": 4520 }, { "epoch": 2.0079946702198535, "grad_norm": 0.5166443794796353, "learning_rate": 5.832582189191304e-06, "loss": 0.0346, "step": 4521 }, { "epoch": 2.0084388185654007, "grad_norm": 0.4317191740162663, "learning_rate": 5.8306706544500544e-06, "loss": 0.0319, "step": 4522 }, { "epoch": 2.0088829669109485, "grad_norm": 0.4388602499538576, "learning_rate": 5.828758994844896e-06, "loss": 0.0377, "step": 4523 }, { "epoch": 2.0093271152564958, "grad_norm": 0.5946947521373457, "learning_rate": 5.826847210663184e-06, "loss": 0.0421, "step": 4524 }, { "epoch": 2.009771263602043, "grad_norm": 0.3821952135869883, "learning_rate": 5.8249353021922895e-06, "loss": 0.0303, "step": 4525 }, { "epoch": 2.0102154119475903, "grad_norm": 0.5519322372289749, "learning_rate": 5.823023269719606e-06, "loss": 0.0513, "step": 4526 }, { "epoch": 2.010659560293138, "grad_norm": 0.6313520947120921, "learning_rate": 5.821111113532545e-06, "loss": 0.0484, "step": 4527 }, { "epoch": 2.0111037086386854, "grad_norm": 0.5368562306090153, "learning_rate": 5.819198833918533e-06, "loss": 0.051, "step": 4528 }, { "epoch": 2.0115478569842327, "grad_norm": 0.39624325698963303, "learning_rate": 5.817286431165024e-06, "loss": 0.0333, "step": 4529 }, { "epoch": 2.01199200532978, "grad_norm": 0.411233856841024, "learning_rate": 5.815373905559478e-06, "loss": 0.0312, "step": 4530 }, { "epoch": 2.0124361536753277, "grad_norm": 0.3742461123258978, "learning_rate": 5.813461257389384e-06, "loss": 0.0268, "step": 4531 }, { "epoch": 2.012880302020875, "grad_norm": 0.3749818385865246, "learning_rate": 5.811548486942246e-06, "loss": 0.0346, "step": 4532 }, { "epoch": 2.0133244503664223, "grad_norm": 0.34116601308623506, "learning_rate": 5.809635594505585e-06, "loss": 0.0251, "step": 4533 }, { "epoch": 2.01376859871197, "grad_norm": 0.5178521236398543, "learning_rate": 5.807722580366939e-06, "loss": 0.0381, "step": 4534 }, { "epoch": 2.0142127470575173, "grad_norm": 0.32707032102652794, "learning_rate": 5.805809444813869e-06, "loss": 0.0275, "step": 4535 }, { "epoch": 2.0146568954030646, "grad_norm": 0.4024749636693741, "learning_rate": 5.80389618813395e-06, "loss": 0.0365, "step": 4536 }, { "epoch": 2.015101043748612, "grad_norm": 0.3506397945464826, "learning_rate": 5.8019828106147805e-06, "loss": 0.0252, "step": 4537 }, { "epoch": 2.0155451920941596, "grad_norm": 0.5822851446048461, "learning_rate": 5.80006931254397e-06, "loss": 0.0369, "step": 4538 }, { "epoch": 2.015989340439707, "grad_norm": 0.44067833638988113, "learning_rate": 5.798155694209151e-06, "loss": 0.0344, "step": 4539 }, { "epoch": 2.016433488785254, "grad_norm": 0.4460004586157456, "learning_rate": 5.796241955897972e-06, "loss": 0.0381, "step": 4540 }, { "epoch": 2.0168776371308015, "grad_norm": 0.3328661208832795, "learning_rate": 5.7943280978981034e-06, "loss": 0.0284, "step": 4541 }, { "epoch": 2.017321785476349, "grad_norm": 0.48357435513498903, "learning_rate": 5.792414120497227e-06, "loss": 0.0361, "step": 4542 }, { "epoch": 2.0177659338218965, "grad_norm": 0.4866683235779641, "learning_rate": 5.790500023983049e-06, "loss": 0.0359, "step": 4543 }, { "epoch": 2.018210082167444, "grad_norm": 0.39359993621118305, "learning_rate": 5.788585808643287e-06, "loss": 0.0281, "step": 4544 }, { "epoch": 2.0186542305129915, "grad_norm": 0.4058975435223384, "learning_rate": 5.786671474765683e-06, "loss": 0.0314, "step": 4545 }, { "epoch": 2.019098378858539, "grad_norm": 0.38121689511830104, "learning_rate": 5.784757022637993e-06, "loss": 0.0291, "step": 4546 }, { "epoch": 2.019542527204086, "grad_norm": 0.4508375438456433, "learning_rate": 5.782842452547992e-06, "loss": 0.0334, "step": 4547 }, { "epoch": 2.0199866755496334, "grad_norm": 0.42673347363830266, "learning_rate": 5.780927764783473e-06, "loss": 0.0249, "step": 4548 }, { "epoch": 2.020430823895181, "grad_norm": 0.43669165432209434, "learning_rate": 5.779012959632244e-06, "loss": 0.0358, "step": 4549 }, { "epoch": 2.0208749722407284, "grad_norm": 0.4022820994220209, "learning_rate": 5.777098037382135e-06, "loss": 0.0293, "step": 4550 }, { "epoch": 2.0213191205862757, "grad_norm": 0.42387266543810437, "learning_rate": 5.77518299832099e-06, "loss": 0.0384, "step": 4551 }, { "epoch": 2.021763268931823, "grad_norm": 0.4585217445539665, "learning_rate": 5.7732678427366725e-06, "loss": 0.0349, "step": 4552 }, { "epoch": 2.0222074172773707, "grad_norm": 0.44358027335422784, "learning_rate": 5.771352570917062e-06, "loss": 0.0332, "step": 4553 }, { "epoch": 2.022651565622918, "grad_norm": 0.43507443913798416, "learning_rate": 5.769437183150057e-06, "loss": 0.0347, "step": 4554 }, { "epoch": 2.0230957139684653, "grad_norm": 0.37205140444853446, "learning_rate": 5.767521679723574e-06, "loss": 0.029, "step": 4555 }, { "epoch": 2.023539862314013, "grad_norm": 0.3603917915607534, "learning_rate": 5.765606060925545e-06, "loss": 0.0309, "step": 4556 }, { "epoch": 2.0239840106595604, "grad_norm": 0.5400074623803027, "learning_rate": 5.763690327043919e-06, "loss": 0.0459, "step": 4557 }, { "epoch": 2.0244281590051076, "grad_norm": 0.39760305609807395, "learning_rate": 5.761774478366664e-06, "loss": 0.0338, "step": 4558 }, { "epoch": 2.024872307350655, "grad_norm": 0.44010489846206996, "learning_rate": 5.759858515181763e-06, "loss": 0.0305, "step": 4559 }, { "epoch": 2.0253164556962027, "grad_norm": 0.38439690133076265, "learning_rate": 5.757942437777222e-06, "loss": 0.0281, "step": 4560 }, { "epoch": 2.02576060404175, "grad_norm": 0.36756874447130394, "learning_rate": 5.756026246441056e-06, "loss": 0.0276, "step": 4561 }, { "epoch": 2.0262047523872972, "grad_norm": 0.3543127710037651, "learning_rate": 5.754109941461302e-06, "loss": 0.0304, "step": 4562 }, { "epoch": 2.026648900732845, "grad_norm": 0.4789471062184959, "learning_rate": 5.7521935231260166e-06, "loss": 0.0361, "step": 4563 }, { "epoch": 2.0270930490783923, "grad_norm": 0.4658261130750723, "learning_rate": 5.7502769917232635e-06, "loss": 0.0297, "step": 4564 }, { "epoch": 2.0275371974239396, "grad_norm": 0.37364282464180315, "learning_rate": 5.748360347541136e-06, "loss": 0.0301, "step": 4565 }, { "epoch": 2.027981345769487, "grad_norm": 0.5624254377961666, "learning_rate": 5.746443590867735e-06, "loss": 0.0375, "step": 4566 }, { "epoch": 2.0284254941150346, "grad_norm": 0.4305222164333765, "learning_rate": 5.7445267219911815e-06, "loss": 0.033, "step": 4567 }, { "epoch": 2.028869642460582, "grad_norm": 0.669501618559028, "learning_rate": 5.742609741199615e-06, "loss": 0.0497, "step": 4568 }, { "epoch": 2.029313790806129, "grad_norm": 0.38137453136293886, "learning_rate": 5.740692648781191e-06, "loss": 0.0377, "step": 4569 }, { "epoch": 2.0297579391516765, "grad_norm": 0.4610447553595522, "learning_rate": 5.738775445024078e-06, "loss": 0.0394, "step": 4570 }, { "epoch": 2.030202087497224, "grad_norm": 0.46477609561023486, "learning_rate": 5.736858130216468e-06, "loss": 0.044, "step": 4571 }, { "epoch": 2.0306462358427715, "grad_norm": 0.34915412378821464, "learning_rate": 5.7349407046465625e-06, "loss": 0.0234, "step": 4572 }, { "epoch": 2.0310903841883188, "grad_norm": 0.47023014530194523, "learning_rate": 5.733023168602584e-06, "loss": 0.0427, "step": 4573 }, { "epoch": 2.0315345325338665, "grad_norm": 0.5052588063022365, "learning_rate": 5.731105522372773e-06, "loss": 0.0406, "step": 4574 }, { "epoch": 2.031978680879414, "grad_norm": 0.5345856053201472, "learning_rate": 5.729187766245382e-06, "loss": 0.0347, "step": 4575 }, { "epoch": 2.032422829224961, "grad_norm": 0.41581297859158317, "learning_rate": 5.727269900508682e-06, "loss": 0.0366, "step": 4576 }, { "epoch": 2.0328669775705084, "grad_norm": 0.5119888393853211, "learning_rate": 5.725351925450964e-06, "loss": 0.0372, "step": 4577 }, { "epoch": 2.033311125916056, "grad_norm": 0.4247836793476431, "learning_rate": 5.723433841360528e-06, "loss": 0.0259, "step": 4578 }, { "epoch": 2.0337552742616034, "grad_norm": 0.3959142952189294, "learning_rate": 5.721515648525698e-06, "loss": 0.0358, "step": 4579 }, { "epoch": 2.0341994226071507, "grad_norm": 0.4826767145510797, "learning_rate": 5.719597347234809e-06, "loss": 0.0404, "step": 4580 }, { "epoch": 2.034643570952698, "grad_norm": 0.4091839747164647, "learning_rate": 5.7176789377762155e-06, "loss": 0.0421, "step": 4581 }, { "epoch": 2.0350877192982457, "grad_norm": 0.41330812118932564, "learning_rate": 5.715760420438284e-06, "loss": 0.025, "step": 4582 }, { "epoch": 2.035531867643793, "grad_norm": 0.3786001612535725, "learning_rate": 5.713841795509405e-06, "loss": 0.0304, "step": 4583 }, { "epoch": 2.0359760159893403, "grad_norm": 0.38932682295296067, "learning_rate": 5.711923063277979e-06, "loss": 0.033, "step": 4584 }, { "epoch": 2.036420164334888, "grad_norm": 0.3925654682001415, "learning_rate": 5.710004224032421e-06, "loss": 0.035, "step": 4585 }, { "epoch": 2.0368643126804353, "grad_norm": 0.40837083418560577, "learning_rate": 5.708085278061167e-06, "loss": 0.0232, "step": 4586 }, { "epoch": 2.0373084610259826, "grad_norm": 0.4396699075372347, "learning_rate": 5.706166225652669e-06, "loss": 0.0404, "step": 4587 }, { "epoch": 2.03775260937153, "grad_norm": 0.486841383129979, "learning_rate": 5.704247067095391e-06, "loss": 0.0325, "step": 4588 }, { "epoch": 2.0381967577170776, "grad_norm": 0.4503221450108852, "learning_rate": 5.702327802677815e-06, "loss": 0.0346, "step": 4589 }, { "epoch": 2.038640906062625, "grad_norm": 0.4565221863051935, "learning_rate": 5.70040843268844e-06, "loss": 0.0404, "step": 4590 }, { "epoch": 2.0390850544081722, "grad_norm": 0.39766027067422877, "learning_rate": 5.698488957415782e-06, "loss": 0.0325, "step": 4591 }, { "epoch": 2.03952920275372, "grad_norm": 0.469121812114123, "learning_rate": 5.6965693771483654e-06, "loss": 0.0361, "step": 4592 }, { "epoch": 2.0399733510992673, "grad_norm": 0.34294205911608633, "learning_rate": 5.6946496921747394e-06, "loss": 0.0274, "step": 4593 }, { "epoch": 2.0404174994448145, "grad_norm": 0.45197263089243245, "learning_rate": 5.692729902783467e-06, "loss": 0.0357, "step": 4594 }, { "epoch": 2.040861647790362, "grad_norm": 0.4143752687268431, "learning_rate": 5.6908100092631215e-06, "loss": 0.0325, "step": 4595 }, { "epoch": 2.0413057961359096, "grad_norm": 0.5126979012787386, "learning_rate": 5.688890011902295e-06, "loss": 0.0402, "step": 4596 }, { "epoch": 2.041749944481457, "grad_norm": 0.38129445526321254, "learning_rate": 5.686969910989599e-06, "loss": 0.0247, "step": 4597 }, { "epoch": 2.042194092827004, "grad_norm": 0.43439418379805417, "learning_rate": 5.685049706813657e-06, "loss": 0.0334, "step": 4598 }, { "epoch": 2.0426382411725514, "grad_norm": 0.3528242448836029, "learning_rate": 5.683129399663105e-06, "loss": 0.0237, "step": 4599 }, { "epoch": 2.043082389518099, "grad_norm": 0.4979008424907728, "learning_rate": 5.681208989826601e-06, "loss": 0.0346, "step": 4600 }, { "epoch": 2.0435265378636465, "grad_norm": 0.4344255344249799, "learning_rate": 5.679288477592815e-06, "loss": 0.0288, "step": 4601 }, { "epoch": 2.0439706862091938, "grad_norm": 0.5875305767400001, "learning_rate": 5.67736786325043e-06, "loss": 0.0336, "step": 4602 }, { "epoch": 2.0444148345547415, "grad_norm": 0.6007666135225987, "learning_rate": 5.675447147088148e-06, "loss": 0.0337, "step": 4603 }, { "epoch": 2.044858982900289, "grad_norm": 0.3501303227446166, "learning_rate": 5.673526329394688e-06, "loss": 0.0316, "step": 4604 }, { "epoch": 2.045303131245836, "grad_norm": 0.5466639051174087, "learning_rate": 5.6716054104587784e-06, "loss": 0.0513, "step": 4605 }, { "epoch": 2.0457472795913834, "grad_norm": 0.3453198669378243, "learning_rate": 5.669684390569167e-06, "loss": 0.0292, "step": 4606 }, { "epoch": 2.046191427936931, "grad_norm": 0.3855523349827862, "learning_rate": 5.667763270014616e-06, "loss": 0.0274, "step": 4607 }, { "epoch": 2.0466355762824784, "grad_norm": 0.34309904151982246, "learning_rate": 5.665842049083902e-06, "loss": 0.0264, "step": 4608 }, { "epoch": 2.0470797246280257, "grad_norm": 0.7952938969316026, "learning_rate": 5.6639207280658194e-06, "loss": 0.0534, "step": 4609 }, { "epoch": 2.047523872973573, "grad_norm": 0.3602547692842435, "learning_rate": 5.6619993072491694e-06, "loss": 0.027, "step": 4610 }, { "epoch": 2.0479680213191207, "grad_norm": 0.33009638203411035, "learning_rate": 5.6600777869227805e-06, "loss": 0.0267, "step": 4611 }, { "epoch": 2.048412169664668, "grad_norm": 0.43117489579425794, "learning_rate": 5.658156167375488e-06, "loss": 0.038, "step": 4612 }, { "epoch": 2.0488563180102153, "grad_norm": 0.7172674302994839, "learning_rate": 5.656234448896142e-06, "loss": 0.0461, "step": 4613 }, { "epoch": 2.049300466355763, "grad_norm": 0.4507173154420471, "learning_rate": 5.654312631773612e-06, "loss": 0.027, "step": 4614 }, { "epoch": 2.0497446147013103, "grad_norm": 0.6398319387217463, "learning_rate": 5.652390716296778e-06, "loss": 0.041, "step": 4615 }, { "epoch": 2.0501887630468576, "grad_norm": 0.43458565414603195, "learning_rate": 5.650468702754537e-06, "loss": 0.0475, "step": 4616 }, { "epoch": 2.050632911392405, "grad_norm": 0.4724564626628951, "learning_rate": 5.6485465914358005e-06, "loss": 0.039, "step": 4617 }, { "epoch": 2.0510770597379526, "grad_norm": 0.4813725140216776, "learning_rate": 5.646624382629495e-06, "loss": 0.0442, "step": 4618 }, { "epoch": 2.0515212080835, "grad_norm": 0.422057899941897, "learning_rate": 5.64470207662456e-06, "loss": 0.0354, "step": 4619 }, { "epoch": 2.051965356429047, "grad_norm": 0.5520244840521104, "learning_rate": 5.6427796737099515e-06, "loss": 0.0393, "step": 4620 }, { "epoch": 2.0524095047745945, "grad_norm": 0.5717238340997483, "learning_rate": 5.64085717417464e-06, "loss": 0.041, "step": 4621 }, { "epoch": 2.0528536531201422, "grad_norm": 0.4162602609657127, "learning_rate": 5.638934578307608e-06, "loss": 0.0233, "step": 4622 }, { "epoch": 2.0532978014656895, "grad_norm": 0.40689474682922516, "learning_rate": 5.637011886397854e-06, "loss": 0.0479, "step": 4623 }, { "epoch": 2.053741949811237, "grad_norm": 0.46291048789069594, "learning_rate": 5.635089098734394e-06, "loss": 0.0345, "step": 4624 }, { "epoch": 2.0541860981567845, "grad_norm": 0.6061112924495777, "learning_rate": 5.633166215606254e-06, "loss": 0.0519, "step": 4625 }, { "epoch": 2.054630246502332, "grad_norm": 0.37385156175737194, "learning_rate": 5.631243237302478e-06, "loss": 0.0432, "step": 4626 }, { "epoch": 2.055074394847879, "grad_norm": 0.32050349885999035, "learning_rate": 5.629320164112116e-06, "loss": 0.031, "step": 4627 }, { "epoch": 2.0555185431934264, "grad_norm": 0.44064173532980155, "learning_rate": 5.627396996324247e-06, "loss": 0.0255, "step": 4628 }, { "epoch": 2.055962691538974, "grad_norm": 0.35873848998988794, "learning_rate": 5.625473734227952e-06, "loss": 0.0352, "step": 4629 }, { "epoch": 2.0564068398845214, "grad_norm": 0.6158412215543486, "learning_rate": 5.623550378112328e-06, "loss": 0.0442, "step": 4630 }, { "epoch": 2.0568509882300687, "grad_norm": 0.4813854059047201, "learning_rate": 5.621626928266489e-06, "loss": 0.0478, "step": 4631 }, { "epoch": 2.0572951365756165, "grad_norm": 0.4790652853444873, "learning_rate": 5.619703384979566e-06, "loss": 0.0463, "step": 4632 }, { "epoch": 2.0577392849211638, "grad_norm": 0.41738179511720797, "learning_rate": 5.617779748540695e-06, "loss": 0.0338, "step": 4633 }, { "epoch": 2.058183433266711, "grad_norm": 0.41477600567937595, "learning_rate": 5.615856019239034e-06, "loss": 0.0361, "step": 4634 }, { "epoch": 2.0586275816122583, "grad_norm": 0.4967975613891594, "learning_rate": 5.613932197363753e-06, "loss": 0.0333, "step": 4635 }, { "epoch": 2.059071729957806, "grad_norm": 0.49458901207002304, "learning_rate": 5.612008283204033e-06, "loss": 0.0427, "step": 4636 }, { "epoch": 2.0595158783033534, "grad_norm": 0.42427512577649273, "learning_rate": 5.610084277049071e-06, "loss": 0.0354, "step": 4637 }, { "epoch": 2.0599600266489007, "grad_norm": 0.43245137715403886, "learning_rate": 5.608160179188079e-06, "loss": 0.0326, "step": 4638 }, { "epoch": 2.060404174994448, "grad_norm": 0.4874634691089771, "learning_rate": 5.6062359899102815e-06, "loss": 0.0371, "step": 4639 }, { "epoch": 2.0608483233399957, "grad_norm": 0.5004987254382077, "learning_rate": 5.604311709504917e-06, "loss": 0.0397, "step": 4640 }, { "epoch": 2.061292471685543, "grad_norm": 0.4751004755592279, "learning_rate": 5.602387338261236e-06, "loss": 0.0414, "step": 4641 }, { "epoch": 2.0617366200310903, "grad_norm": 0.5299506551694105, "learning_rate": 5.600462876468506e-06, "loss": 0.0368, "step": 4642 }, { "epoch": 2.062180768376638, "grad_norm": 0.5312690498368814, "learning_rate": 5.598538324416007e-06, "loss": 0.0449, "step": 4643 }, { "epoch": 2.0626249167221853, "grad_norm": 0.38593169971896735, "learning_rate": 5.5966136823930286e-06, "loss": 0.0281, "step": 4644 }, { "epoch": 2.0630690650677326, "grad_norm": 0.3958642054782115, "learning_rate": 5.594688950688879e-06, "loss": 0.034, "step": 4645 }, { "epoch": 2.06351321341328, "grad_norm": 0.40570614021956714, "learning_rate": 5.592764129592879e-06, "loss": 0.0323, "step": 4646 }, { "epoch": 2.0639573617588276, "grad_norm": 0.45390531771906856, "learning_rate": 5.590839219394361e-06, "loss": 0.0254, "step": 4647 }, { "epoch": 2.064401510104375, "grad_norm": 0.39108513738732803, "learning_rate": 5.58891422038267e-06, "loss": 0.0278, "step": 4648 }, { "epoch": 2.064845658449922, "grad_norm": 0.5223612545370472, "learning_rate": 5.58698913284717e-06, "loss": 0.0367, "step": 4649 }, { "epoch": 2.0652898067954695, "grad_norm": 0.48567914535043527, "learning_rate": 5.585063957077231e-06, "loss": 0.0303, "step": 4650 }, { "epoch": 2.065733955141017, "grad_norm": 0.3022843285079369, "learning_rate": 5.583138693362241e-06, "loss": 0.0268, "step": 4651 }, { "epoch": 2.0661781034865645, "grad_norm": 0.3626425665807493, "learning_rate": 5.5812133419916e-06, "loss": 0.0282, "step": 4652 }, { "epoch": 2.066622251832112, "grad_norm": 0.4001866579471354, "learning_rate": 5.5792879032547205e-06, "loss": 0.0346, "step": 4653 }, { "epoch": 2.0670664001776595, "grad_norm": 0.4654295875105734, "learning_rate": 5.577362377441029e-06, "loss": 0.0348, "step": 4654 }, { "epoch": 2.067510548523207, "grad_norm": 0.39677357569841354, "learning_rate": 5.5754367648399644e-06, "loss": 0.0274, "step": 4655 }, { "epoch": 2.067954696868754, "grad_norm": 0.4143564936380847, "learning_rate": 5.5735110657409775e-06, "loss": 0.0427, "step": 4656 }, { "epoch": 2.0683988452143014, "grad_norm": 0.3681145367593844, "learning_rate": 5.571585280433537e-06, "loss": 0.0321, "step": 4657 }, { "epoch": 2.068842993559849, "grad_norm": 0.46007574827334824, "learning_rate": 5.569659409207119e-06, "loss": 0.0324, "step": 4658 }, { "epoch": 2.0692871419053964, "grad_norm": 0.4377445107919845, "learning_rate": 5.567733452351214e-06, "loss": 0.0364, "step": 4659 }, { "epoch": 2.0697312902509437, "grad_norm": 0.5314428707092943, "learning_rate": 5.565807410155329e-06, "loss": 0.039, "step": 4660 }, { "epoch": 2.0701754385964914, "grad_norm": 0.4445997114256765, "learning_rate": 5.563881282908976e-06, "loss": 0.0451, "step": 4661 }, { "epoch": 2.0706195869420387, "grad_norm": 0.41675191455829885, "learning_rate": 5.561955070901689e-06, "loss": 0.032, "step": 4662 }, { "epoch": 2.071063735287586, "grad_norm": 0.49984516013690966, "learning_rate": 5.56002877442301e-06, "loss": 0.0379, "step": 4663 }, { "epoch": 2.0715078836331333, "grad_norm": 0.48011634971976885, "learning_rate": 5.558102393762491e-06, "loss": 0.0289, "step": 4664 }, { "epoch": 2.071952031978681, "grad_norm": 0.44249526000612904, "learning_rate": 5.556175929209703e-06, "loss": 0.0254, "step": 4665 }, { "epoch": 2.0723961803242283, "grad_norm": 0.40597832272770745, "learning_rate": 5.554249381054224e-06, "loss": 0.0269, "step": 4666 }, { "epoch": 2.0728403286697756, "grad_norm": 0.45148588795717426, "learning_rate": 5.552322749585649e-06, "loss": 0.0388, "step": 4667 }, { "epoch": 2.073284477015323, "grad_norm": 0.598994856507402, "learning_rate": 5.550396035093582e-06, "loss": 0.0423, "step": 4668 }, { "epoch": 2.0737286253608707, "grad_norm": 0.5181098142664673, "learning_rate": 5.548469237867642e-06, "loss": 0.0433, "step": 4669 }, { "epoch": 2.074172773706418, "grad_norm": 0.5101400816921863, "learning_rate": 5.546542358197458e-06, "loss": 0.0463, "step": 4670 }, { "epoch": 2.0746169220519652, "grad_norm": 0.5101937458580871, "learning_rate": 5.544615396372673e-06, "loss": 0.0341, "step": 4671 }, { "epoch": 2.075061070397513, "grad_norm": 0.5392434065378148, "learning_rate": 5.542688352682944e-06, "loss": 0.0451, "step": 4672 }, { "epoch": 2.0755052187430603, "grad_norm": 0.6897926097007651, "learning_rate": 5.540761227417934e-06, "loss": 0.0355, "step": 4673 }, { "epoch": 2.0759493670886076, "grad_norm": 0.28051948597694654, "learning_rate": 5.53883402086733e-06, "loss": 0.0243, "step": 4674 }, { "epoch": 2.076393515434155, "grad_norm": 0.793052180191732, "learning_rate": 5.536906733320816e-06, "loss": 0.0563, "step": 4675 }, { "epoch": 2.0768376637797026, "grad_norm": 0.4215318597715739, "learning_rate": 5.5349793650681006e-06, "loss": 0.0354, "step": 4676 }, { "epoch": 2.07728181212525, "grad_norm": 0.34425051368314813, "learning_rate": 5.533051916398899e-06, "loss": 0.0295, "step": 4677 }, { "epoch": 2.077725960470797, "grad_norm": 0.6685404197751216, "learning_rate": 5.531124387602938e-06, "loss": 0.0399, "step": 4678 }, { "epoch": 2.0781701088163445, "grad_norm": 0.31141908709280325, "learning_rate": 5.529196778969961e-06, "loss": 0.0271, "step": 4679 }, { "epoch": 2.078614257161892, "grad_norm": 0.4606806580907262, "learning_rate": 5.527269090789718e-06, "loss": 0.0394, "step": 4680 }, { "epoch": 2.0790584055074395, "grad_norm": 0.4252999601959849, "learning_rate": 5.525341323351975e-06, "loss": 0.0394, "step": 4681 }, { "epoch": 2.0795025538529868, "grad_norm": 0.5409739246288477, "learning_rate": 5.5234134769465065e-06, "loss": 0.0521, "step": 4682 }, { "epoch": 2.0799467021985345, "grad_norm": 0.4948477331543779, "learning_rate": 5.5214855518631005e-06, "loss": 0.0485, "step": 4683 }, { "epoch": 2.080390850544082, "grad_norm": 0.3543340984392336, "learning_rate": 5.519557548391557e-06, "loss": 0.0242, "step": 4684 }, { "epoch": 2.080834998889629, "grad_norm": 0.3751926442365401, "learning_rate": 5.517629466821691e-06, "loss": 0.036, "step": 4685 }, { "epoch": 2.0812791472351764, "grad_norm": 0.4117924266399266, "learning_rate": 5.515701307443321e-06, "loss": 0.0485, "step": 4686 }, { "epoch": 2.081723295580724, "grad_norm": 0.47251260035381365, "learning_rate": 5.513773070546284e-06, "loss": 0.0419, "step": 4687 }, { "epoch": 2.0821674439262714, "grad_norm": 0.6076861002224307, "learning_rate": 5.5118447564204295e-06, "loss": 0.0391, "step": 4688 }, { "epoch": 2.0826115922718187, "grad_norm": 0.4792646073001442, "learning_rate": 5.50991636535561e-06, "loss": 0.0295, "step": 4689 }, { "epoch": 2.0830557406173664, "grad_norm": 0.32440405189073857, "learning_rate": 5.5079878976417e-06, "loss": 0.022, "step": 4690 }, { "epoch": 2.0834998889629137, "grad_norm": 0.4221580236556979, "learning_rate": 5.506059353568581e-06, "loss": 0.0263, "step": 4691 }, { "epoch": 2.083944037308461, "grad_norm": 0.35561395384075745, "learning_rate": 5.504130733426145e-06, "loss": 0.0273, "step": 4692 }, { "epoch": 2.0843881856540083, "grad_norm": 0.4935918433503542, "learning_rate": 5.502202037504293e-06, "loss": 0.0328, "step": 4693 }, { "epoch": 2.084832333999556, "grad_norm": 0.439030655630297, "learning_rate": 5.500273266092947e-06, "loss": 0.0278, "step": 4694 }, { "epoch": 2.0852764823451033, "grad_norm": 0.5492104667996629, "learning_rate": 5.49834441948203e-06, "loss": 0.0477, "step": 4695 }, { "epoch": 2.0857206306906506, "grad_norm": 0.48997887857295236, "learning_rate": 5.496415497961482e-06, "loss": 0.036, "step": 4696 }, { "epoch": 2.086164779036198, "grad_norm": 0.9251501688761363, "learning_rate": 5.49448650182125e-06, "loss": 0.0543, "step": 4697 }, { "epoch": 2.0866089273817456, "grad_norm": 0.44096687951153934, "learning_rate": 5.492557431351298e-06, "loss": 0.0372, "step": 4698 }, { "epoch": 2.087053075727293, "grad_norm": 0.48261463177180974, "learning_rate": 5.4906282868415974e-06, "loss": 0.0323, "step": 4699 }, { "epoch": 2.08749722407284, "grad_norm": 0.44612947746443893, "learning_rate": 5.488699068582129e-06, "loss": 0.0379, "step": 4700 }, { "epoch": 2.087941372418388, "grad_norm": 0.6019236481517227, "learning_rate": 5.486769776862891e-06, "loss": 0.0392, "step": 4701 }, { "epoch": 2.0883855207639352, "grad_norm": 0.41037964449549674, "learning_rate": 5.484840411973888e-06, "loss": 0.045, "step": 4702 }, { "epoch": 2.0888296691094825, "grad_norm": 0.6139112370737255, "learning_rate": 5.482910974205133e-06, "loss": 0.0434, "step": 4703 }, { "epoch": 2.08927381745503, "grad_norm": 0.47897436644713304, "learning_rate": 5.480981463846655e-06, "loss": 0.0407, "step": 4704 }, { "epoch": 2.0897179658005776, "grad_norm": 0.4257961843862624, "learning_rate": 5.479051881188494e-06, "loss": 0.0402, "step": 4705 }, { "epoch": 2.090162114146125, "grad_norm": 0.4458277075765739, "learning_rate": 5.477122226520698e-06, "loss": 0.0335, "step": 4706 }, { "epoch": 2.090606262491672, "grad_norm": 0.5283109987423077, "learning_rate": 5.475192500133324e-06, "loss": 0.0321, "step": 4707 }, { "epoch": 2.0910504108372194, "grad_norm": 0.6005581697681983, "learning_rate": 5.473262702316447e-06, "loss": 0.0369, "step": 4708 }, { "epoch": 2.091494559182767, "grad_norm": 0.40161109082786944, "learning_rate": 5.471332833360147e-06, "loss": 0.0289, "step": 4709 }, { "epoch": 2.0919387075283145, "grad_norm": 0.4371088945729004, "learning_rate": 5.4694028935545126e-06, "loss": 0.0343, "step": 4710 }, { "epoch": 2.0923828558738617, "grad_norm": 1.1703243713580254, "learning_rate": 5.467472883189653e-06, "loss": 0.0507, "step": 4711 }, { "epoch": 2.0928270042194095, "grad_norm": 0.40364633007175293, "learning_rate": 5.465542802555677e-06, "loss": 0.0295, "step": 4712 }, { "epoch": 2.0932711525649568, "grad_norm": 0.46666883180291846, "learning_rate": 5.4636126519427095e-06, "loss": 0.0412, "step": 4713 }, { "epoch": 2.093715300910504, "grad_norm": 0.4145958391325037, "learning_rate": 5.461682431640885e-06, "loss": 0.0319, "step": 4714 }, { "epoch": 2.0941594492560514, "grad_norm": 0.3217656626768428, "learning_rate": 5.459752141940347e-06, "loss": 0.0318, "step": 4715 }, { "epoch": 2.094603597601599, "grad_norm": 0.5490857014126957, "learning_rate": 5.457821783131254e-06, "loss": 0.046, "step": 4716 }, { "epoch": 2.0950477459471464, "grad_norm": 0.4425046183099989, "learning_rate": 5.455891355503768e-06, "loss": 0.0299, "step": 4717 }, { "epoch": 2.0954918942926937, "grad_norm": 0.6349791543930909, "learning_rate": 5.453960859348069e-06, "loss": 0.0565, "step": 4718 }, { "epoch": 2.095936042638241, "grad_norm": 0.4072328187100098, "learning_rate": 5.4520302949543415e-06, "loss": 0.0349, "step": 4719 }, { "epoch": 2.0963801909837887, "grad_norm": 0.3811118913667539, "learning_rate": 5.450099662612781e-06, "loss": 0.0305, "step": 4720 }, { "epoch": 2.096824339329336, "grad_norm": 0.4774255455908721, "learning_rate": 5.448168962613596e-06, "loss": 0.0368, "step": 4721 }, { "epoch": 2.0972684876748833, "grad_norm": 0.32813215206793156, "learning_rate": 5.446238195247003e-06, "loss": 0.0321, "step": 4722 }, { "epoch": 2.097712636020431, "grad_norm": 0.5512782021205551, "learning_rate": 5.44430736080323e-06, "loss": 0.0387, "step": 4723 }, { "epoch": 2.0981567843659783, "grad_norm": 0.5835802051010996, "learning_rate": 5.44237645957251e-06, "loss": 0.0354, "step": 4724 }, { "epoch": 2.0986009327115256, "grad_norm": 0.3806252215797266, "learning_rate": 5.440445491845095e-06, "loss": 0.0363, "step": 4725 }, { "epoch": 2.099045081057073, "grad_norm": 0.42251102762378473, "learning_rate": 5.438514457911241e-06, "loss": 0.0386, "step": 4726 }, { "epoch": 2.0994892294026206, "grad_norm": 0.35748316520246215, "learning_rate": 5.436583358061215e-06, "loss": 0.0282, "step": 4727 }, { "epoch": 2.099933377748168, "grad_norm": 0.46907671848937027, "learning_rate": 5.434652192585294e-06, "loss": 0.0411, "step": 4728 }, { "epoch": 2.100377526093715, "grad_norm": 0.41478959673973786, "learning_rate": 5.432720961773765e-06, "loss": 0.0323, "step": 4729 }, { "epoch": 2.1008216744392625, "grad_norm": 0.6441994593730457, "learning_rate": 5.430789665916925e-06, "loss": 0.0477, "step": 4730 }, { "epoch": 2.1012658227848102, "grad_norm": 0.7936242646685359, "learning_rate": 5.428858305305079e-06, "loss": 0.0438, "step": 4731 }, { "epoch": 2.1017099711303575, "grad_norm": 0.4542564531841097, "learning_rate": 5.426926880228547e-06, "loss": 0.039, "step": 4732 }, { "epoch": 2.102154119475905, "grad_norm": 0.43879296755150776, "learning_rate": 5.424995390977651e-06, "loss": 0.024, "step": 4733 }, { "epoch": 2.1025982678214525, "grad_norm": 0.4720404728255635, "learning_rate": 5.423063837842728e-06, "loss": 0.0366, "step": 4734 }, { "epoch": 2.103042416167, "grad_norm": 0.3210152022294204, "learning_rate": 5.421132221114124e-06, "loss": 0.0287, "step": 4735 }, { "epoch": 2.103486564512547, "grad_norm": 0.8156671041659774, "learning_rate": 5.419200541082194e-06, "loss": 0.0423, "step": 4736 }, { "epoch": 2.1039307128580944, "grad_norm": 0.5417603660166322, "learning_rate": 5.417268798037303e-06, "loss": 0.0388, "step": 4737 }, { "epoch": 2.104374861203642, "grad_norm": 0.46205560016247954, "learning_rate": 5.415336992269821e-06, "loss": 0.0363, "step": 4738 }, { "epoch": 2.1048190095491894, "grad_norm": 0.3348835489607936, "learning_rate": 5.413405124070134e-06, "loss": 0.0206, "step": 4739 }, { "epoch": 2.1052631578947367, "grad_norm": 0.46193255668613026, "learning_rate": 5.411473193728636e-06, "loss": 0.0295, "step": 4740 }, { "epoch": 2.1057073062402845, "grad_norm": 0.4346087883354204, "learning_rate": 5.409541201535727e-06, "loss": 0.039, "step": 4741 }, { "epoch": 2.1061514545858318, "grad_norm": 0.46994212871132296, "learning_rate": 5.407609147781816e-06, "loss": 0.0374, "step": 4742 }, { "epoch": 2.106595602931379, "grad_norm": 0.4605573678630878, "learning_rate": 5.405677032757329e-06, "loss": 0.0371, "step": 4743 }, { "epoch": 2.1070397512769263, "grad_norm": 0.6425519607388686, "learning_rate": 5.403744856752691e-06, "loss": 0.0337, "step": 4744 }, { "epoch": 2.107483899622474, "grad_norm": 0.34739298130141844, "learning_rate": 5.401812620058343e-06, "loss": 0.0269, "step": 4745 }, { "epoch": 2.1079280479680214, "grad_norm": 0.4490417625063517, "learning_rate": 5.399880322964733e-06, "loss": 0.0372, "step": 4746 }, { "epoch": 2.1083721963135686, "grad_norm": 0.6489786195301251, "learning_rate": 5.397947965762317e-06, "loss": 0.0509, "step": 4747 }, { "epoch": 2.108816344659116, "grad_norm": 0.326521120100658, "learning_rate": 5.396015548741562e-06, "loss": 0.0288, "step": 4748 }, { "epoch": 2.1092604930046637, "grad_norm": 0.4078407477455278, "learning_rate": 5.394083072192944e-06, "loss": 0.0382, "step": 4749 }, { "epoch": 2.109704641350211, "grad_norm": 0.5177065744163067, "learning_rate": 5.392150536406945e-06, "loss": 0.0449, "step": 4750 }, { "epoch": 2.1101487896957583, "grad_norm": 0.33995975799200945, "learning_rate": 5.39021794167406e-06, "loss": 0.0264, "step": 4751 }, { "epoch": 2.110592938041306, "grad_norm": 0.42003616424190965, "learning_rate": 5.388285288284787e-06, "loss": 0.0372, "step": 4752 }, { "epoch": 2.1110370863868533, "grad_norm": 0.3472432854029145, "learning_rate": 5.386352576529641e-06, "loss": 0.0307, "step": 4753 }, { "epoch": 2.1114812347324006, "grad_norm": 0.4130591691843907, "learning_rate": 5.384419806699141e-06, "loss": 0.0276, "step": 4754 }, { "epoch": 2.111925383077948, "grad_norm": 0.4591952950693401, "learning_rate": 5.382486979083812e-06, "loss": 0.0363, "step": 4755 }, { "epoch": 2.1123695314234956, "grad_norm": 0.3857971037693101, "learning_rate": 5.380554093974193e-06, "loss": 0.0359, "step": 4756 }, { "epoch": 2.112813679769043, "grad_norm": 0.37782187453237637, "learning_rate": 5.37862115166083e-06, "loss": 0.0358, "step": 4757 }, { "epoch": 2.11325782811459, "grad_norm": 0.3395886410665437, "learning_rate": 5.376688152434275e-06, "loss": 0.0243, "step": 4758 }, { "epoch": 2.1137019764601375, "grad_norm": 0.4470609752189812, "learning_rate": 5.374755096585093e-06, "loss": 0.0246, "step": 4759 }, { "epoch": 2.114146124805685, "grad_norm": 0.4371584818717651, "learning_rate": 5.372821984403854e-06, "loss": 0.028, "step": 4760 }, { "epoch": 2.1145902731512325, "grad_norm": 0.49305971027495465, "learning_rate": 5.370888816181138e-06, "loss": 0.0362, "step": 4761 }, { "epoch": 2.11503442149678, "grad_norm": 0.4175857188422132, "learning_rate": 5.368955592207531e-06, "loss": 0.033, "step": 4762 }, { "epoch": 2.1154785698423275, "grad_norm": 0.5139037264567338, "learning_rate": 5.367022312773633e-06, "loss": 0.0323, "step": 4763 }, { "epoch": 2.115922718187875, "grad_norm": 0.8385325013800425, "learning_rate": 5.365088978170045e-06, "loss": 0.0387, "step": 4764 }, { "epoch": 2.116366866533422, "grad_norm": 0.4295996848147971, "learning_rate": 5.363155588687383e-06, "loss": 0.0297, "step": 4765 }, { "epoch": 2.1168110148789694, "grad_norm": 0.5951452068654114, "learning_rate": 5.361222144616267e-06, "loss": 0.0395, "step": 4766 }, { "epoch": 2.117255163224517, "grad_norm": 0.3679591157867813, "learning_rate": 5.359288646247326e-06, "loss": 0.0324, "step": 4767 }, { "epoch": 2.1176993115700644, "grad_norm": 0.351749833643755, "learning_rate": 5.357355093871199e-06, "loss": 0.0262, "step": 4768 }, { "epoch": 2.1181434599156117, "grad_norm": 0.4550233701632367, "learning_rate": 5.355421487778529e-06, "loss": 0.0405, "step": 4769 }, { "epoch": 2.1185876082611594, "grad_norm": 0.5196049007489856, "learning_rate": 5.353487828259973e-06, "loss": 0.0354, "step": 4770 }, { "epoch": 2.1190317566067067, "grad_norm": 0.5003818333286508, "learning_rate": 5.351554115606194e-06, "loss": 0.0346, "step": 4771 }, { "epoch": 2.119475904952254, "grad_norm": 0.48260320470669443, "learning_rate": 5.349620350107857e-06, "loss": 0.0363, "step": 4772 }, { "epoch": 2.1199200532978013, "grad_norm": 0.5154065971255504, "learning_rate": 5.347686532055643e-06, "loss": 0.037, "step": 4773 }, { "epoch": 2.120364201643349, "grad_norm": 0.5641510479972823, "learning_rate": 5.345752661740236e-06, "loss": 0.0418, "step": 4774 }, { "epoch": 2.1208083499888963, "grad_norm": 0.3842072493781642, "learning_rate": 5.343818739452332e-06, "loss": 0.0289, "step": 4775 }, { "epoch": 2.1212524983344436, "grad_norm": 0.48007816649878726, "learning_rate": 5.34188476548263e-06, "loss": 0.0382, "step": 4776 }, { "epoch": 2.121696646679991, "grad_norm": 0.403727363474673, "learning_rate": 5.339950740121842e-06, "loss": 0.0355, "step": 4777 }, { "epoch": 2.1221407950255387, "grad_norm": 0.5060901595070586, "learning_rate": 5.338016663660681e-06, "loss": 0.0372, "step": 4778 }, { "epoch": 2.122584943371086, "grad_norm": 0.991711640962585, "learning_rate": 5.336082536389875e-06, "loss": 0.0575, "step": 4779 }, { "epoch": 2.1230290917166332, "grad_norm": 0.6280177866580242, "learning_rate": 5.334148358600154e-06, "loss": 0.0524, "step": 4780 }, { "epoch": 2.123473240062181, "grad_norm": 0.38877559204447126, "learning_rate": 5.332214130582259e-06, "loss": 0.0252, "step": 4781 }, { "epoch": 2.1239173884077283, "grad_norm": 0.5153735769890214, "learning_rate": 5.330279852626936e-06, "loss": 0.0399, "step": 4782 }, { "epoch": 2.1243615367532755, "grad_norm": 0.4835493880223582, "learning_rate": 5.32834552502494e-06, "loss": 0.0398, "step": 4783 }, { "epoch": 2.124805685098823, "grad_norm": 0.5440156606024599, "learning_rate": 5.326411148067036e-06, "loss": 0.0359, "step": 4784 }, { "epoch": 2.1252498334443706, "grad_norm": 0.47437512421776024, "learning_rate": 5.324476722043991e-06, "loss": 0.0357, "step": 4785 }, { "epoch": 2.125693981789918, "grad_norm": 0.4015404790313753, "learning_rate": 5.322542247246583e-06, "loss": 0.0259, "step": 4786 }, { "epoch": 2.126138130135465, "grad_norm": 0.3327411043647303, "learning_rate": 5.320607723965594e-06, "loss": 0.0288, "step": 4787 }, { "epoch": 2.1265822784810124, "grad_norm": 0.4112477949684857, "learning_rate": 5.318673152491821e-06, "loss": 0.0306, "step": 4788 }, { "epoch": 2.12702642682656, "grad_norm": 0.4352341389187628, "learning_rate": 5.316738533116058e-06, "loss": 0.0294, "step": 4789 }, { "epoch": 2.1274705751721075, "grad_norm": 0.5247331669668354, "learning_rate": 5.314803866129114e-06, "loss": 0.0455, "step": 4790 }, { "epoch": 2.1279147235176548, "grad_norm": 0.5740961919642005, "learning_rate": 5.3128691518218015e-06, "loss": 0.0313, "step": 4791 }, { "epoch": 2.1283588718632025, "grad_norm": 0.3931985647767843, "learning_rate": 5.310934390484939e-06, "loss": 0.0303, "step": 4792 }, { "epoch": 2.12880302020875, "grad_norm": 0.4665172928488037, "learning_rate": 5.308999582409357e-06, "loss": 0.0324, "step": 4793 }, { "epoch": 2.129247168554297, "grad_norm": 0.394722347425358, "learning_rate": 5.307064727885889e-06, "loss": 0.0386, "step": 4794 }, { "epoch": 2.1296913168998444, "grad_norm": 0.3880325853085057, "learning_rate": 5.305129827205375e-06, "loss": 0.0305, "step": 4795 }, { "epoch": 2.130135465245392, "grad_norm": 0.33971233702286885, "learning_rate": 5.303194880658668e-06, "loss": 0.0336, "step": 4796 }, { "epoch": 2.1305796135909394, "grad_norm": 0.5540350723444334, "learning_rate": 5.301259888536616e-06, "loss": 0.0369, "step": 4797 }, { "epoch": 2.1310237619364867, "grad_norm": 0.562001973004502, "learning_rate": 5.299324851130086e-06, "loss": 0.0463, "step": 4798 }, { "epoch": 2.1314679102820344, "grad_norm": 0.40052051170910113, "learning_rate": 5.297389768729949e-06, "loss": 0.0343, "step": 4799 }, { "epoch": 2.1319120586275817, "grad_norm": 0.37582805444109113, "learning_rate": 5.295454641627076e-06, "loss": 0.0279, "step": 4800 }, { "epoch": 2.132356206973129, "grad_norm": 0.556183127195747, "learning_rate": 5.293519470112351e-06, "loss": 0.055, "step": 4801 }, { "epoch": 2.1328003553186763, "grad_norm": 0.43600853079083646, "learning_rate": 5.2915842544766645e-06, "loss": 0.0341, "step": 4802 }, { "epoch": 2.133244503664224, "grad_norm": 0.43024160171177206, "learning_rate": 5.289648995010912e-06, "loss": 0.0311, "step": 4803 }, { "epoch": 2.1336886520097713, "grad_norm": 0.4112510401014529, "learning_rate": 5.287713692005993e-06, "loss": 0.0319, "step": 4804 }, { "epoch": 2.1341328003553186, "grad_norm": 0.4594812201613338, "learning_rate": 5.285778345752821e-06, "loss": 0.0372, "step": 4805 }, { "epoch": 2.134576948700866, "grad_norm": 0.3657072390383706, "learning_rate": 5.2838429565423074e-06, "loss": 0.0283, "step": 4806 }, { "epoch": 2.1350210970464136, "grad_norm": 0.6140456097709794, "learning_rate": 5.281907524665377e-06, "loss": 0.0454, "step": 4807 }, { "epoch": 2.135465245391961, "grad_norm": 0.39045919917216204, "learning_rate": 5.279972050412957e-06, "loss": 0.0378, "step": 4808 }, { "epoch": 2.135909393737508, "grad_norm": 0.3927097404516126, "learning_rate": 5.278036534075981e-06, "loss": 0.0333, "step": 4809 }, { "epoch": 2.136353542083056, "grad_norm": 0.5754764647926375, "learning_rate": 5.276100975945393e-06, "loss": 0.0327, "step": 4810 }, { "epoch": 2.1367976904286032, "grad_norm": 0.9692035662201081, "learning_rate": 5.274165376312136e-06, "loss": 0.0433, "step": 4811 }, { "epoch": 2.1372418387741505, "grad_norm": 0.3898618853951006, "learning_rate": 5.272229735467166e-06, "loss": 0.0301, "step": 4812 }, { "epoch": 2.137685987119698, "grad_norm": 0.5041899434970386, "learning_rate": 5.270294053701442e-06, "loss": 0.0293, "step": 4813 }, { "epoch": 2.1381301354652456, "grad_norm": 0.4402127129653806, "learning_rate": 5.268358331305931e-06, "loss": 0.0321, "step": 4814 }, { "epoch": 2.138574283810793, "grad_norm": 0.45642467547731175, "learning_rate": 5.266422568571604e-06, "loss": 0.0359, "step": 4815 }, { "epoch": 2.13901843215634, "grad_norm": 0.3935008736888044, "learning_rate": 5.264486765789439e-06, "loss": 0.034, "step": 4816 }, { "epoch": 2.1394625805018874, "grad_norm": 0.334289910641262, "learning_rate": 5.262550923250421e-06, "loss": 0.0235, "step": 4817 }, { "epoch": 2.139906728847435, "grad_norm": 0.45188823216686197, "learning_rate": 5.260615041245538e-06, "loss": 0.0293, "step": 4818 }, { "epoch": 2.1403508771929824, "grad_norm": 0.5579134180377057, "learning_rate": 5.25867912006579e-06, "loss": 0.0428, "step": 4819 }, { "epoch": 2.1407950255385297, "grad_norm": 0.33710370134564305, "learning_rate": 5.256743160002174e-06, "loss": 0.0265, "step": 4820 }, { "epoch": 2.1412391738840775, "grad_norm": 0.5496665050765797, "learning_rate": 5.254807161345699e-06, "loss": 0.0482, "step": 4821 }, { "epoch": 2.1416833222296248, "grad_norm": 0.41971477565058973, "learning_rate": 5.2528711243873795e-06, "loss": 0.032, "step": 4822 }, { "epoch": 2.142127470575172, "grad_norm": 0.4604575178968341, "learning_rate": 5.2509350494182365e-06, "loss": 0.0366, "step": 4823 }, { "epoch": 2.1425716189207193, "grad_norm": 0.3714626261982935, "learning_rate": 5.2489989367292916e-06, "loss": 0.0256, "step": 4824 }, { "epoch": 2.143015767266267, "grad_norm": 0.40647254628304297, "learning_rate": 5.247062786611575e-06, "loss": 0.034, "step": 4825 }, { "epoch": 2.1434599156118144, "grad_norm": 0.534798459704081, "learning_rate": 5.245126599356126e-06, "loss": 0.0344, "step": 4826 }, { "epoch": 2.1439040639573617, "grad_norm": 0.3767967274585948, "learning_rate": 5.243190375253987e-06, "loss": 0.0388, "step": 4827 }, { "epoch": 2.1443482123029094, "grad_norm": 0.43416732973374494, "learning_rate": 5.241254114596201e-06, "loss": 0.036, "step": 4828 }, { "epoch": 2.1447923606484567, "grad_norm": 0.37510655690708544, "learning_rate": 5.2393178176738246e-06, "loss": 0.0339, "step": 4829 }, { "epoch": 2.145236508994004, "grad_norm": 0.509295659153276, "learning_rate": 5.237381484777914e-06, "loss": 0.0449, "step": 4830 }, { "epoch": 2.1456806573395513, "grad_norm": 0.43018107465023353, "learning_rate": 5.235445116199536e-06, "loss": 0.0386, "step": 4831 }, { "epoch": 2.146124805685099, "grad_norm": 0.532799480118876, "learning_rate": 5.2335087122297545e-06, "loss": 0.0536, "step": 4832 }, { "epoch": 2.1465689540306463, "grad_norm": 0.3898351411048676, "learning_rate": 5.231572273159649e-06, "loss": 0.0328, "step": 4833 }, { "epoch": 2.1470131023761936, "grad_norm": 0.34769775883890974, "learning_rate": 5.229635799280298e-06, "loss": 0.0297, "step": 4834 }, { "epoch": 2.147457250721741, "grad_norm": 0.4711513587854737, "learning_rate": 5.2276992908827825e-06, "loss": 0.0422, "step": 4835 }, { "epoch": 2.1479013990672886, "grad_norm": 0.405896587319854, "learning_rate": 5.2257627482581985e-06, "loss": 0.033, "step": 4836 }, { "epoch": 2.148345547412836, "grad_norm": 0.3745089869031137, "learning_rate": 5.2238261716976375e-06, "loss": 0.0246, "step": 4837 }, { "epoch": 2.148789695758383, "grad_norm": 0.40688545389600017, "learning_rate": 5.2218895614922e-06, "loss": 0.044, "step": 4838 }, { "epoch": 2.1492338441039305, "grad_norm": 0.48659815613858365, "learning_rate": 5.219952917932993e-06, "loss": 0.0412, "step": 4839 }, { "epoch": 2.149677992449478, "grad_norm": 0.5300858396288847, "learning_rate": 5.218016241311126e-06, "loss": 0.0441, "step": 4840 }, { "epoch": 2.1501221407950255, "grad_norm": 0.5970494126272853, "learning_rate": 5.216079531917714e-06, "loss": 0.0377, "step": 4841 }, { "epoch": 2.150566289140573, "grad_norm": 0.4023665494232216, "learning_rate": 5.2141427900438765e-06, "loss": 0.0261, "step": 4842 }, { "epoch": 2.1510104374861205, "grad_norm": 0.3360515019130085, "learning_rate": 5.212206015980742e-06, "loss": 0.0232, "step": 4843 }, { "epoch": 2.151454585831668, "grad_norm": 0.37156982937954675, "learning_rate": 5.210269210019438e-06, "loss": 0.0313, "step": 4844 }, { "epoch": 2.151898734177215, "grad_norm": 0.43881581226491895, "learning_rate": 5.2083323724511e-06, "loss": 0.0387, "step": 4845 }, { "epoch": 2.1523428825227624, "grad_norm": 0.5133014074927194, "learning_rate": 5.206395503566867e-06, "loss": 0.033, "step": 4846 }, { "epoch": 2.15278703086831, "grad_norm": 0.7393084909457972, "learning_rate": 5.204458603657885e-06, "loss": 0.0451, "step": 4847 }, { "epoch": 2.1532311792138574, "grad_norm": 0.5276996020099548, "learning_rate": 5.2025216730153016e-06, "loss": 0.0244, "step": 4848 }, { "epoch": 2.1536753275594047, "grad_norm": 0.47830343576785345, "learning_rate": 5.200584711930267e-06, "loss": 0.0387, "step": 4849 }, { "epoch": 2.1541194759049525, "grad_norm": 0.5578861068890496, "learning_rate": 5.198647720693948e-06, "loss": 0.0421, "step": 4850 }, { "epoch": 2.1545636242504997, "grad_norm": 0.3280166182279637, "learning_rate": 5.1967106995975e-06, "loss": 0.0233, "step": 4851 }, { "epoch": 2.155007772596047, "grad_norm": 0.5452793800296186, "learning_rate": 5.194773648932092e-06, "loss": 0.0403, "step": 4852 }, { "epoch": 2.1554519209415943, "grad_norm": 0.5130821544913388, "learning_rate": 5.192836568988895e-06, "loss": 0.0307, "step": 4853 }, { "epoch": 2.155896069287142, "grad_norm": 0.33770672647996586, "learning_rate": 5.190899460059088e-06, "loss": 0.0232, "step": 4854 }, { "epoch": 2.1563402176326893, "grad_norm": 0.6783231395478261, "learning_rate": 5.188962322433848e-06, "loss": 0.0359, "step": 4855 }, { "epoch": 2.1567843659782366, "grad_norm": 0.49312348081568724, "learning_rate": 5.187025156404361e-06, "loss": 0.051, "step": 4856 }, { "epoch": 2.1572285143237844, "grad_norm": 0.5188160057025217, "learning_rate": 5.185087962261817e-06, "loss": 0.037, "step": 4857 }, { "epoch": 2.1576726626693317, "grad_norm": 0.4979475713257613, "learning_rate": 5.183150740297407e-06, "loss": 0.0457, "step": 4858 }, { "epoch": 2.158116811014879, "grad_norm": 0.8466081637820392, "learning_rate": 5.181213490802329e-06, "loss": 0.0467, "step": 4859 }, { "epoch": 2.1585609593604262, "grad_norm": 0.4708804574175826, "learning_rate": 5.179276214067788e-06, "loss": 0.0404, "step": 4860 }, { "epoch": 2.159005107705974, "grad_norm": 0.3951062024143596, "learning_rate": 5.1773389103849835e-06, "loss": 0.0262, "step": 4861 }, { "epoch": 2.1594492560515213, "grad_norm": 0.5862820412725568, "learning_rate": 5.175401580045131e-06, "loss": 0.0471, "step": 4862 }, { "epoch": 2.1598934043970686, "grad_norm": 0.3583235113891142, "learning_rate": 5.173464223339438e-06, "loss": 0.0261, "step": 4863 }, { "epoch": 2.160337552742616, "grad_norm": 0.7139304922452345, "learning_rate": 5.171526840559129e-06, "loss": 0.0581, "step": 4864 }, { "epoch": 2.1607817010881636, "grad_norm": 0.41880853503590465, "learning_rate": 5.169589431995421e-06, "loss": 0.0305, "step": 4865 }, { "epoch": 2.161225849433711, "grad_norm": 0.974797863480534, "learning_rate": 5.16765199793954e-06, "loss": 0.0476, "step": 4866 }, { "epoch": 2.161669997779258, "grad_norm": 0.6058306397735719, "learning_rate": 5.165714538682716e-06, "loss": 0.0603, "step": 4867 }, { "epoch": 2.1621141461248055, "grad_norm": 0.6017608849107138, "learning_rate": 5.163777054516182e-06, "loss": 0.0415, "step": 4868 }, { "epoch": 2.162558294470353, "grad_norm": 1.5666330014886598, "learning_rate": 5.161839545731175e-06, "loss": 0.0424, "step": 4869 }, { "epoch": 2.1630024428159005, "grad_norm": 0.4625370484241095, "learning_rate": 5.159902012618933e-06, "loss": 0.0337, "step": 4870 }, { "epoch": 2.1634465911614478, "grad_norm": 0.6306574373250589, "learning_rate": 5.1579644554707054e-06, "loss": 0.057, "step": 4871 }, { "epoch": 2.1638907395069955, "grad_norm": 0.9235120056329936, "learning_rate": 5.156026874577735e-06, "loss": 0.0561, "step": 4872 }, { "epoch": 2.164334887852543, "grad_norm": 0.4148788359576288, "learning_rate": 5.154089270231275e-06, "loss": 0.0365, "step": 4873 }, { "epoch": 2.16477903619809, "grad_norm": 0.5384050767235271, "learning_rate": 5.152151642722582e-06, "loss": 0.0397, "step": 4874 }, { "epoch": 2.1652231845436374, "grad_norm": 0.42398179308700884, "learning_rate": 5.15021399234291e-06, "loss": 0.0244, "step": 4875 }, { "epoch": 2.165667332889185, "grad_norm": 0.4428879862840884, "learning_rate": 5.148276319383525e-06, "loss": 0.0338, "step": 4876 }, { "epoch": 2.1661114812347324, "grad_norm": 0.5151742619891833, "learning_rate": 5.146338624135689e-06, "loss": 0.0429, "step": 4877 }, { "epoch": 2.1665556295802797, "grad_norm": 0.5906956484118873, "learning_rate": 5.144400906890672e-06, "loss": 0.0486, "step": 4878 }, { "epoch": 2.1669997779258274, "grad_norm": 0.5089563089876176, "learning_rate": 5.142463167939748e-06, "loss": 0.0336, "step": 4879 }, { "epoch": 2.1674439262713747, "grad_norm": 0.4990120356759535, "learning_rate": 5.140525407574187e-06, "loss": 0.0397, "step": 4880 }, { "epoch": 2.167888074616922, "grad_norm": 0.47055574764670566, "learning_rate": 5.138587626085271e-06, "loss": 0.0411, "step": 4881 }, { "epoch": 2.1683322229624693, "grad_norm": 0.49103425451011984, "learning_rate": 5.136649823764281e-06, "loss": 0.0422, "step": 4882 }, { "epoch": 2.168776371308017, "grad_norm": 0.44674321466653016, "learning_rate": 5.1347120009025005e-06, "loss": 0.0312, "step": 4883 }, { "epoch": 2.1692205196535643, "grad_norm": 0.2849178715545674, "learning_rate": 5.132774157791218e-06, "loss": 0.0226, "step": 4884 }, { "epoch": 2.1696646679991116, "grad_norm": 0.46152868939886915, "learning_rate": 5.130836294721726e-06, "loss": 0.0364, "step": 4885 }, { "epoch": 2.170108816344659, "grad_norm": 0.43325439116729864, "learning_rate": 5.128898411985315e-06, "loss": 0.0288, "step": 4886 }, { "epoch": 2.1705529646902066, "grad_norm": 0.39479226787289406, "learning_rate": 5.1269605098732825e-06, "loss": 0.0251, "step": 4887 }, { "epoch": 2.170997113035754, "grad_norm": 0.41019425261041265, "learning_rate": 5.12502258867693e-06, "loss": 0.031, "step": 4888 }, { "epoch": 2.1714412613813012, "grad_norm": 0.508850109174841, "learning_rate": 5.123084648687557e-06, "loss": 0.0474, "step": 4889 }, { "epoch": 2.171885409726849, "grad_norm": 0.42681095953850656, "learning_rate": 5.121146690196472e-06, "loss": 0.0244, "step": 4890 }, { "epoch": 2.1723295580723962, "grad_norm": 0.32346458488317453, "learning_rate": 5.1192087134949804e-06, "loss": 0.0264, "step": 4891 }, { "epoch": 2.1727737064179435, "grad_norm": 0.40304790268945045, "learning_rate": 5.1172707188743955e-06, "loss": 0.0264, "step": 4892 }, { "epoch": 2.173217854763491, "grad_norm": 0.5433823527011491, "learning_rate": 5.115332706626028e-06, "loss": 0.0459, "step": 4893 }, { "epoch": 2.1736620031090386, "grad_norm": 0.33519824313183216, "learning_rate": 5.113394677041197e-06, "loss": 0.0289, "step": 4894 }, { "epoch": 2.174106151454586, "grad_norm": 0.399864916190364, "learning_rate": 5.111456630411218e-06, "loss": 0.026, "step": 4895 }, { "epoch": 2.174550299800133, "grad_norm": 0.49817314831150256, "learning_rate": 5.109518567027416e-06, "loss": 0.0341, "step": 4896 }, { "epoch": 2.1749944481456804, "grad_norm": 0.41814385729810516, "learning_rate": 5.107580487181112e-06, "loss": 0.0315, "step": 4897 }, { "epoch": 2.175438596491228, "grad_norm": 0.44895548729597046, "learning_rate": 5.105642391163633e-06, "loss": 0.0403, "step": 4898 }, { "epoch": 2.1758827448367755, "grad_norm": 0.3688472653021723, "learning_rate": 5.10370427926631e-06, "loss": 0.0345, "step": 4899 }, { "epoch": 2.1763268931823228, "grad_norm": 0.4629343116213356, "learning_rate": 5.1017661517804694e-06, "loss": 0.0353, "step": 4900 }, { "epoch": 2.1767710415278705, "grad_norm": 0.4559917864661212, "learning_rate": 5.099828008997448e-06, "loss": 0.0398, "step": 4901 }, { "epoch": 2.1772151898734178, "grad_norm": 0.45480161362407634, "learning_rate": 5.097889851208583e-06, "loss": 0.035, "step": 4902 }, { "epoch": 2.177659338218965, "grad_norm": 0.42473449296712756, "learning_rate": 5.0959516787052085e-06, "loss": 0.0317, "step": 4903 }, { "epoch": 2.1781034865645124, "grad_norm": 0.4952860166933172, "learning_rate": 5.094013491778668e-06, "loss": 0.0443, "step": 4904 }, { "epoch": 2.17854763491006, "grad_norm": 0.4001101623153066, "learning_rate": 5.092075290720302e-06, "loss": 0.0298, "step": 4905 }, { "epoch": 2.1789917832556074, "grad_norm": 0.5225430247317463, "learning_rate": 5.0901370758214565e-06, "loss": 0.0398, "step": 4906 }, { "epoch": 2.1794359316011547, "grad_norm": 0.6146833193819635, "learning_rate": 5.088198847373477e-06, "loss": 0.0464, "step": 4907 }, { "epoch": 2.1798800799467024, "grad_norm": 0.47802919024440194, "learning_rate": 5.086260605667712e-06, "loss": 0.037, "step": 4908 }, { "epoch": 2.1803242282922497, "grad_norm": 0.5368694025591854, "learning_rate": 5.084322350995512e-06, "loss": 0.0488, "step": 4909 }, { "epoch": 2.180768376637797, "grad_norm": 0.32415608827454506, "learning_rate": 5.0823840836482316e-06, "loss": 0.0233, "step": 4910 }, { "epoch": 2.1812125249833443, "grad_norm": 0.4761380556003533, "learning_rate": 5.080445803917225e-06, "loss": 0.0471, "step": 4911 }, { "epoch": 2.181656673328892, "grad_norm": 0.3515304190735633, "learning_rate": 5.078507512093844e-06, "loss": 0.0259, "step": 4912 }, { "epoch": 2.1821008216744393, "grad_norm": 0.6155251528359741, "learning_rate": 5.076569208469454e-06, "loss": 0.0474, "step": 4913 }, { "epoch": 2.1825449700199866, "grad_norm": 0.4580217790122366, "learning_rate": 5.0746308933354105e-06, "loss": 0.0438, "step": 4914 }, { "epoch": 2.182989118365534, "grad_norm": 0.408141050974567, "learning_rate": 5.072692566983074e-06, "loss": 0.0376, "step": 4915 }, { "epoch": 2.1834332667110816, "grad_norm": 0.4622430926137514, "learning_rate": 5.070754229703811e-06, "loss": 0.0325, "step": 4916 }, { "epoch": 2.183877415056629, "grad_norm": 0.49891927431787836, "learning_rate": 5.068815881788986e-06, "loss": 0.0383, "step": 4917 }, { "epoch": 2.184321563402176, "grad_norm": 0.3944468340370449, "learning_rate": 5.0668775235299636e-06, "loss": 0.0364, "step": 4918 }, { "epoch": 2.184765711747724, "grad_norm": 0.5143753892518631, "learning_rate": 5.064939155218115e-06, "loss": 0.0327, "step": 4919 }, { "epoch": 2.1852098600932712, "grad_norm": 0.4613419832071711, "learning_rate": 5.0630007771448064e-06, "loss": 0.0365, "step": 4920 }, { "epoch": 2.1856540084388185, "grad_norm": 0.42353949789442463, "learning_rate": 5.061062389601413e-06, "loss": 0.0305, "step": 4921 }, { "epoch": 2.186098156784366, "grad_norm": 0.4615529011211128, "learning_rate": 5.059123992879303e-06, "loss": 0.0363, "step": 4922 }, { "epoch": 2.1865423051299135, "grad_norm": 0.38154251403565953, "learning_rate": 5.057185587269854e-06, "loss": 0.0237, "step": 4923 }, { "epoch": 2.186986453475461, "grad_norm": 0.45850860345485767, "learning_rate": 5.05524717306444e-06, "loss": 0.0309, "step": 4924 }, { "epoch": 2.187430601821008, "grad_norm": 0.3296860610369256, "learning_rate": 5.053308750554437e-06, "loss": 0.0311, "step": 4925 }, { "epoch": 2.1878747501665554, "grad_norm": 0.3804383024783399, "learning_rate": 5.051370320031221e-06, "loss": 0.0337, "step": 4926 }, { "epoch": 2.188318898512103, "grad_norm": 0.44377911056623215, "learning_rate": 5.049431881786176e-06, "loss": 0.0451, "step": 4927 }, { "epoch": 2.1887630468576504, "grad_norm": 0.3897615063112148, "learning_rate": 5.04749343611068e-06, "loss": 0.0264, "step": 4928 }, { "epoch": 2.1892071952031977, "grad_norm": 0.857708991433585, "learning_rate": 5.045554983296111e-06, "loss": 0.0421, "step": 4929 }, { "epoch": 2.1896513435487455, "grad_norm": 0.4460864492390492, "learning_rate": 5.043616523633856e-06, "loss": 0.0348, "step": 4930 }, { "epoch": 2.1900954918942928, "grad_norm": 0.46033992290883, "learning_rate": 5.0416780574152976e-06, "loss": 0.0391, "step": 4931 }, { "epoch": 2.19053964023984, "grad_norm": 0.36797085648755834, "learning_rate": 5.0397395849318165e-06, "loss": 0.0379, "step": 4932 }, { "epoch": 2.1909837885853873, "grad_norm": 0.3675945320964035, "learning_rate": 5.0378011064748025e-06, "loss": 0.0252, "step": 4933 }, { "epoch": 2.191427936930935, "grad_norm": 0.6163310761797228, "learning_rate": 5.035862622335641e-06, "loss": 0.0477, "step": 4934 }, { "epoch": 2.1918720852764824, "grad_norm": 0.5755380695413213, "learning_rate": 5.0339241328057164e-06, "loss": 0.0321, "step": 4935 }, { "epoch": 2.1923162336220297, "grad_norm": 0.7365554315970313, "learning_rate": 5.0319856381764175e-06, "loss": 0.0421, "step": 4936 }, { "epoch": 2.1927603819675774, "grad_norm": 0.3786316850609603, "learning_rate": 5.030047138739136e-06, "loss": 0.0274, "step": 4937 }, { "epoch": 2.1932045303131247, "grad_norm": 0.46503560614578676, "learning_rate": 5.028108634785258e-06, "loss": 0.0329, "step": 4938 }, { "epoch": 2.193648678658672, "grad_norm": 0.39835692305695053, "learning_rate": 5.0261701266061746e-06, "loss": 0.0267, "step": 4939 }, { "epoch": 2.1940928270042193, "grad_norm": 0.3883823936267887, "learning_rate": 5.024231614493277e-06, "loss": 0.0284, "step": 4940 }, { "epoch": 2.194536975349767, "grad_norm": 0.38466420585767425, "learning_rate": 5.022293098737957e-06, "loss": 0.0302, "step": 4941 }, { "epoch": 2.1949811236953143, "grad_norm": 0.5034584168375414, "learning_rate": 5.0203545796316044e-06, "loss": 0.05, "step": 4942 }, { "epoch": 2.1954252720408616, "grad_norm": 0.5196054438026021, "learning_rate": 5.0184160574656125e-06, "loss": 0.0348, "step": 4943 }, { "epoch": 2.195869420386409, "grad_norm": 0.4645537912636916, "learning_rate": 5.0164775325313755e-06, "loss": 0.0376, "step": 4944 }, { "epoch": 2.1963135687319566, "grad_norm": 0.4894769238899238, "learning_rate": 5.0145390051202846e-06, "loss": 0.0357, "step": 4945 }, { "epoch": 2.196757717077504, "grad_norm": 0.3225076116998955, "learning_rate": 5.012600475523733e-06, "loss": 0.0268, "step": 4946 }, { "epoch": 2.197201865423051, "grad_norm": 0.46007489982886224, "learning_rate": 5.010661944033118e-06, "loss": 0.0406, "step": 4947 }, { "epoch": 2.197646013768599, "grad_norm": 0.6157542604470476, "learning_rate": 5.008723410939832e-06, "loss": 0.0329, "step": 4948 }, { "epoch": 2.198090162114146, "grad_norm": 0.41686085632858955, "learning_rate": 5.006784876535268e-06, "loss": 0.0416, "step": 4949 }, { "epoch": 2.1985343104596935, "grad_norm": 0.5468745101745486, "learning_rate": 5.004846341110822e-06, "loss": 0.0499, "step": 4950 }, { "epoch": 2.198978458805241, "grad_norm": 0.43554432758066686, "learning_rate": 5.002907804957889e-06, "loss": 0.0353, "step": 4951 }, { "epoch": 2.1994226071507885, "grad_norm": 0.4925752472694445, "learning_rate": 5.000969268367862e-06, "loss": 0.0367, "step": 4952 }, { "epoch": 2.199866755496336, "grad_norm": 0.4262017675003274, "learning_rate": 4.999030731632139e-06, "loss": 0.0444, "step": 4953 }, { "epoch": 2.200310903841883, "grad_norm": 0.3899535716527509, "learning_rate": 4.997092195042113e-06, "loss": 0.0299, "step": 4954 }, { "epoch": 2.2007550521874304, "grad_norm": 0.4534373375430896, "learning_rate": 4.995153658889181e-06, "loss": 0.0346, "step": 4955 }, { "epoch": 2.201199200532978, "grad_norm": 0.32786374874254637, "learning_rate": 4.993215123464734e-06, "loss": 0.0292, "step": 4956 }, { "epoch": 2.2016433488785254, "grad_norm": 0.48869610889723925, "learning_rate": 4.991276589060169e-06, "loss": 0.0435, "step": 4957 }, { "epoch": 2.2020874972240727, "grad_norm": 0.41141706707817044, "learning_rate": 4.989338055966883e-06, "loss": 0.0317, "step": 4958 }, { "epoch": 2.2025316455696204, "grad_norm": 0.35799145890163087, "learning_rate": 4.987399524476268e-06, "loss": 0.0343, "step": 4959 }, { "epoch": 2.2029757939151677, "grad_norm": 0.39541269197255546, "learning_rate": 4.985460994879717e-06, "loss": 0.0303, "step": 4960 }, { "epoch": 2.203419942260715, "grad_norm": 0.4446866574845067, "learning_rate": 4.983522467468627e-06, "loss": 0.0458, "step": 4961 }, { "epoch": 2.2038640906062623, "grad_norm": 0.44841266243980166, "learning_rate": 4.981583942534388e-06, "loss": 0.0401, "step": 4962 }, { "epoch": 2.20430823895181, "grad_norm": 0.40609549354200053, "learning_rate": 4.979645420368397e-06, "loss": 0.0318, "step": 4963 }, { "epoch": 2.2047523872973573, "grad_norm": 0.4655683020461079, "learning_rate": 4.977706901262045e-06, "loss": 0.0318, "step": 4964 }, { "epoch": 2.2051965356429046, "grad_norm": 0.4105935741895686, "learning_rate": 4.975768385506725e-06, "loss": 0.0365, "step": 4965 }, { "epoch": 2.2056406839884524, "grad_norm": 0.4840996358141117, "learning_rate": 4.973829873393827e-06, "loss": 0.0288, "step": 4966 }, { "epoch": 2.2060848323339997, "grad_norm": 0.43687024211685477, "learning_rate": 4.971891365214743e-06, "loss": 0.0284, "step": 4967 }, { "epoch": 2.206528980679547, "grad_norm": 0.5919063970733193, "learning_rate": 4.969952861260865e-06, "loss": 0.0316, "step": 4968 }, { "epoch": 2.2069731290250942, "grad_norm": 0.34222056631687486, "learning_rate": 4.968014361823583e-06, "loss": 0.0269, "step": 4969 }, { "epoch": 2.207417277370642, "grad_norm": 0.577057583701676, "learning_rate": 4.966075867194285e-06, "loss": 0.0391, "step": 4970 }, { "epoch": 2.2078614257161893, "grad_norm": 0.45130320193766277, "learning_rate": 4.964137377664362e-06, "loss": 0.0358, "step": 4971 }, { "epoch": 2.2083055740617366, "grad_norm": 0.37843493642246295, "learning_rate": 4.9621988935252e-06, "loss": 0.0356, "step": 4972 }, { "epoch": 2.208749722407284, "grad_norm": 0.37308118817776387, "learning_rate": 4.9602604150681835e-06, "loss": 0.0244, "step": 4973 }, { "epoch": 2.2091938707528316, "grad_norm": 0.5336868565377078, "learning_rate": 4.958321942584703e-06, "loss": 0.0488, "step": 4974 }, { "epoch": 2.209638019098379, "grad_norm": 0.3742330589994002, "learning_rate": 4.956383476366145e-06, "loss": 0.0309, "step": 4975 }, { "epoch": 2.210082167443926, "grad_norm": 0.46020603183712594, "learning_rate": 4.95444501670389e-06, "loss": 0.0323, "step": 4976 }, { "epoch": 2.2105263157894735, "grad_norm": 0.8475632066917597, "learning_rate": 4.9525065638893226e-06, "loss": 0.0465, "step": 4977 }, { "epoch": 2.210970464135021, "grad_norm": 0.5219077605882717, "learning_rate": 4.950568118213825e-06, "loss": 0.0393, "step": 4978 }, { "epoch": 2.2114146124805685, "grad_norm": 0.37860034305038076, "learning_rate": 4.948629679968778e-06, "loss": 0.0366, "step": 4979 }, { "epoch": 2.2118587608261158, "grad_norm": 0.6128046042996554, "learning_rate": 4.946691249445565e-06, "loss": 0.04, "step": 4980 }, { "epoch": 2.2123029091716635, "grad_norm": 0.4839071616267329, "learning_rate": 4.944752826935562e-06, "loss": 0.035, "step": 4981 }, { "epoch": 2.212747057517211, "grad_norm": 0.45065394012575055, "learning_rate": 4.942814412730147e-06, "loss": 0.0267, "step": 4982 }, { "epoch": 2.213191205862758, "grad_norm": 0.5986492760384308, "learning_rate": 4.940876007120699e-06, "loss": 0.0289, "step": 4983 }, { "epoch": 2.2136353542083054, "grad_norm": 0.5082807449831283, "learning_rate": 4.938937610398588e-06, "loss": 0.034, "step": 4984 }, { "epoch": 2.214079502553853, "grad_norm": 0.46534139162341914, "learning_rate": 4.9369992228551935e-06, "loss": 0.0323, "step": 4985 }, { "epoch": 2.2145236508994004, "grad_norm": 0.4943061938684834, "learning_rate": 4.935060844781886e-06, "loss": 0.0353, "step": 4986 }, { "epoch": 2.2149677992449477, "grad_norm": 0.4336318300801724, "learning_rate": 4.933122476470038e-06, "loss": 0.0275, "step": 4987 }, { "epoch": 2.2154119475904954, "grad_norm": 0.4360854695437589, "learning_rate": 4.931184118211016e-06, "loss": 0.0349, "step": 4988 }, { "epoch": 2.2158560959360427, "grad_norm": 0.6620193372024554, "learning_rate": 4.929245770296191e-06, "loss": 0.0483, "step": 4989 }, { "epoch": 2.21630024428159, "grad_norm": 0.48994703833403463, "learning_rate": 4.927307433016927e-06, "loss": 0.0432, "step": 4990 }, { "epoch": 2.2167443926271373, "grad_norm": 0.4622061500888628, "learning_rate": 4.925369106664591e-06, "loss": 0.0387, "step": 4991 }, { "epoch": 2.217188540972685, "grad_norm": 0.4624898194765989, "learning_rate": 4.923430791530547e-06, "loss": 0.0196, "step": 4992 }, { "epoch": 2.2176326893182323, "grad_norm": 0.3453109472117995, "learning_rate": 4.9214924879061565e-06, "loss": 0.0286, "step": 4993 }, { "epoch": 2.2180768376637796, "grad_norm": 0.41871877730955476, "learning_rate": 4.919554196082778e-06, "loss": 0.0397, "step": 4994 }, { "epoch": 2.2185209860093273, "grad_norm": 0.42587370028059823, "learning_rate": 4.91761591635177e-06, "loss": 0.0408, "step": 4995 }, { "epoch": 2.2189651343548746, "grad_norm": 0.36571656649505513, "learning_rate": 4.9156776490044875e-06, "loss": 0.0335, "step": 4996 }, { "epoch": 2.219409282700422, "grad_norm": 0.4229412174299105, "learning_rate": 4.91373939433229e-06, "loss": 0.0396, "step": 4997 }, { "epoch": 2.219853431045969, "grad_norm": 0.3904502538208118, "learning_rate": 4.911801152626525e-06, "loss": 0.0352, "step": 4998 }, { "epoch": 2.220297579391517, "grad_norm": 0.7376575867139856, "learning_rate": 4.909862924178545e-06, "loss": 0.04, "step": 4999 }, { "epoch": 2.2207417277370642, "grad_norm": 0.45886434872272697, "learning_rate": 4.9079247092797e-06, "loss": 0.0375, "step": 5000 }, { "epoch": 2.2211858760826115, "grad_norm": 0.38398725710621795, "learning_rate": 4.905986508221333e-06, "loss": 0.034, "step": 5001 }, { "epoch": 2.221630024428159, "grad_norm": 0.4363686982956202, "learning_rate": 4.904048321294791e-06, "loss": 0.0395, "step": 5002 }, { "epoch": 2.2220741727737066, "grad_norm": 0.47908384837669443, "learning_rate": 4.9021101487914185e-06, "loss": 0.0451, "step": 5003 }, { "epoch": 2.222518321119254, "grad_norm": 0.49404577583056375, "learning_rate": 4.900171991002553e-06, "loss": 0.0382, "step": 5004 }, { "epoch": 2.222962469464801, "grad_norm": 0.4254636742864719, "learning_rate": 4.898233848219532e-06, "loss": 0.0385, "step": 5005 }, { "epoch": 2.2234066178103484, "grad_norm": 0.5219692883962996, "learning_rate": 4.896295720733694e-06, "loss": 0.04, "step": 5006 }, { "epoch": 2.223850766155896, "grad_norm": 0.8849555604502084, "learning_rate": 4.894357608836368e-06, "loss": 0.0357, "step": 5007 }, { "epoch": 2.2242949145014435, "grad_norm": 0.772611179602613, "learning_rate": 4.89241951281889e-06, "loss": 0.0418, "step": 5008 }, { "epoch": 2.2247390628469907, "grad_norm": 0.5532053379610344, "learning_rate": 4.890481432972586e-06, "loss": 0.0352, "step": 5009 }, { "epoch": 2.2251832111925385, "grad_norm": 0.4003154527748429, "learning_rate": 4.8885433695887836e-06, "loss": 0.032, "step": 5010 }, { "epoch": 2.2256273595380858, "grad_norm": 0.45204263035502396, "learning_rate": 4.886605322958806e-06, "loss": 0.0303, "step": 5011 }, { "epoch": 2.226071507883633, "grad_norm": 0.3440942170404326, "learning_rate": 4.884667293373973e-06, "loss": 0.022, "step": 5012 }, { "epoch": 2.2265156562291804, "grad_norm": 0.7116582097517625, "learning_rate": 4.882729281125605e-06, "loss": 0.0523, "step": 5013 }, { "epoch": 2.226959804574728, "grad_norm": 0.4053030773514669, "learning_rate": 4.88079128650502e-06, "loss": 0.027, "step": 5014 }, { "epoch": 2.2274039529202754, "grad_norm": 0.6656245142980673, "learning_rate": 4.878853309803529e-06, "loss": 0.0462, "step": 5015 }, { "epoch": 2.2278481012658227, "grad_norm": 0.38760410447982047, "learning_rate": 4.876915351312444e-06, "loss": 0.0265, "step": 5016 }, { "epoch": 2.2282922496113704, "grad_norm": 0.407666066744564, "learning_rate": 4.874977411323073e-06, "loss": 0.0354, "step": 5017 }, { "epoch": 2.2287363979569177, "grad_norm": 0.483920298386755, "learning_rate": 4.873039490126718e-06, "loss": 0.0367, "step": 5018 }, { "epoch": 2.229180546302465, "grad_norm": 0.45353775305411304, "learning_rate": 4.871101588014686e-06, "loss": 0.0358, "step": 5019 }, { "epoch": 2.2296246946480123, "grad_norm": 0.4339146999685688, "learning_rate": 4.869163705278276e-06, "loss": 0.0326, "step": 5020 }, { "epoch": 2.23006884299356, "grad_norm": 1.02624057870213, "learning_rate": 4.867225842208783e-06, "loss": 0.0598, "step": 5021 }, { "epoch": 2.2305129913391073, "grad_norm": 0.44283161622309714, "learning_rate": 4.8652879990975e-06, "loss": 0.0378, "step": 5022 }, { "epoch": 2.2309571396846546, "grad_norm": 0.44468206860911264, "learning_rate": 4.863350176235721e-06, "loss": 0.0435, "step": 5023 }, { "epoch": 2.2314012880302023, "grad_norm": 0.4317246743183953, "learning_rate": 4.861412373914729e-06, "loss": 0.0339, "step": 5024 }, { "epoch": 2.2318454363757496, "grad_norm": 1.9956444643720284, "learning_rate": 4.8594745924258144e-06, "loss": 0.0611, "step": 5025 }, { "epoch": 2.232289584721297, "grad_norm": 0.5268379152925193, "learning_rate": 4.857536832060255e-06, "loss": 0.0408, "step": 5026 }, { "epoch": 2.232733733066844, "grad_norm": 0.4937129380739942, "learning_rate": 4.85559909310933e-06, "loss": 0.0523, "step": 5027 }, { "epoch": 2.233177881412392, "grad_norm": 0.42104370790377565, "learning_rate": 4.853661375864313e-06, "loss": 0.0226, "step": 5028 }, { "epoch": 2.233622029757939, "grad_norm": 0.38358747557836487, "learning_rate": 4.851723680616477e-06, "loss": 0.0288, "step": 5029 }, { "epoch": 2.2340661781034865, "grad_norm": 0.35278248179931115, "learning_rate": 4.84978600765709e-06, "loss": 0.0273, "step": 5030 }, { "epoch": 2.234510326449034, "grad_norm": 0.5070929341551949, "learning_rate": 4.84784835727742e-06, "loss": 0.0421, "step": 5031 }, { "epoch": 2.2349544747945815, "grad_norm": 0.4414918243805681, "learning_rate": 4.845910729768726e-06, "loss": 0.0299, "step": 5032 }, { "epoch": 2.235398623140129, "grad_norm": 0.4272744129564785, "learning_rate": 4.843973125422266e-06, "loss": 0.0377, "step": 5033 }, { "epoch": 2.235842771485676, "grad_norm": 0.4042780850426815, "learning_rate": 4.842035544529296e-06, "loss": 0.0273, "step": 5034 }, { "epoch": 2.2362869198312234, "grad_norm": 0.384529185725243, "learning_rate": 4.8400979873810675e-06, "loss": 0.0297, "step": 5035 }, { "epoch": 2.236731068176771, "grad_norm": 0.5143165626824721, "learning_rate": 4.838160454268827e-06, "loss": 0.0378, "step": 5036 }, { "epoch": 2.2371752165223184, "grad_norm": 0.48737479457986954, "learning_rate": 4.8362229454838185e-06, "loss": 0.0343, "step": 5037 }, { "epoch": 2.2376193648678657, "grad_norm": 0.4465431723189805, "learning_rate": 4.834285461317286e-06, "loss": 0.0367, "step": 5038 }, { "epoch": 2.2380635132134135, "grad_norm": 0.5195671115110356, "learning_rate": 4.832348002060461e-06, "loss": 0.0396, "step": 5039 }, { "epoch": 2.2385076615589607, "grad_norm": 0.46391257174900896, "learning_rate": 4.830410568004581e-06, "loss": 0.036, "step": 5040 }, { "epoch": 2.238951809904508, "grad_norm": 0.5290347592358237, "learning_rate": 4.8284731594408715e-06, "loss": 0.0448, "step": 5041 }, { "epoch": 2.2393959582500553, "grad_norm": 0.4737259034687236, "learning_rate": 4.826535776660562e-06, "loss": 0.0328, "step": 5042 }, { "epoch": 2.239840106595603, "grad_norm": 0.6929115316947699, "learning_rate": 4.824598419954871e-06, "loss": 0.0473, "step": 5043 }, { "epoch": 2.2402842549411504, "grad_norm": 0.6258064825937694, "learning_rate": 4.822661089615017e-06, "loss": 0.0594, "step": 5044 }, { "epoch": 2.2407284032866976, "grad_norm": 0.4537831813620375, "learning_rate": 4.8207237859322144e-06, "loss": 0.0413, "step": 5045 }, { "epoch": 2.2411725516322454, "grad_norm": 0.37664633790712776, "learning_rate": 4.818786509197672e-06, "loss": 0.0303, "step": 5046 }, { "epoch": 2.2416166999777927, "grad_norm": 0.3886976621007685, "learning_rate": 4.816849259702594e-06, "loss": 0.0311, "step": 5047 }, { "epoch": 2.24206084832334, "grad_norm": 0.4486972497289991, "learning_rate": 4.814912037738185e-06, "loss": 0.0442, "step": 5048 }, { "epoch": 2.2425049966688873, "grad_norm": 0.35002545439343274, "learning_rate": 4.812974843595641e-06, "loss": 0.0318, "step": 5049 }, { "epoch": 2.242949145014435, "grad_norm": 0.48300556886418516, "learning_rate": 4.811037677566154e-06, "loss": 0.0395, "step": 5050 }, { "epoch": 2.2433932933599823, "grad_norm": 0.45349978147674774, "learning_rate": 4.8091005399409145e-06, "loss": 0.0371, "step": 5051 }, { "epoch": 2.2438374417055296, "grad_norm": 0.4518413356936792, "learning_rate": 4.807163431011107e-06, "loss": 0.0311, "step": 5052 }, { "epoch": 2.244281590051077, "grad_norm": 0.3777032140049169, "learning_rate": 4.80522635106791e-06, "loss": 0.0292, "step": 5053 }, { "epoch": 2.2447257383966246, "grad_norm": 0.4369243999495971, "learning_rate": 4.8032893004025016e-06, "loss": 0.0264, "step": 5054 }, { "epoch": 2.245169886742172, "grad_norm": 0.6447452509815467, "learning_rate": 4.801352279306054e-06, "loss": 0.0427, "step": 5055 }, { "epoch": 2.245614035087719, "grad_norm": 0.5030519012852677, "learning_rate": 4.799415288069733e-06, "loss": 0.0381, "step": 5056 }, { "epoch": 2.246058183433267, "grad_norm": 0.4348315875534344, "learning_rate": 4.797478326984702e-06, "loss": 0.0235, "step": 5057 }, { "epoch": 2.246502331778814, "grad_norm": 0.47850023846789697, "learning_rate": 4.795541396342116e-06, "loss": 0.036, "step": 5058 }, { "epoch": 2.2469464801243615, "grad_norm": 0.5227923023136899, "learning_rate": 4.793604496433133e-06, "loss": 0.0382, "step": 5059 }, { "epoch": 2.247390628469909, "grad_norm": 0.5404616232699001, "learning_rate": 4.791667627548902e-06, "loss": 0.0426, "step": 5060 }, { "epoch": 2.2478347768154565, "grad_norm": 0.5031844080280006, "learning_rate": 4.7897307899805624e-06, "loss": 0.0296, "step": 5061 }, { "epoch": 2.248278925161004, "grad_norm": 0.557884313184048, "learning_rate": 4.78779398401926e-06, "loss": 0.0324, "step": 5062 }, { "epoch": 2.248723073506551, "grad_norm": 0.46196149242671997, "learning_rate": 4.785857209956124e-06, "loss": 0.031, "step": 5063 }, { "epoch": 2.2491672218520984, "grad_norm": 0.40473001188667446, "learning_rate": 4.783920468082288e-06, "loss": 0.0341, "step": 5064 }, { "epoch": 2.249611370197646, "grad_norm": 0.558756976897473, "learning_rate": 4.781983758688876e-06, "loss": 0.0257, "step": 5065 }, { "epoch": 2.2500555185431934, "grad_norm": 0.4215840569154271, "learning_rate": 4.780047082067009e-06, "loss": 0.0402, "step": 5066 }, { "epoch": 2.2504996668887407, "grad_norm": 0.41373536715916187, "learning_rate": 4.778110438507801e-06, "loss": 0.0352, "step": 5067 }, { "epoch": 2.2509438152342884, "grad_norm": 0.41096920941351195, "learning_rate": 4.776173828302365e-06, "loss": 0.0411, "step": 5068 }, { "epoch": 2.2513879635798357, "grad_norm": 0.4730245858374104, "learning_rate": 4.774237251741805e-06, "loss": 0.0327, "step": 5069 }, { "epoch": 2.251832111925383, "grad_norm": 0.38391334338943445, "learning_rate": 4.7723007091172175e-06, "loss": 0.0368, "step": 5070 }, { "epoch": 2.2522762602709303, "grad_norm": 0.34213411038422287, "learning_rate": 4.770364200719703e-06, "loss": 0.0288, "step": 5071 }, { "epoch": 2.252720408616478, "grad_norm": 0.3470528030999554, "learning_rate": 4.7684277268403515e-06, "loss": 0.0218, "step": 5072 }, { "epoch": 2.2531645569620253, "grad_norm": 0.48424365825920085, "learning_rate": 4.766491287770246e-06, "loss": 0.047, "step": 5073 }, { "epoch": 2.2536087053075726, "grad_norm": 0.44693843285607443, "learning_rate": 4.7645548838004665e-06, "loss": 0.0314, "step": 5074 }, { "epoch": 2.2540528536531204, "grad_norm": 0.43485150995828376, "learning_rate": 4.762618515222085e-06, "loss": 0.0342, "step": 5075 }, { "epoch": 2.2544970019986676, "grad_norm": 0.4279292260890883, "learning_rate": 4.760682182326176e-06, "loss": 0.0407, "step": 5076 }, { "epoch": 2.254941150344215, "grad_norm": 0.4726345938038404, "learning_rate": 4.7587458854038e-06, "loss": 0.0366, "step": 5077 }, { "epoch": 2.2553852986897622, "grad_norm": 0.4982236299340504, "learning_rate": 4.756809624746015e-06, "loss": 0.0351, "step": 5078 }, { "epoch": 2.25582944703531, "grad_norm": 0.416498003591409, "learning_rate": 4.754873400643875e-06, "loss": 0.0345, "step": 5079 }, { "epoch": 2.2562735953808573, "grad_norm": 0.5784538425140838, "learning_rate": 4.7529372133884265e-06, "loss": 0.0351, "step": 5080 }, { "epoch": 2.2567177437264045, "grad_norm": 0.4069962515786798, "learning_rate": 4.75100106327071e-06, "loss": 0.0287, "step": 5081 }, { "epoch": 2.2571618920719523, "grad_norm": 0.44000239841542776, "learning_rate": 4.749064950581765e-06, "loss": 0.0318, "step": 5082 }, { "epoch": 2.2576060404174996, "grad_norm": 0.4673614175152977, "learning_rate": 4.747128875612621e-06, "loss": 0.0298, "step": 5083 }, { "epoch": 2.258050188763047, "grad_norm": 0.577011350942782, "learning_rate": 4.745192838654304e-06, "loss": 0.0388, "step": 5084 }, { "epoch": 2.258494337108594, "grad_norm": 0.40314614436836343, "learning_rate": 4.743256839997828e-06, "loss": 0.0318, "step": 5085 }, { "epoch": 2.2589384854541414, "grad_norm": 0.3926832668709334, "learning_rate": 4.741320879934213e-06, "loss": 0.0361, "step": 5086 }, { "epoch": 2.259382633799689, "grad_norm": 0.5101559666731487, "learning_rate": 4.739384958754461e-06, "loss": 0.0451, "step": 5087 }, { "epoch": 2.2598267821452365, "grad_norm": 0.35566284403310655, "learning_rate": 4.73744907674958e-06, "loss": 0.0228, "step": 5088 }, { "epoch": 2.2602709304907838, "grad_norm": 0.5266642057409271, "learning_rate": 4.7355132342105615e-06, "loss": 0.0387, "step": 5089 }, { "epoch": 2.2607150788363315, "grad_norm": 0.4021115546007318, "learning_rate": 4.733577431428398e-06, "loss": 0.0313, "step": 5090 }, { "epoch": 2.261159227181879, "grad_norm": 0.48902419392318525, "learning_rate": 4.73164166869407e-06, "loss": 0.0465, "step": 5091 }, { "epoch": 2.261603375527426, "grad_norm": 0.4831670488263336, "learning_rate": 4.729705946298557e-06, "loss": 0.0298, "step": 5092 }, { "epoch": 2.2620475238729734, "grad_norm": 1.0045520985152046, "learning_rate": 4.727770264532835e-06, "loss": 0.0386, "step": 5093 }, { "epoch": 2.262491672218521, "grad_norm": 0.5424294421568248, "learning_rate": 4.725834623687866e-06, "loss": 0.0448, "step": 5094 }, { "epoch": 2.2629358205640684, "grad_norm": 0.37090231243704935, "learning_rate": 4.723899024054609e-06, "loss": 0.0276, "step": 5095 }, { "epoch": 2.2633799689096157, "grad_norm": 0.41587596025980633, "learning_rate": 4.7219634659240195e-06, "loss": 0.037, "step": 5096 }, { "epoch": 2.2638241172551634, "grad_norm": 0.4579599445978751, "learning_rate": 4.720027949587046e-06, "loss": 0.0355, "step": 5097 }, { "epoch": 2.2642682656007107, "grad_norm": 0.33053691402038254, "learning_rate": 4.718092475334623e-06, "loss": 0.0237, "step": 5098 }, { "epoch": 2.264712413946258, "grad_norm": 0.4734298805101396, "learning_rate": 4.716157043457692e-06, "loss": 0.0406, "step": 5099 }, { "epoch": 2.2651565622918053, "grad_norm": 0.495566512713766, "learning_rate": 4.71422165424718e-06, "loss": 0.0446, "step": 5100 }, { "epoch": 2.265600710637353, "grad_norm": 0.5845335631241523, "learning_rate": 4.712286307994008e-06, "loss": 0.0483, "step": 5101 }, { "epoch": 2.2660448589829003, "grad_norm": 0.5632138786502331, "learning_rate": 4.71035100498909e-06, "loss": 0.0301, "step": 5102 }, { "epoch": 2.2664890073284476, "grad_norm": 0.4679864832864739, "learning_rate": 4.708415745523338e-06, "loss": 0.0282, "step": 5103 }, { "epoch": 2.2669331556739953, "grad_norm": 0.3347447433418278, "learning_rate": 4.70648052988765e-06, "loss": 0.0284, "step": 5104 }, { "epoch": 2.2673773040195426, "grad_norm": 0.3922788691977759, "learning_rate": 4.704545358372926e-06, "loss": 0.0376, "step": 5105 }, { "epoch": 2.26782145236509, "grad_norm": 0.574421309924243, "learning_rate": 4.702610231270053e-06, "loss": 0.0357, "step": 5106 }, { "epoch": 2.268265600710637, "grad_norm": 0.3970980176428026, "learning_rate": 4.7006751488699145e-06, "loss": 0.0352, "step": 5107 }, { "epoch": 2.268709749056185, "grad_norm": 0.48320849905313684, "learning_rate": 4.698740111463386e-06, "loss": 0.0482, "step": 5108 }, { "epoch": 2.2691538974017322, "grad_norm": 0.48488451433259316, "learning_rate": 4.696805119341334e-06, "loss": 0.0323, "step": 5109 }, { "epoch": 2.2695980457472795, "grad_norm": 0.38461440123857293, "learning_rate": 4.694870172794625e-06, "loss": 0.0256, "step": 5110 }, { "epoch": 2.270042194092827, "grad_norm": 0.5557330047140479, "learning_rate": 4.692935272114113e-06, "loss": 0.0303, "step": 5111 }, { "epoch": 2.2704863424383745, "grad_norm": 0.4256236131352179, "learning_rate": 4.6910004175906435e-06, "loss": 0.0233, "step": 5112 }, { "epoch": 2.270930490783922, "grad_norm": 0.46988918438082106, "learning_rate": 4.689065609515062e-06, "loss": 0.0363, "step": 5113 }, { "epoch": 2.271374639129469, "grad_norm": 0.49373029930466805, "learning_rate": 4.687130848178202e-06, "loss": 0.0483, "step": 5114 }, { "epoch": 2.2718187874750164, "grad_norm": 0.34972548300588746, "learning_rate": 4.685196133870887e-06, "loss": 0.0255, "step": 5115 }, { "epoch": 2.272262935820564, "grad_norm": 0.581571662658351, "learning_rate": 4.683261466883942e-06, "loss": 0.0414, "step": 5116 }, { "epoch": 2.2727070841661114, "grad_norm": 0.44401771745072516, "learning_rate": 4.681326847508181e-06, "loss": 0.032, "step": 5117 }, { "epoch": 2.2731512325116587, "grad_norm": 0.5278060546522586, "learning_rate": 4.6793922760344065e-06, "loss": 0.0418, "step": 5118 }, { "epoch": 2.2735953808572065, "grad_norm": 0.5206864377402288, "learning_rate": 4.6774577527534195e-06, "loss": 0.0388, "step": 5119 }, { "epoch": 2.2740395292027538, "grad_norm": 0.4082716881176943, "learning_rate": 4.675523277956011e-06, "loss": 0.0283, "step": 5120 }, { "epoch": 2.274483677548301, "grad_norm": 0.4188100723565611, "learning_rate": 4.673588851932964e-06, "loss": 0.0377, "step": 5121 }, { "epoch": 2.2749278258938483, "grad_norm": 0.32957447472088847, "learning_rate": 4.671654474975061e-06, "loss": 0.0284, "step": 5122 }, { "epoch": 2.275371974239396, "grad_norm": 0.5325173700148402, "learning_rate": 4.669720147373065e-06, "loss": 0.0304, "step": 5123 }, { "epoch": 2.2758161225849434, "grad_norm": 0.7259350026566369, "learning_rate": 4.667785869417744e-06, "loss": 0.0484, "step": 5124 }, { "epoch": 2.2762602709304907, "grad_norm": 0.34263569388234766, "learning_rate": 4.6658516413998486e-06, "loss": 0.0302, "step": 5125 }, { "epoch": 2.2767044192760384, "grad_norm": 0.4050161385405534, "learning_rate": 4.663917463610128e-06, "loss": 0.0327, "step": 5126 }, { "epoch": 2.2771485676215857, "grad_norm": 0.4351043792797754, "learning_rate": 4.661983336339319e-06, "loss": 0.0347, "step": 5127 }, { "epoch": 2.277592715967133, "grad_norm": 0.4544801574632252, "learning_rate": 4.66004925987816e-06, "loss": 0.0335, "step": 5128 }, { "epoch": 2.2780368643126803, "grad_norm": 0.5298789728156417, "learning_rate": 4.6581152345173714e-06, "loss": 0.0481, "step": 5129 }, { "epoch": 2.278481012658228, "grad_norm": 0.42879788700967447, "learning_rate": 4.656181260547669e-06, "loss": 0.0306, "step": 5130 }, { "epoch": 2.2789251610037753, "grad_norm": 0.432457604882885, "learning_rate": 4.654247338259766e-06, "loss": 0.0348, "step": 5131 }, { "epoch": 2.2793693093493226, "grad_norm": 0.4238026290724064, "learning_rate": 4.652313467944358e-06, "loss": 0.0362, "step": 5132 }, { "epoch": 2.2798134576948703, "grad_norm": 0.37078187284702663, "learning_rate": 4.650379649892145e-06, "loss": 0.0225, "step": 5133 }, { "epoch": 2.2802576060404176, "grad_norm": 0.541283465006368, "learning_rate": 4.648445884393808e-06, "loss": 0.0427, "step": 5134 }, { "epoch": 2.280701754385965, "grad_norm": 0.5756809781245248, "learning_rate": 4.646512171740028e-06, "loss": 0.0416, "step": 5135 }, { "epoch": 2.281145902731512, "grad_norm": 0.6172689341906578, "learning_rate": 4.6445785122214715e-06, "loss": 0.0468, "step": 5136 }, { "epoch": 2.28159005107706, "grad_norm": 0.4664467318852636, "learning_rate": 4.6426449061288035e-06, "loss": 0.0394, "step": 5137 }, { "epoch": 2.282034199422607, "grad_norm": 0.3708061416037573, "learning_rate": 4.640711353752675e-06, "loss": 0.0308, "step": 5138 }, { "epoch": 2.2824783477681545, "grad_norm": 0.4312428309303351, "learning_rate": 4.638777855383735e-06, "loss": 0.0361, "step": 5139 }, { "epoch": 2.282922496113702, "grad_norm": 0.3963057474327594, "learning_rate": 4.636844411312618e-06, "loss": 0.0244, "step": 5140 }, { "epoch": 2.2833666444592495, "grad_norm": 0.48035729602544386, "learning_rate": 4.634911021829956e-06, "loss": 0.0388, "step": 5141 }, { "epoch": 2.283810792804797, "grad_norm": 0.3951503991589514, "learning_rate": 4.63297768722637e-06, "loss": 0.0311, "step": 5142 }, { "epoch": 2.284254941150344, "grad_norm": 0.3422700971686053, "learning_rate": 4.6310444077924705e-06, "loss": 0.0244, "step": 5143 }, { "epoch": 2.2846990894958914, "grad_norm": 0.6947651223374093, "learning_rate": 4.629111183818863e-06, "loss": 0.0351, "step": 5144 }, { "epoch": 2.285143237841439, "grad_norm": 0.4898604396865276, "learning_rate": 4.627178015596147e-06, "loss": 0.0369, "step": 5145 }, { "epoch": 2.2855873861869864, "grad_norm": 0.384272758272605, "learning_rate": 4.625244903414908e-06, "loss": 0.0378, "step": 5146 }, { "epoch": 2.2860315345325337, "grad_norm": 0.6383973512520991, "learning_rate": 4.623311847565725e-06, "loss": 0.0371, "step": 5147 }, { "epoch": 2.2864756828780815, "grad_norm": 0.43116354217379305, "learning_rate": 4.621378848339172e-06, "loss": 0.0385, "step": 5148 }, { "epoch": 2.2869198312236287, "grad_norm": 0.37732659096182064, "learning_rate": 4.619445906025807e-06, "loss": 0.0302, "step": 5149 }, { "epoch": 2.287363979569176, "grad_norm": 0.38619197439251224, "learning_rate": 4.6175130209161894e-06, "loss": 0.0309, "step": 5150 }, { "epoch": 2.2878081279147233, "grad_norm": 0.3996619270485742, "learning_rate": 4.615580193300861e-06, "loss": 0.0342, "step": 5151 }, { "epoch": 2.288252276260271, "grad_norm": 0.4975335885399077, "learning_rate": 4.613647423470361e-06, "loss": 0.0295, "step": 5152 }, { "epoch": 2.2886964246058183, "grad_norm": 0.3800465801231585, "learning_rate": 4.611714711715215e-06, "loss": 0.0287, "step": 5153 }, { "epoch": 2.2891405729513656, "grad_norm": 0.4187996600512099, "learning_rate": 4.609782058325944e-06, "loss": 0.0357, "step": 5154 }, { "epoch": 2.2895847212969134, "grad_norm": 0.6627247371766289, "learning_rate": 4.607849463593056e-06, "loss": 0.0319, "step": 5155 }, { "epoch": 2.2900288696424607, "grad_norm": 0.44696110840918063, "learning_rate": 4.6059169278070576e-06, "loss": 0.0439, "step": 5156 }, { "epoch": 2.290473017988008, "grad_norm": 0.43597000773880085, "learning_rate": 4.603984451258439e-06, "loss": 0.0309, "step": 5157 }, { "epoch": 2.2909171663335552, "grad_norm": 0.3416433662316786, "learning_rate": 4.602052034237684e-06, "loss": 0.0244, "step": 5158 }, { "epoch": 2.291361314679103, "grad_norm": 0.4086413859197415, "learning_rate": 4.600119677035269e-06, "loss": 0.0357, "step": 5159 }, { "epoch": 2.2918054630246503, "grad_norm": 0.4905561101374712, "learning_rate": 4.598187379941659e-06, "loss": 0.036, "step": 5160 }, { "epoch": 2.2922496113701976, "grad_norm": 0.47425089286645844, "learning_rate": 4.59625514324731e-06, "loss": 0.0344, "step": 5161 }, { "epoch": 2.2926937597157453, "grad_norm": 0.4768903592454644, "learning_rate": 4.594322967242673e-06, "loss": 0.0445, "step": 5162 }, { "epoch": 2.2931379080612926, "grad_norm": 0.6604194357601725, "learning_rate": 4.592390852218185e-06, "loss": 0.0353, "step": 5163 }, { "epoch": 2.29358205640684, "grad_norm": 0.6477706525687013, "learning_rate": 4.590458798464275e-06, "loss": 0.0346, "step": 5164 }, { "epoch": 2.294026204752387, "grad_norm": 0.4782427676732749, "learning_rate": 4.588526806271366e-06, "loss": 0.0332, "step": 5165 }, { "epoch": 2.294470353097935, "grad_norm": 0.33598954590538455, "learning_rate": 4.5865948759298656e-06, "loss": 0.0242, "step": 5166 }, { "epoch": 2.294914501443482, "grad_norm": 0.5314954081213854, "learning_rate": 4.58466300773018e-06, "loss": 0.0375, "step": 5167 }, { "epoch": 2.2953586497890295, "grad_norm": 0.45504963601939913, "learning_rate": 4.582731201962699e-06, "loss": 0.0325, "step": 5168 }, { "epoch": 2.2958027981345768, "grad_norm": 0.5020378540227112, "learning_rate": 4.5807994589178066e-06, "loss": 0.0397, "step": 5169 }, { "epoch": 2.2962469464801245, "grad_norm": 0.44086322018058155, "learning_rate": 4.578867778885877e-06, "loss": 0.0385, "step": 5170 }, { "epoch": 2.296691094825672, "grad_norm": 0.4190415887827377, "learning_rate": 4.5769361621572735e-06, "loss": 0.0351, "step": 5171 }, { "epoch": 2.297135243171219, "grad_norm": 0.40356235040543037, "learning_rate": 4.575004609022349e-06, "loss": 0.0384, "step": 5172 }, { "epoch": 2.2975793915167664, "grad_norm": 0.5214338427099589, "learning_rate": 4.573073119771455e-06, "loss": 0.0414, "step": 5173 }, { "epoch": 2.298023539862314, "grad_norm": 0.6042586649916282, "learning_rate": 4.571141694694922e-06, "loss": 0.0398, "step": 5174 }, { "epoch": 2.2984676882078614, "grad_norm": 0.5451011557742893, "learning_rate": 4.569210334083077e-06, "loss": 0.0284, "step": 5175 }, { "epoch": 2.2989118365534087, "grad_norm": 0.6693775358695208, "learning_rate": 4.567279038226237e-06, "loss": 0.0393, "step": 5176 }, { "epoch": 2.2993559848989564, "grad_norm": 0.4565914726092485, "learning_rate": 4.565347807414709e-06, "loss": 0.0369, "step": 5177 }, { "epoch": 2.2998001332445037, "grad_norm": 0.3909149508886464, "learning_rate": 4.563416641938786e-06, "loss": 0.0312, "step": 5178 }, { "epoch": 2.300244281590051, "grad_norm": 0.48805937276628275, "learning_rate": 4.5614855420887595e-06, "loss": 0.0417, "step": 5179 }, { "epoch": 2.3006884299355983, "grad_norm": 0.47062502463527334, "learning_rate": 4.559554508154906e-06, "loss": 0.0359, "step": 5180 }, { "epoch": 2.301132578281146, "grad_norm": 0.41598990819931514, "learning_rate": 4.557623540427492e-06, "loss": 0.0298, "step": 5181 }, { "epoch": 2.3015767266266933, "grad_norm": 0.5301634737897044, "learning_rate": 4.555692639196774e-06, "loss": 0.0419, "step": 5182 }, { "epoch": 2.3020208749722406, "grad_norm": 0.3965912885980032, "learning_rate": 4.553761804752997e-06, "loss": 0.0268, "step": 5183 }, { "epoch": 2.3024650233177884, "grad_norm": 0.3853380797199742, "learning_rate": 4.551831037386405e-06, "loss": 0.0296, "step": 5184 }, { "epoch": 2.3029091716633356, "grad_norm": 0.4626560137060772, "learning_rate": 4.54990033738722e-06, "loss": 0.0347, "step": 5185 }, { "epoch": 2.303353320008883, "grad_norm": 0.4662767664074809, "learning_rate": 4.54796970504566e-06, "loss": 0.0382, "step": 5186 }, { "epoch": 2.3037974683544302, "grad_norm": 0.5212735203918284, "learning_rate": 4.546039140651932e-06, "loss": 0.0347, "step": 5187 }, { "epoch": 2.304241616699978, "grad_norm": 0.6660381074871294, "learning_rate": 4.544108644496232e-06, "loss": 0.0414, "step": 5188 }, { "epoch": 2.3046857650455252, "grad_norm": 0.49995652041639127, "learning_rate": 4.542178216868746e-06, "loss": 0.0394, "step": 5189 }, { "epoch": 2.3051299133910725, "grad_norm": 0.38693338487127227, "learning_rate": 4.540247858059654e-06, "loss": 0.0244, "step": 5190 }, { "epoch": 2.3055740617366203, "grad_norm": 0.42802350164666275, "learning_rate": 4.538317568359117e-06, "loss": 0.0234, "step": 5191 }, { "epoch": 2.3060182100821676, "grad_norm": 0.33959690635237333, "learning_rate": 4.536387348057292e-06, "loss": 0.0273, "step": 5192 }, { "epoch": 2.306462358427715, "grad_norm": 0.560656839558957, "learning_rate": 4.5344571974443255e-06, "loss": 0.0361, "step": 5193 }, { "epoch": 2.306906506773262, "grad_norm": 0.3618868056091977, "learning_rate": 4.5325271168103496e-06, "loss": 0.0289, "step": 5194 }, { "epoch": 2.3073506551188094, "grad_norm": 0.38869936597852134, "learning_rate": 4.530597106445487e-06, "loss": 0.0335, "step": 5195 }, { "epoch": 2.307794803464357, "grad_norm": 0.451335091229613, "learning_rate": 4.528667166639855e-06, "loss": 0.0293, "step": 5196 }, { "epoch": 2.3082389518099045, "grad_norm": 0.3756798518286319, "learning_rate": 4.526737297683554e-06, "loss": 0.0307, "step": 5197 }, { "epoch": 2.3086831001554518, "grad_norm": 0.41638755620692586, "learning_rate": 4.524807499866678e-06, "loss": 0.0313, "step": 5198 }, { "epoch": 2.3091272485009995, "grad_norm": 0.49512523947166825, "learning_rate": 4.522877773479305e-06, "loss": 0.0341, "step": 5199 }, { "epoch": 2.3095713968465468, "grad_norm": 0.38110036222157506, "learning_rate": 4.520948118811508e-06, "loss": 0.0301, "step": 5200 }, { "epoch": 2.310015545192094, "grad_norm": 0.4176831056196918, "learning_rate": 4.519018536153346e-06, "loss": 0.0325, "step": 5201 }, { "epoch": 2.3104596935376414, "grad_norm": 0.37358674774847195, "learning_rate": 4.517089025794869e-06, "loss": 0.0223, "step": 5202 }, { "epoch": 2.310903841883189, "grad_norm": 0.5722574491295905, "learning_rate": 4.515159588026114e-06, "loss": 0.0385, "step": 5203 }, { "epoch": 2.3113479902287364, "grad_norm": 0.3452449788849875, "learning_rate": 4.51323022313711e-06, "loss": 0.0239, "step": 5204 }, { "epoch": 2.3117921385742837, "grad_norm": 0.5594235169002529, "learning_rate": 4.511300931417872e-06, "loss": 0.0378, "step": 5205 }, { "epoch": 2.3122362869198314, "grad_norm": 0.4629383899400429, "learning_rate": 4.509371713158404e-06, "loss": 0.0465, "step": 5206 }, { "epoch": 2.3126804352653787, "grad_norm": 0.344536109032876, "learning_rate": 4.507442568648702e-06, "loss": 0.0214, "step": 5207 }, { "epoch": 2.313124583610926, "grad_norm": 0.5327665523608216, "learning_rate": 4.505513498178752e-06, "loss": 0.0474, "step": 5208 }, { "epoch": 2.3135687319564733, "grad_norm": 0.43197164521390413, "learning_rate": 4.503584502038521e-06, "loss": 0.0307, "step": 5209 }, { "epoch": 2.314012880302021, "grad_norm": 0.46940506061763837, "learning_rate": 4.501655580517972e-06, "loss": 0.0405, "step": 5210 }, { "epoch": 2.3144570286475683, "grad_norm": 0.3043189475892609, "learning_rate": 4.499726733907056e-06, "loss": 0.0208, "step": 5211 }, { "epoch": 2.3149011769931156, "grad_norm": 0.5162978477725472, "learning_rate": 4.497797962495707e-06, "loss": 0.0417, "step": 5212 }, { "epoch": 2.3153453253386633, "grad_norm": 0.4166068328105937, "learning_rate": 4.495869266573857e-06, "loss": 0.0363, "step": 5213 }, { "epoch": 2.3157894736842106, "grad_norm": 0.5916403656140393, "learning_rate": 4.49394064643142e-06, "loss": 0.0401, "step": 5214 }, { "epoch": 2.316233622029758, "grad_norm": 0.6022617123765319, "learning_rate": 4.492012102358301e-06, "loss": 0.0307, "step": 5215 }, { "epoch": 2.316677770375305, "grad_norm": 0.4369169634282751, "learning_rate": 4.490083634644391e-06, "loss": 0.042, "step": 5216 }, { "epoch": 2.317121918720853, "grad_norm": 0.5201884422005028, "learning_rate": 4.488155243579574e-06, "loss": 0.0353, "step": 5217 }, { "epoch": 2.3175660670664002, "grad_norm": 0.4192988414508763, "learning_rate": 4.486226929453716e-06, "loss": 0.0347, "step": 5218 }, { "epoch": 2.3180102154119475, "grad_norm": 0.47826969893341476, "learning_rate": 4.4842986925566805e-06, "loss": 0.0412, "step": 5219 }, { "epoch": 2.3184543637574953, "grad_norm": 0.4411896998427744, "learning_rate": 4.482370533178311e-06, "loss": 0.0473, "step": 5220 }, { "epoch": 2.3188985121030425, "grad_norm": 0.7154366399859141, "learning_rate": 4.4804424516084435e-06, "loss": 0.0318, "step": 5221 }, { "epoch": 2.31934266044859, "grad_norm": 0.4486899236378927, "learning_rate": 4.478514448136901e-06, "loss": 0.03, "step": 5222 }, { "epoch": 2.319786808794137, "grad_norm": 0.6696629896177303, "learning_rate": 4.476586523053494e-06, "loss": 0.0447, "step": 5223 }, { "epoch": 2.3202309571396844, "grad_norm": 0.36764590834428945, "learning_rate": 4.474658676648025e-06, "loss": 0.0317, "step": 5224 }, { "epoch": 2.320675105485232, "grad_norm": 0.4074350155089678, "learning_rate": 4.4727309092102825e-06, "loss": 0.0338, "step": 5225 }, { "epoch": 2.3211192538307794, "grad_norm": 0.5410060660419875, "learning_rate": 4.47080322103004e-06, "loss": 0.0406, "step": 5226 }, { "epoch": 2.3215634021763267, "grad_norm": 0.3869245785320258, "learning_rate": 4.4688756123970625e-06, "loss": 0.025, "step": 5227 }, { "epoch": 2.3220075505218745, "grad_norm": 0.3306613006808203, "learning_rate": 4.466948083601103e-06, "loss": 0.0216, "step": 5228 }, { "epoch": 2.3224516988674218, "grad_norm": 0.3885322170673341, "learning_rate": 4.4650206349319e-06, "loss": 0.0334, "step": 5229 }, { "epoch": 2.322895847212969, "grad_norm": 0.45727532111911046, "learning_rate": 4.463093266679185e-06, "loss": 0.0314, "step": 5230 }, { "epoch": 2.3233399955585163, "grad_norm": 0.40470162685593614, "learning_rate": 4.4611659791326726e-06, "loss": 0.0225, "step": 5231 }, { "epoch": 2.323784143904064, "grad_norm": 0.3709631261080152, "learning_rate": 4.459238772582067e-06, "loss": 0.0281, "step": 5232 }, { "epoch": 2.3242282922496114, "grad_norm": 0.5000330379973269, "learning_rate": 4.457311647317058e-06, "loss": 0.0408, "step": 5233 }, { "epoch": 2.3246724405951587, "grad_norm": 0.40795990787328895, "learning_rate": 4.4553846036273294e-06, "loss": 0.0304, "step": 5234 }, { "epoch": 2.3251165889407064, "grad_norm": 0.41652506953547214, "learning_rate": 4.453457641802542e-06, "loss": 0.0283, "step": 5235 }, { "epoch": 2.3255607372862537, "grad_norm": 0.422544506737885, "learning_rate": 4.451530762132359e-06, "loss": 0.0247, "step": 5236 }, { "epoch": 2.326004885631801, "grad_norm": 0.42941673106868544, "learning_rate": 4.4496039649064185e-06, "loss": 0.0271, "step": 5237 }, { "epoch": 2.3264490339773483, "grad_norm": 0.5518213591251218, "learning_rate": 4.4476772504143525e-06, "loss": 0.0283, "step": 5238 }, { "epoch": 2.326893182322896, "grad_norm": 0.3905942457168205, "learning_rate": 4.445750618945778e-06, "loss": 0.0338, "step": 5239 }, { "epoch": 2.3273373306684433, "grad_norm": 0.5264154088819042, "learning_rate": 4.443824070790298e-06, "loss": 0.0373, "step": 5240 }, { "epoch": 2.3277814790139906, "grad_norm": 0.4859563195815842, "learning_rate": 4.4418976062375095e-06, "loss": 0.0472, "step": 5241 }, { "epoch": 2.3282256273595383, "grad_norm": 0.43082619257296717, "learning_rate": 4.439971225576992e-06, "loss": 0.0321, "step": 5242 }, { "epoch": 2.3286697757050856, "grad_norm": 0.4148209810054243, "learning_rate": 4.438044929098312e-06, "loss": 0.027, "step": 5243 }, { "epoch": 2.329113924050633, "grad_norm": 0.3906796792959702, "learning_rate": 4.436118717091025e-06, "loss": 0.0302, "step": 5244 }, { "epoch": 2.32955807239618, "grad_norm": 0.5074169188006878, "learning_rate": 4.434192589844674e-06, "loss": 0.033, "step": 5245 }, { "epoch": 2.330002220741728, "grad_norm": 0.5700168097501661, "learning_rate": 4.432266547648786e-06, "loss": 0.0386, "step": 5246 }, { "epoch": 2.330446369087275, "grad_norm": 0.7002894794054985, "learning_rate": 4.430340590792883e-06, "loss": 0.0452, "step": 5247 }, { "epoch": 2.3308905174328225, "grad_norm": 0.43587113620478884, "learning_rate": 4.428414719566464e-06, "loss": 0.0455, "step": 5248 }, { "epoch": 2.33133466577837, "grad_norm": 0.4414200442172319, "learning_rate": 4.426488934259023e-06, "loss": 0.038, "step": 5249 }, { "epoch": 2.3317788141239175, "grad_norm": 0.3603980656114873, "learning_rate": 4.424563235160039e-06, "loss": 0.0284, "step": 5250 }, { "epoch": 2.332222962469465, "grad_norm": 0.3831150016755091, "learning_rate": 4.422637622558973e-06, "loss": 0.0302, "step": 5251 }, { "epoch": 2.332667110815012, "grad_norm": 0.3992416324305334, "learning_rate": 4.42071209674528e-06, "loss": 0.0324, "step": 5252 }, { "epoch": 2.3331112591605594, "grad_norm": 0.4795655074267854, "learning_rate": 4.4187866580084005e-06, "loss": 0.0456, "step": 5253 }, { "epoch": 2.333555407506107, "grad_norm": 0.3484662534276821, "learning_rate": 4.41686130663776e-06, "loss": 0.0223, "step": 5254 }, { "epoch": 2.3339995558516544, "grad_norm": 0.4234114697000452, "learning_rate": 4.4149360429227695e-06, "loss": 0.0296, "step": 5255 }, { "epoch": 2.3344437041972017, "grad_norm": 0.4632049279941834, "learning_rate": 4.4130108671528315e-06, "loss": 0.0264, "step": 5256 }, { "epoch": 2.3348878525427494, "grad_norm": 0.5054504530004318, "learning_rate": 4.41108577961733e-06, "loss": 0.0423, "step": 5257 }, { "epoch": 2.3353320008882967, "grad_norm": 0.4132515605214912, "learning_rate": 4.40916078060564e-06, "loss": 0.026, "step": 5258 }, { "epoch": 2.335776149233844, "grad_norm": 0.4886399938932667, "learning_rate": 4.407235870407122e-06, "loss": 0.0378, "step": 5259 }, { "epoch": 2.3362202975793913, "grad_norm": 0.48169156288056764, "learning_rate": 4.4053110493111226e-06, "loss": 0.0366, "step": 5260 }, { "epoch": 2.336664445924939, "grad_norm": 0.5264104312524703, "learning_rate": 4.403386317606972e-06, "loss": 0.0427, "step": 5261 }, { "epoch": 2.3371085942704863, "grad_norm": 0.3726652002849068, "learning_rate": 4.4014616755839955e-06, "loss": 0.026, "step": 5262 }, { "epoch": 2.3375527426160336, "grad_norm": 0.531223484070124, "learning_rate": 4.399537123531494e-06, "loss": 0.0389, "step": 5263 }, { "epoch": 2.3379968909615814, "grad_norm": 0.5377515081195285, "learning_rate": 4.3976126617387645e-06, "loss": 0.0466, "step": 5264 }, { "epoch": 2.3384410393071287, "grad_norm": 0.4256267011064791, "learning_rate": 4.395688290495084e-06, "loss": 0.0381, "step": 5265 }, { "epoch": 2.338885187652676, "grad_norm": 0.4792927506755932, "learning_rate": 4.393764010089719e-06, "loss": 0.046, "step": 5266 }, { "epoch": 2.3393293359982232, "grad_norm": 0.5281025771868061, "learning_rate": 4.391839820811923e-06, "loss": 0.0396, "step": 5267 }, { "epoch": 2.339773484343771, "grad_norm": 0.4893018865311666, "learning_rate": 4.389915722950931e-06, "loss": 0.0368, "step": 5268 }, { "epoch": 2.3402176326893183, "grad_norm": 0.4896668042908599, "learning_rate": 4.387991716795968e-06, "loss": 0.0406, "step": 5269 }, { "epoch": 2.3406617810348656, "grad_norm": 0.39529432125848185, "learning_rate": 4.386067802636249e-06, "loss": 0.0323, "step": 5270 }, { "epoch": 2.3411059293804133, "grad_norm": 0.38393804306481244, "learning_rate": 4.384143980760968e-06, "loss": 0.0321, "step": 5271 }, { "epoch": 2.3415500777259606, "grad_norm": 0.4030478157940605, "learning_rate": 4.382220251459306e-06, "loss": 0.0348, "step": 5272 }, { "epoch": 2.341994226071508, "grad_norm": 0.46335758617758316, "learning_rate": 4.380296615020437e-06, "loss": 0.0481, "step": 5273 }, { "epoch": 2.342438374417055, "grad_norm": 0.39799741674310996, "learning_rate": 4.3783730717335124e-06, "loss": 0.0365, "step": 5274 }, { "epoch": 2.342882522762603, "grad_norm": 0.6313177165283153, "learning_rate": 4.376449621887674e-06, "loss": 0.0357, "step": 5275 }, { "epoch": 2.34332667110815, "grad_norm": 0.46331907567288616, "learning_rate": 4.37452626577205e-06, "loss": 0.0404, "step": 5276 }, { "epoch": 2.3437708194536975, "grad_norm": 0.45182075889541223, "learning_rate": 4.372603003675755e-06, "loss": 0.0273, "step": 5277 }, { "epoch": 2.3442149677992448, "grad_norm": 0.45396241594109277, "learning_rate": 4.370679835887885e-06, "loss": 0.0334, "step": 5278 }, { "epoch": 2.3446591161447925, "grad_norm": 0.4008935982764722, "learning_rate": 4.368756762697525e-06, "loss": 0.0295, "step": 5279 }, { "epoch": 2.34510326449034, "grad_norm": 0.4497120560175991, "learning_rate": 4.366833784393746e-06, "loss": 0.034, "step": 5280 }, { "epoch": 2.345547412835887, "grad_norm": 0.5274538552861234, "learning_rate": 4.364910901265607e-06, "loss": 0.0358, "step": 5281 }, { "epoch": 2.3459915611814344, "grad_norm": 0.5012732757402258, "learning_rate": 4.362988113602147e-06, "loss": 0.0351, "step": 5282 }, { "epoch": 2.346435709526982, "grad_norm": 0.41702907826337277, "learning_rate": 4.361065421692394e-06, "loss": 0.0311, "step": 5283 }, { "epoch": 2.3468798578725294, "grad_norm": 0.4723862662210319, "learning_rate": 4.3591428258253634e-06, "loss": 0.0283, "step": 5284 }, { "epoch": 2.3473240062180767, "grad_norm": 0.4642888220640226, "learning_rate": 4.35722032629005e-06, "loss": 0.0341, "step": 5285 }, { "epoch": 2.3477681545636244, "grad_norm": 0.5515201989991365, "learning_rate": 4.35529792337544e-06, "loss": 0.0426, "step": 5286 }, { "epoch": 2.3482123029091717, "grad_norm": 0.42814364810008887, "learning_rate": 4.353375617370506e-06, "loss": 0.0386, "step": 5287 }, { "epoch": 2.348656451254719, "grad_norm": 0.422713836528967, "learning_rate": 4.3514534085642e-06, "loss": 0.0477, "step": 5288 }, { "epoch": 2.3491005996002663, "grad_norm": 0.388857891163739, "learning_rate": 4.349531297245464e-06, "loss": 0.0284, "step": 5289 }, { "epoch": 2.349544747945814, "grad_norm": 0.33430013810360537, "learning_rate": 4.347609283703224e-06, "loss": 0.0222, "step": 5290 }, { "epoch": 2.3499888962913613, "grad_norm": 0.4550663748035299, "learning_rate": 4.345687368226391e-06, "loss": 0.0484, "step": 5291 }, { "epoch": 2.3504330446369086, "grad_norm": 0.4904858360330206, "learning_rate": 4.343765551103859e-06, "loss": 0.0431, "step": 5292 }, { "epoch": 2.3508771929824563, "grad_norm": 0.418865690563144, "learning_rate": 4.3418438326245134e-06, "loss": 0.0264, "step": 5293 }, { "epoch": 2.3513213413280036, "grad_norm": 0.3313326958383252, "learning_rate": 4.33992221307722e-06, "loss": 0.0359, "step": 5294 }, { "epoch": 2.351765489673551, "grad_norm": 0.38342759888274064, "learning_rate": 4.338000692750832e-06, "loss": 0.03, "step": 5295 }, { "epoch": 2.352209638019098, "grad_norm": 0.6016424768870177, "learning_rate": 4.336079271934184e-06, "loss": 0.0432, "step": 5296 }, { "epoch": 2.352653786364646, "grad_norm": 0.5554548071020693, "learning_rate": 4.334157950916098e-06, "loss": 0.0659, "step": 5297 }, { "epoch": 2.3530979347101932, "grad_norm": 0.5014661231015702, "learning_rate": 4.332236729985385e-06, "loss": 0.0343, "step": 5298 }, { "epoch": 2.3535420830557405, "grad_norm": 0.5146747988924121, "learning_rate": 4.330315609430835e-06, "loss": 0.0351, "step": 5299 }, { "epoch": 2.3539862314012883, "grad_norm": 0.3876173155044, "learning_rate": 4.328394589541223e-06, "loss": 0.0282, "step": 5300 }, { "epoch": 2.3544303797468356, "grad_norm": 0.414824658223231, "learning_rate": 4.326473670605315e-06, "loss": 0.0323, "step": 5301 }, { "epoch": 2.354874528092383, "grad_norm": 0.47728404717752354, "learning_rate": 4.324552852911854e-06, "loss": 0.0459, "step": 5302 }, { "epoch": 2.35531867643793, "grad_norm": 0.40142086992053616, "learning_rate": 4.322632136749572e-06, "loss": 0.0329, "step": 5303 }, { "epoch": 2.355762824783478, "grad_norm": 0.4704980131084727, "learning_rate": 4.3207115224071874e-06, "loss": 0.0366, "step": 5304 }, { "epoch": 2.356206973129025, "grad_norm": 0.402777301290612, "learning_rate": 4.318791010173401e-06, "loss": 0.0319, "step": 5305 }, { "epoch": 2.3566511214745725, "grad_norm": 0.41278158938376935, "learning_rate": 4.316870600336896e-06, "loss": 0.0217, "step": 5306 }, { "epoch": 2.3570952698201197, "grad_norm": 0.5124207452989102, "learning_rate": 4.314950293186346e-06, "loss": 0.0398, "step": 5307 }, { "epoch": 2.3575394181656675, "grad_norm": 0.4244779016373727, "learning_rate": 4.3130300890104035e-06, "loss": 0.0263, "step": 5308 }, { "epoch": 2.3579835665112148, "grad_norm": 0.6692076129286184, "learning_rate": 4.311109988097706e-06, "loss": 0.0347, "step": 5309 }, { "epoch": 2.358427714856762, "grad_norm": 0.5345819569804751, "learning_rate": 4.30918999073688e-06, "loss": 0.0406, "step": 5310 }, { "epoch": 2.3588718632023093, "grad_norm": 0.4416731105459865, "learning_rate": 4.307270097216535e-06, "loss": 0.0478, "step": 5311 }, { "epoch": 2.359316011547857, "grad_norm": 0.4789720007194078, "learning_rate": 4.305350307825261e-06, "loss": 0.0347, "step": 5312 }, { "epoch": 2.3597601598934044, "grad_norm": 0.39908333172506805, "learning_rate": 4.303430622851635e-06, "loss": 0.0346, "step": 5313 }, { "epoch": 2.3602043082389517, "grad_norm": 0.49076992885563064, "learning_rate": 4.301511042584219e-06, "loss": 0.035, "step": 5314 }, { "epoch": 2.3606484565844994, "grad_norm": 0.3862322602452697, "learning_rate": 4.29959156731156e-06, "loss": 0.0409, "step": 5315 }, { "epoch": 2.3610926049300467, "grad_norm": 0.4148195535388891, "learning_rate": 4.297672197322186e-06, "loss": 0.0382, "step": 5316 }, { "epoch": 2.361536753275594, "grad_norm": 0.5373449885131869, "learning_rate": 4.29575293290461e-06, "loss": 0.0371, "step": 5317 }, { "epoch": 2.3619809016211413, "grad_norm": 0.41088601906859734, "learning_rate": 4.293833774347333e-06, "loss": 0.0297, "step": 5318 }, { "epoch": 2.362425049966689, "grad_norm": 1.1416110662239514, "learning_rate": 4.291914721938835e-06, "loss": 0.0382, "step": 5319 }, { "epoch": 2.3628691983122363, "grad_norm": 0.4504483504568535, "learning_rate": 4.289995775967581e-06, "loss": 0.0297, "step": 5320 }, { "epoch": 2.3633133466577836, "grad_norm": 0.38320982061969394, "learning_rate": 4.2880769367220234e-06, "loss": 0.024, "step": 5321 }, { "epoch": 2.3637574950033313, "grad_norm": 0.3866521874991146, "learning_rate": 4.2861582044905966e-06, "loss": 0.0264, "step": 5322 }, { "epoch": 2.3642016433488786, "grad_norm": 0.5381622631772952, "learning_rate": 4.284239579561718e-06, "loss": 0.0514, "step": 5323 }, { "epoch": 2.364645791694426, "grad_norm": 0.4285272602850889, "learning_rate": 4.282321062223788e-06, "loss": 0.0365, "step": 5324 }, { "epoch": 2.365089940039973, "grad_norm": 0.5903609551581849, "learning_rate": 4.280402652765194e-06, "loss": 0.0428, "step": 5325 }, { "epoch": 2.365534088385521, "grad_norm": 0.6051857396994949, "learning_rate": 4.278484351474303e-06, "loss": 0.0336, "step": 5326 }, { "epoch": 2.365978236731068, "grad_norm": 0.569315522539356, "learning_rate": 4.2765661586394736e-06, "loss": 0.0334, "step": 5327 }, { "epoch": 2.3664223850766155, "grad_norm": 0.6976731675123818, "learning_rate": 4.2746480745490385e-06, "loss": 0.0367, "step": 5328 }, { "epoch": 2.3668665334221632, "grad_norm": 0.5611945388042203, "learning_rate": 4.272730099491319e-06, "loss": 0.0407, "step": 5329 }, { "epoch": 2.3673106817677105, "grad_norm": 0.49730153273797245, "learning_rate": 4.27081223375462e-06, "loss": 0.038, "step": 5330 }, { "epoch": 2.367754830113258, "grad_norm": 0.5313463154313218, "learning_rate": 4.268894477627229e-06, "loss": 0.0435, "step": 5331 }, { "epoch": 2.368198978458805, "grad_norm": 0.48983566744126533, "learning_rate": 4.2669768313974155e-06, "loss": 0.0373, "step": 5332 }, { "epoch": 2.3686431268043524, "grad_norm": 0.5179274904316602, "learning_rate": 4.265059295353439e-06, "loss": 0.0445, "step": 5333 }, { "epoch": 2.3690872751499, "grad_norm": 0.5588093911942073, "learning_rate": 4.2631418697835335e-06, "loss": 0.0351, "step": 5334 }, { "epoch": 2.3695314234954474, "grad_norm": 0.3586724840114786, "learning_rate": 4.261224554975923e-06, "loss": 0.0214, "step": 5335 }, { "epoch": 2.3699755718409947, "grad_norm": 0.418317991811463, "learning_rate": 4.259307351218812e-06, "loss": 0.0301, "step": 5336 }, { "epoch": 2.3704197201865425, "grad_norm": 0.33540637763336467, "learning_rate": 4.2573902588003844e-06, "loss": 0.023, "step": 5337 }, { "epoch": 2.3708638685320897, "grad_norm": 0.45375734251758604, "learning_rate": 4.2554732780088185e-06, "loss": 0.0394, "step": 5338 }, { "epoch": 2.371308016877637, "grad_norm": 0.5337079104144298, "learning_rate": 4.253556409132267e-06, "loss": 0.0441, "step": 5339 }, { "epoch": 2.3717521652231843, "grad_norm": 0.5739232408785224, "learning_rate": 4.251639652458866e-06, "loss": 0.0417, "step": 5340 }, { "epoch": 2.372196313568732, "grad_norm": 0.6130343197365772, "learning_rate": 4.249723008276737e-06, "loss": 0.0387, "step": 5341 }, { "epoch": 2.3726404619142794, "grad_norm": 0.4104117521503695, "learning_rate": 4.247806476873987e-06, "loss": 0.033, "step": 5342 }, { "epoch": 2.3730846102598266, "grad_norm": 0.48677001689089716, "learning_rate": 4.245890058538697e-06, "loss": 0.0377, "step": 5343 }, { "epoch": 2.3735287586053744, "grad_norm": 0.3714192923602575, "learning_rate": 4.2439737535589455e-06, "loss": 0.0294, "step": 5344 }, { "epoch": 2.3739729069509217, "grad_norm": 0.40375546238165533, "learning_rate": 4.2420575622227786e-06, "loss": 0.0314, "step": 5345 }, { "epoch": 2.374417055296469, "grad_norm": 0.7948951370956706, "learning_rate": 4.240141484818238e-06, "loss": 0.0393, "step": 5346 }, { "epoch": 2.3748612036420162, "grad_norm": 0.7705250912114215, "learning_rate": 4.238225521633339e-06, "loss": 0.0446, "step": 5347 }, { "epoch": 2.375305351987564, "grad_norm": 0.5226003605622348, "learning_rate": 4.2363096729560824e-06, "loss": 0.0397, "step": 5348 }, { "epoch": 2.3757495003331113, "grad_norm": 0.37536659049033705, "learning_rate": 4.234393939074456e-06, "loss": 0.0264, "step": 5349 }, { "epoch": 2.3761936486786586, "grad_norm": 0.35662293585999577, "learning_rate": 4.2324783202764265e-06, "loss": 0.0294, "step": 5350 }, { "epoch": 2.3766377970242063, "grad_norm": 0.5673307864129802, "learning_rate": 4.230562816849944e-06, "loss": 0.0471, "step": 5351 }, { "epoch": 2.3770819453697536, "grad_norm": 0.42418386406735814, "learning_rate": 4.228647429082939e-06, "loss": 0.0306, "step": 5352 }, { "epoch": 2.377526093715301, "grad_norm": 0.5746067334007909, "learning_rate": 4.22673215726333e-06, "loss": 0.031, "step": 5353 }, { "epoch": 2.377970242060848, "grad_norm": 0.3492096810181002, "learning_rate": 4.224817001679011e-06, "loss": 0.021, "step": 5354 }, { "epoch": 2.378414390406396, "grad_norm": 0.5532015823221401, "learning_rate": 4.222901962617867e-06, "loss": 0.0432, "step": 5355 }, { "epoch": 2.378858538751943, "grad_norm": 0.5532405005608227, "learning_rate": 4.220987040367757e-06, "loss": 0.0407, "step": 5356 }, { "epoch": 2.3793026870974905, "grad_norm": 0.35180832498879067, "learning_rate": 4.219072235216529e-06, "loss": 0.028, "step": 5357 }, { "epoch": 2.379746835443038, "grad_norm": 0.7448501299403424, "learning_rate": 4.2171575474520084e-06, "loss": 0.0328, "step": 5358 }, { "epoch": 2.3801909837885855, "grad_norm": 0.4554992316854567, "learning_rate": 4.215242977362009e-06, "loss": 0.0301, "step": 5359 }, { "epoch": 2.380635132134133, "grad_norm": 0.5712009195638152, "learning_rate": 4.213328525234317e-06, "loss": 0.0344, "step": 5360 }, { "epoch": 2.38107928047968, "grad_norm": 0.46746160638085626, "learning_rate": 4.211414191356714e-06, "loss": 0.0387, "step": 5361 }, { "epoch": 2.3815234288252274, "grad_norm": 0.4318040553120997, "learning_rate": 4.209499976016953e-06, "loss": 0.0359, "step": 5362 }, { "epoch": 2.381967577170775, "grad_norm": 0.4278461319811184, "learning_rate": 4.2075858795027745e-06, "loss": 0.0368, "step": 5363 }, { "epoch": 2.3824117255163224, "grad_norm": 0.32663597876713696, "learning_rate": 4.205671902101899e-06, "loss": 0.0256, "step": 5364 }, { "epoch": 2.3828558738618697, "grad_norm": 0.389353159520307, "learning_rate": 4.203758044102029e-06, "loss": 0.0259, "step": 5365 }, { "epoch": 2.3833000222074174, "grad_norm": 0.5653053154481631, "learning_rate": 4.2018443057908495e-06, "loss": 0.047, "step": 5366 }, { "epoch": 2.3837441705529647, "grad_norm": 0.34982509990328564, "learning_rate": 4.199930687456031e-06, "loss": 0.0267, "step": 5367 }, { "epoch": 2.384188318898512, "grad_norm": 0.41740396260225204, "learning_rate": 4.198017189385221e-06, "loss": 0.0293, "step": 5368 }, { "epoch": 2.3846324672440593, "grad_norm": 0.38322501636096945, "learning_rate": 4.1961038118660504e-06, "loss": 0.0329, "step": 5369 }, { "epoch": 2.385076615589607, "grad_norm": 0.45223291597674525, "learning_rate": 4.194190555186133e-06, "loss": 0.0347, "step": 5370 }, { "epoch": 2.3855207639351543, "grad_norm": 0.5113109910932524, "learning_rate": 4.1922774196330614e-06, "loss": 0.037, "step": 5371 }, { "epoch": 2.3859649122807016, "grad_norm": 0.4161439641942477, "learning_rate": 4.190364405494417e-06, "loss": 0.032, "step": 5372 }, { "epoch": 2.3864090606262494, "grad_norm": 0.3241401849079966, "learning_rate": 4.1884515130577545e-06, "loss": 0.0277, "step": 5373 }, { "epoch": 2.3868532089717966, "grad_norm": 0.41604888730983397, "learning_rate": 4.1865387426106165e-06, "loss": 0.0342, "step": 5374 }, { "epoch": 2.387297357317344, "grad_norm": 0.4441137282989938, "learning_rate": 4.184626094440524e-06, "loss": 0.0284, "step": 5375 }, { "epoch": 2.3877415056628912, "grad_norm": 0.5249221470565395, "learning_rate": 4.182713568834979e-06, "loss": 0.0363, "step": 5376 }, { "epoch": 2.388185654008439, "grad_norm": 0.5338153166350271, "learning_rate": 4.180801166081466e-06, "loss": 0.0505, "step": 5377 }, { "epoch": 2.3886298023539863, "grad_norm": 0.6473388727476899, "learning_rate": 4.178888886467457e-06, "loss": 0.0426, "step": 5378 }, { "epoch": 2.3890739506995335, "grad_norm": 0.362371921356082, "learning_rate": 4.176976730280396e-06, "loss": 0.0367, "step": 5379 }, { "epoch": 2.3895180990450813, "grad_norm": 0.401397401331949, "learning_rate": 4.175064697807712e-06, "loss": 0.0315, "step": 5380 }, { "epoch": 2.3899622473906286, "grad_norm": 0.4918164976839154, "learning_rate": 4.173152789336818e-06, "loss": 0.0377, "step": 5381 }, { "epoch": 2.390406395736176, "grad_norm": 0.4428239856696088, "learning_rate": 4.171241005155105e-06, "loss": 0.0424, "step": 5382 }, { "epoch": 2.390850544081723, "grad_norm": 0.5071830732288508, "learning_rate": 4.169329345549945e-06, "loss": 0.0343, "step": 5383 }, { "epoch": 2.391294692427271, "grad_norm": 0.5154632693129214, "learning_rate": 4.167417810808698e-06, "loss": 0.0342, "step": 5384 }, { "epoch": 2.391738840772818, "grad_norm": 0.6827121614415474, "learning_rate": 4.165506401218697e-06, "loss": 0.0537, "step": 5385 }, { "epoch": 2.3921829891183655, "grad_norm": 0.5360898324290861, "learning_rate": 4.163595117067258e-06, "loss": 0.0415, "step": 5386 }, { "epoch": 2.3926271374639128, "grad_norm": 0.4268060401836661, "learning_rate": 4.1616839586416825e-06, "loss": 0.0295, "step": 5387 }, { "epoch": 2.3930712858094605, "grad_norm": 0.6427383929847603, "learning_rate": 4.159772926229247e-06, "loss": 0.0312, "step": 5388 }, { "epoch": 2.393515434155008, "grad_norm": 0.5325554425749491, "learning_rate": 4.1578620201172144e-06, "loss": 0.0354, "step": 5389 }, { "epoch": 2.393959582500555, "grad_norm": 0.43938556377430205, "learning_rate": 4.155951240592825e-06, "loss": 0.0372, "step": 5390 }, { "epoch": 2.3944037308461024, "grad_norm": 0.5152442233098135, "learning_rate": 4.154040587943303e-06, "loss": 0.0402, "step": 5391 }, { "epoch": 2.39484787919165, "grad_norm": 0.42703323082019723, "learning_rate": 4.1521300624558516e-06, "loss": 0.0373, "step": 5392 }, { "epoch": 2.3952920275371974, "grad_norm": 0.402722970762066, "learning_rate": 4.150219664417653e-06, "loss": 0.0311, "step": 5393 }, { "epoch": 2.3957361758827447, "grad_norm": 0.6214432623647976, "learning_rate": 4.148309394115872e-06, "loss": 0.0364, "step": 5394 }, { "epoch": 2.3961803242282924, "grad_norm": 0.44276990850604514, "learning_rate": 4.14639925183766e-06, "loss": 0.0341, "step": 5395 }, { "epoch": 2.3966244725738397, "grad_norm": 0.4294877304529703, "learning_rate": 4.144489237870141e-06, "loss": 0.0298, "step": 5396 }, { "epoch": 2.397068620919387, "grad_norm": 0.46553831317336336, "learning_rate": 4.142579352500421e-06, "loss": 0.0318, "step": 5397 }, { "epoch": 2.3975127692649343, "grad_norm": 0.4319310212143851, "learning_rate": 4.14066959601559e-06, "loss": 0.0412, "step": 5398 }, { "epoch": 2.397956917610482, "grad_norm": 0.6999580389169097, "learning_rate": 4.138759968702716e-06, "loss": 0.0496, "step": 5399 }, { "epoch": 2.3984010659560293, "grad_norm": 0.6261420455213397, "learning_rate": 4.1368504708488476e-06, "loss": 0.0387, "step": 5400 }, { "epoch": 2.3988452143015766, "grad_norm": 0.352239243573041, "learning_rate": 4.134941102741016e-06, "loss": 0.0286, "step": 5401 }, { "epoch": 2.3992893626471243, "grad_norm": 0.4855267366461917, "learning_rate": 4.133031864666232e-06, "loss": 0.042, "step": 5402 }, { "epoch": 2.3997335109926716, "grad_norm": 0.48074181838542007, "learning_rate": 4.1311227569114855e-06, "loss": 0.0516, "step": 5403 }, { "epoch": 2.400177659338219, "grad_norm": 0.4770686273769997, "learning_rate": 4.12921377976375e-06, "loss": 0.0378, "step": 5404 }, { "epoch": 2.400621807683766, "grad_norm": 0.5247613321525735, "learning_rate": 4.127304933509972e-06, "loss": 0.0414, "step": 5405 }, { "epoch": 2.401065956029314, "grad_norm": 0.4623711119275284, "learning_rate": 4.125396218437089e-06, "loss": 0.0589, "step": 5406 }, { "epoch": 2.4015101043748612, "grad_norm": 0.49698176360744994, "learning_rate": 4.123487634832011e-06, "loss": 0.0456, "step": 5407 }, { "epoch": 2.4019542527204085, "grad_norm": 0.4203922139028562, "learning_rate": 4.121579182981632e-06, "loss": 0.0312, "step": 5408 }, { "epoch": 2.4023984010659563, "grad_norm": 0.3669217976875289, "learning_rate": 4.119670863172824e-06, "loss": 0.0289, "step": 5409 }, { "epoch": 2.4028425494115035, "grad_norm": 0.38804870930885565, "learning_rate": 4.117762675692437e-06, "loss": 0.0251, "step": 5410 }, { "epoch": 2.403286697757051, "grad_norm": 0.45435459736787537, "learning_rate": 4.115854620827306e-06, "loss": 0.0257, "step": 5411 }, { "epoch": 2.403730846102598, "grad_norm": 0.46297467981750196, "learning_rate": 4.1139466988642475e-06, "loss": 0.0433, "step": 5412 }, { "epoch": 2.404174994448146, "grad_norm": 0.4824849195132302, "learning_rate": 4.11203891009005e-06, "loss": 0.0484, "step": 5413 }, { "epoch": 2.404619142793693, "grad_norm": 0.45065972785277525, "learning_rate": 4.110131254791489e-06, "loss": 0.028, "step": 5414 }, { "epoch": 2.4050632911392404, "grad_norm": 0.5431313239180846, "learning_rate": 4.108223733255316e-06, "loss": 0.0442, "step": 5415 }, { "epoch": 2.4055074394847877, "grad_norm": 0.46821071422858296, "learning_rate": 4.106316345768265e-06, "loss": 0.027, "step": 5416 }, { "epoch": 2.4059515878303355, "grad_norm": 0.4155972801004647, "learning_rate": 4.104409092617047e-06, "loss": 0.033, "step": 5417 }, { "epoch": 2.4063957361758828, "grad_norm": 0.6893775471251186, "learning_rate": 4.1025019740883556e-06, "loss": 0.0504, "step": 5418 }, { "epoch": 2.40683988452143, "grad_norm": 0.4083036792334355, "learning_rate": 4.100594990468865e-06, "loss": 0.0333, "step": 5419 }, { "epoch": 2.4072840328669773, "grad_norm": 0.4849564201026396, "learning_rate": 4.0986881420452254e-06, "loss": 0.0338, "step": 5420 }, { "epoch": 2.407728181212525, "grad_norm": 0.4268462015661056, "learning_rate": 4.096781429104068e-06, "loss": 0.0341, "step": 5421 }, { "epoch": 2.4081723295580724, "grad_norm": 0.4834546635420465, "learning_rate": 4.094874851932002e-06, "loss": 0.035, "step": 5422 }, { "epoch": 2.4086164779036197, "grad_norm": 0.46585094752698974, "learning_rate": 4.092968410815625e-06, "loss": 0.0288, "step": 5423 }, { "epoch": 2.4090606262491674, "grad_norm": 0.6920655509720604, "learning_rate": 4.091062106041504e-06, "loss": 0.0296, "step": 5424 }, { "epoch": 2.4095047745947147, "grad_norm": 0.5102739827979796, "learning_rate": 4.089155937896187e-06, "loss": 0.0316, "step": 5425 }, { "epoch": 2.409948922940262, "grad_norm": 0.36415630770650137, "learning_rate": 4.087249906666206e-06, "loss": 0.0296, "step": 5426 }, { "epoch": 2.4103930712858093, "grad_norm": 0.3263196544116076, "learning_rate": 4.085344012638067e-06, "loss": 0.0273, "step": 5427 }, { "epoch": 2.410837219631357, "grad_norm": 0.5467141941835238, "learning_rate": 4.083438256098261e-06, "loss": 0.0368, "step": 5428 }, { "epoch": 2.4112813679769043, "grad_norm": 0.5887360709523032, "learning_rate": 4.081532637333255e-06, "loss": 0.0389, "step": 5429 }, { "epoch": 2.4117255163224516, "grad_norm": 0.4074807509483992, "learning_rate": 4.079627156629497e-06, "loss": 0.0239, "step": 5430 }, { "epoch": 2.4121696646679993, "grad_norm": 0.3424789553984037, "learning_rate": 4.07772181427341e-06, "loss": 0.0301, "step": 5431 }, { "epoch": 2.4126138130135466, "grad_norm": 0.607202231589749, "learning_rate": 4.075816610551402e-06, "loss": 0.0496, "step": 5432 }, { "epoch": 2.413057961359094, "grad_norm": 0.47864092786503787, "learning_rate": 4.073911545749857e-06, "loss": 0.0316, "step": 5433 }, { "epoch": 2.413502109704641, "grad_norm": 0.5034557440074656, "learning_rate": 4.072006620155136e-06, "loss": 0.026, "step": 5434 }, { "epoch": 2.413946258050189, "grad_norm": 0.36403294815194714, "learning_rate": 4.070101834053585e-06, "loss": 0.0304, "step": 5435 }, { "epoch": 2.414390406395736, "grad_norm": 0.4715313044352068, "learning_rate": 4.068197187731526e-06, "loss": 0.0369, "step": 5436 }, { "epoch": 2.4148345547412835, "grad_norm": 0.27768669661494494, "learning_rate": 4.066292681475257e-06, "loss": 0.0174, "step": 5437 }, { "epoch": 2.4152787030868312, "grad_norm": 0.6691350505746541, "learning_rate": 4.064388315571059e-06, "loss": 0.0318, "step": 5438 }, { "epoch": 2.4157228514323785, "grad_norm": 0.4180829624879483, "learning_rate": 4.062484090305191e-06, "loss": 0.0416, "step": 5439 }, { "epoch": 2.416166999777926, "grad_norm": 0.3034927493418548, "learning_rate": 4.060580005963888e-06, "loss": 0.0249, "step": 5440 }, { "epoch": 2.416611148123473, "grad_norm": 0.42896525131188157, "learning_rate": 4.05867606283337e-06, "loss": 0.0345, "step": 5441 }, { "epoch": 2.417055296469021, "grad_norm": 0.356634005220255, "learning_rate": 4.0567722611998285e-06, "loss": 0.0244, "step": 5442 }, { "epoch": 2.417499444814568, "grad_norm": 0.49362452123065936, "learning_rate": 4.054868601349441e-06, "loss": 0.0349, "step": 5443 }, { "epoch": 2.4179435931601154, "grad_norm": 0.3476206524561192, "learning_rate": 4.052965083568356e-06, "loss": 0.0262, "step": 5444 }, { "epoch": 2.4183877415056627, "grad_norm": 0.4046428177539252, "learning_rate": 4.051061708142705e-06, "loss": 0.0308, "step": 5445 }, { "epoch": 2.4188318898512104, "grad_norm": 0.6886851057569472, "learning_rate": 4.0491584753586e-06, "loss": 0.0489, "step": 5446 }, { "epoch": 2.4192760381967577, "grad_norm": 0.4422724381498167, "learning_rate": 4.047255385502129e-06, "loss": 0.0338, "step": 5447 }, { "epoch": 2.419720186542305, "grad_norm": 0.35168629623661846, "learning_rate": 4.045352438859359e-06, "loss": 0.0202, "step": 5448 }, { "epoch": 2.4201643348878523, "grad_norm": 0.449273667282489, "learning_rate": 4.043449635716332e-06, "loss": 0.0396, "step": 5449 }, { "epoch": 2.4206084832334, "grad_norm": 0.5131783172439474, "learning_rate": 4.0415469763590745e-06, "loss": 0.0416, "step": 5450 }, { "epoch": 2.4210526315789473, "grad_norm": 0.46807114802560923, "learning_rate": 4.0396444610735865e-06, "loss": 0.0439, "step": 5451 }, { "epoch": 2.4214967799244946, "grad_norm": 0.6426317590879459, "learning_rate": 4.037742090145851e-06, "loss": 0.05, "step": 5452 }, { "epoch": 2.4219409282700424, "grad_norm": 0.5241231307427713, "learning_rate": 4.0358398638618245e-06, "loss": 0.0279, "step": 5453 }, { "epoch": 2.4223850766155897, "grad_norm": 0.3603709645668666, "learning_rate": 4.033937782507445e-06, "loss": 0.0323, "step": 5454 }, { "epoch": 2.422829224961137, "grad_norm": 0.4940232401715231, "learning_rate": 4.032035846368627e-06, "loss": 0.0316, "step": 5455 }, { "epoch": 2.4232733733066842, "grad_norm": 0.4547687513915308, "learning_rate": 4.030134055731266e-06, "loss": 0.0398, "step": 5456 }, { "epoch": 2.423717521652232, "grad_norm": 0.42013634054715737, "learning_rate": 4.028232410881228e-06, "loss": 0.0218, "step": 5457 }, { "epoch": 2.4241616699977793, "grad_norm": 0.44875489093858906, "learning_rate": 4.026330912104369e-06, "loss": 0.0447, "step": 5458 }, { "epoch": 2.4246058183433266, "grad_norm": 0.827939430580337, "learning_rate": 4.024429559686513e-06, "loss": 0.0678, "step": 5459 }, { "epoch": 2.4250499666888743, "grad_norm": 0.7485263459812835, "learning_rate": 4.022528353913466e-06, "loss": 0.0442, "step": 5460 }, { "epoch": 2.4254941150344216, "grad_norm": 0.3550612167644366, "learning_rate": 4.020627295071012e-06, "loss": 0.0334, "step": 5461 }, { "epoch": 2.425938263379969, "grad_norm": 0.5079952410434588, "learning_rate": 4.018726383444911e-06, "loss": 0.0424, "step": 5462 }, { "epoch": 2.426382411725516, "grad_norm": 0.3588389906173018, "learning_rate": 4.016825619320904e-06, "loss": 0.0256, "step": 5463 }, { "epoch": 2.426826560071064, "grad_norm": 0.37557127358260195, "learning_rate": 4.014925002984708e-06, "loss": 0.027, "step": 5464 }, { "epoch": 2.427270708416611, "grad_norm": 0.3878661608771635, "learning_rate": 4.013024534722018e-06, "loss": 0.0331, "step": 5465 }, { "epoch": 2.4277148567621585, "grad_norm": 0.5433675019773777, "learning_rate": 4.011124214818506e-06, "loss": 0.032, "step": 5466 }, { "epoch": 2.428159005107706, "grad_norm": 0.3899751784915578, "learning_rate": 4.0092240435598225e-06, "loss": 0.0356, "step": 5467 }, { "epoch": 2.4286031534532535, "grad_norm": 0.4327878275713323, "learning_rate": 4.007324021231594e-06, "loss": 0.0319, "step": 5468 }, { "epoch": 2.429047301798801, "grad_norm": 0.49698866083093113, "learning_rate": 4.00542414811943e-06, "loss": 0.0305, "step": 5469 }, { "epoch": 2.429491450144348, "grad_norm": 0.4026428626742401, "learning_rate": 4.00352442450891e-06, "loss": 0.0348, "step": 5470 }, { "epoch": 2.4299355984898954, "grad_norm": 0.48636219393391805, "learning_rate": 4.001624850685598e-06, "loss": 0.0346, "step": 5471 }, { "epoch": 2.430379746835443, "grad_norm": 0.4112453328110524, "learning_rate": 3.999725426935029e-06, "loss": 0.0428, "step": 5472 }, { "epoch": 2.4308238951809904, "grad_norm": 0.5337831095831886, "learning_rate": 3.99782615354272e-06, "loss": 0.0442, "step": 5473 }, { "epoch": 2.4312680435265377, "grad_norm": 0.37305930397141557, "learning_rate": 3.995927030794163e-06, "loss": 0.0357, "step": 5474 }, { "epoch": 2.4317121918720854, "grad_norm": 0.4326214227306239, "learning_rate": 3.994028058974832e-06, "loss": 0.0294, "step": 5475 }, { "epoch": 2.4321563402176327, "grad_norm": 0.35769209966498045, "learning_rate": 3.992129238370171e-06, "loss": 0.033, "step": 5476 }, { "epoch": 2.43260048856318, "grad_norm": 0.5540164347858187, "learning_rate": 3.9902305692656056e-06, "loss": 0.0396, "step": 5477 }, { "epoch": 2.4330446369087273, "grad_norm": 0.6878308326999009, "learning_rate": 3.98833205194654e-06, "loss": 0.0291, "step": 5478 }, { "epoch": 2.433488785254275, "grad_norm": 0.6656069204526499, "learning_rate": 3.98643368669835e-06, "loss": 0.0377, "step": 5479 }, { "epoch": 2.4339329335998223, "grad_norm": 0.36440064361824326, "learning_rate": 3.984535473806395e-06, "loss": 0.031, "step": 5480 }, { "epoch": 2.4343770819453696, "grad_norm": 0.46753874608394186, "learning_rate": 3.98263741355601e-06, "loss": 0.0296, "step": 5481 }, { "epoch": 2.4348212302909173, "grad_norm": 0.41167122039004567, "learning_rate": 3.980739506232503e-06, "loss": 0.0308, "step": 5482 }, { "epoch": 2.4352653786364646, "grad_norm": 0.4469078274911405, "learning_rate": 3.978841752121161e-06, "loss": 0.0355, "step": 5483 }, { "epoch": 2.435709526982012, "grad_norm": 0.39826295605317574, "learning_rate": 3.976944151507251e-06, "loss": 0.0288, "step": 5484 }, { "epoch": 2.436153675327559, "grad_norm": 0.4535357297321569, "learning_rate": 3.975046704676014e-06, "loss": 0.0286, "step": 5485 }, { "epoch": 2.436597823673107, "grad_norm": 0.4396407655662091, "learning_rate": 3.973149411912668e-06, "loss": 0.0305, "step": 5486 }, { "epoch": 2.4370419720186542, "grad_norm": 0.3507364407629326, "learning_rate": 3.971252273502407e-06, "loss": 0.0248, "step": 5487 }, { "epoch": 2.4374861203642015, "grad_norm": 0.3450849459589068, "learning_rate": 3.969355289730407e-06, "loss": 0.0206, "step": 5488 }, { "epoch": 2.4379302687097493, "grad_norm": 0.503140049702594, "learning_rate": 3.967458460881815e-06, "loss": 0.04, "step": 5489 }, { "epoch": 2.4383744170552966, "grad_norm": 0.5640472250485181, "learning_rate": 3.965561787241754e-06, "loss": 0.0404, "step": 5490 }, { "epoch": 2.438818565400844, "grad_norm": 0.6056306520006238, "learning_rate": 3.963665269095328e-06, "loss": 0.0404, "step": 5491 }, { "epoch": 2.439262713746391, "grad_norm": 0.4258100611472551, "learning_rate": 3.961768906727618e-06, "loss": 0.028, "step": 5492 }, { "epoch": 2.439706862091939, "grad_norm": 0.38264655994704755, "learning_rate": 3.959872700423678e-06, "loss": 0.0256, "step": 5493 }, { "epoch": 2.440151010437486, "grad_norm": 0.389861248826636, "learning_rate": 3.957976650468539e-06, "loss": 0.0333, "step": 5494 }, { "epoch": 2.4405951587830335, "grad_norm": 0.6710261752550767, "learning_rate": 3.956080757147211e-06, "loss": 0.0288, "step": 5495 }, { "epoch": 2.441039307128581, "grad_norm": 0.533539192209703, "learning_rate": 3.9541850207446754e-06, "loss": 0.029, "step": 5496 }, { "epoch": 2.4414834554741285, "grad_norm": 0.7167183091153162, "learning_rate": 3.9522894415459e-06, "loss": 0.0473, "step": 5497 }, { "epoch": 2.4419276038196758, "grad_norm": 0.322074964494659, "learning_rate": 3.950394019835817e-06, "loss": 0.0258, "step": 5498 }, { "epoch": 2.442371752165223, "grad_norm": 0.3900552673739537, "learning_rate": 3.948498755899344e-06, "loss": 0.0287, "step": 5499 }, { "epoch": 2.4428159005107704, "grad_norm": 0.4466770793345454, "learning_rate": 3.94660365002137e-06, "loss": 0.0292, "step": 5500 }, { "epoch": 2.443260048856318, "grad_norm": 0.5418862551373604, "learning_rate": 3.94470870248676e-06, "loss": 0.0434, "step": 5501 }, { "epoch": 2.4437041972018654, "grad_norm": 0.5170924051250978, "learning_rate": 3.942813913580358e-06, "loss": 0.044, "step": 5502 }, { "epoch": 2.4441483455474127, "grad_norm": 0.5438284418911795, "learning_rate": 3.940919283586985e-06, "loss": 0.0448, "step": 5503 }, { "epoch": 2.4445924938929604, "grad_norm": 0.5198158479237611, "learning_rate": 3.9390248127914325e-06, "loss": 0.0528, "step": 5504 }, { "epoch": 2.4450366422385077, "grad_norm": 0.520564892893801, "learning_rate": 3.937130501478475e-06, "loss": 0.0488, "step": 5505 }, { "epoch": 2.445480790584055, "grad_norm": 0.46307199806507215, "learning_rate": 3.935236349932858e-06, "loss": 0.0334, "step": 5506 }, { "epoch": 2.4459249389296023, "grad_norm": 0.5703039507859231, "learning_rate": 3.933342358439304e-06, "loss": 0.0388, "step": 5507 }, { "epoch": 2.44636908727515, "grad_norm": 0.480094036827002, "learning_rate": 3.931448527282512e-06, "loss": 0.0383, "step": 5508 }, { "epoch": 2.4468132356206973, "grad_norm": 0.5416338368617786, "learning_rate": 3.9295548567471595e-06, "loss": 0.044, "step": 5509 }, { "epoch": 2.4472573839662446, "grad_norm": 0.3789299998421564, "learning_rate": 3.927661347117896e-06, "loss": 0.0308, "step": 5510 }, { "epoch": 2.4477015323117923, "grad_norm": 0.42767639025063464, "learning_rate": 3.925767998679347e-06, "loss": 0.032, "step": 5511 }, { "epoch": 2.4481456806573396, "grad_norm": 0.3723173022056738, "learning_rate": 3.923874811716116e-06, "loss": 0.0299, "step": 5512 }, { "epoch": 2.448589829002887, "grad_norm": 0.647876325917259, "learning_rate": 3.92198178651278e-06, "loss": 0.0334, "step": 5513 }, { "epoch": 2.449033977348434, "grad_norm": 0.3651110551254141, "learning_rate": 3.920088923353895e-06, "loss": 0.0404, "step": 5514 }, { "epoch": 2.449478125693982, "grad_norm": 0.43323931222905593, "learning_rate": 3.918196222523989e-06, "loss": 0.0458, "step": 5515 }, { "epoch": 2.4499222740395292, "grad_norm": 0.4066604237779864, "learning_rate": 3.916303684307568e-06, "loss": 0.0291, "step": 5516 }, { "epoch": 2.4503664223850765, "grad_norm": 0.4166876666433253, "learning_rate": 3.914411308989113e-06, "loss": 0.027, "step": 5517 }, { "epoch": 2.4508105707306242, "grad_norm": 0.45791540033658323, "learning_rate": 3.9125190968530766e-06, "loss": 0.0271, "step": 5518 }, { "epoch": 2.4512547190761715, "grad_norm": 0.4364649579290464, "learning_rate": 3.910627048183893e-06, "loss": 0.0325, "step": 5519 }, { "epoch": 2.451698867421719, "grad_norm": 0.34758907020261137, "learning_rate": 3.908735163265971e-06, "loss": 0.0225, "step": 5520 }, { "epoch": 2.452143015767266, "grad_norm": 0.386095366133868, "learning_rate": 3.906843442383691e-06, "loss": 0.0253, "step": 5521 }, { "epoch": 2.452587164112814, "grad_norm": 0.5285696022742041, "learning_rate": 3.90495188582141e-06, "loss": 0.0288, "step": 5522 }, { "epoch": 2.453031312458361, "grad_norm": 0.5685349400918212, "learning_rate": 3.903060493863463e-06, "loss": 0.0384, "step": 5523 }, { "epoch": 2.4534754608039084, "grad_norm": 0.33536529902813805, "learning_rate": 3.901169266794158e-06, "loss": 0.0256, "step": 5524 }, { "epoch": 2.4539196091494557, "grad_norm": 0.3367209757616414, "learning_rate": 3.899278204897777e-06, "loss": 0.0308, "step": 5525 }, { "epoch": 2.4543637574950035, "grad_norm": 0.35301257937407804, "learning_rate": 3.89738730845858e-06, "loss": 0.0326, "step": 5526 }, { "epoch": 2.4548079058405508, "grad_norm": 0.44674618292196533, "learning_rate": 3.895496577760802e-06, "loss": 0.0378, "step": 5527 }, { "epoch": 2.455252054186098, "grad_norm": 0.33090638899042374, "learning_rate": 3.893606013088649e-06, "loss": 0.0287, "step": 5528 }, { "epoch": 2.4556962025316453, "grad_norm": 0.6629654415706786, "learning_rate": 3.8917156147263075e-06, "loss": 0.0361, "step": 5529 }, { "epoch": 2.456140350877193, "grad_norm": 0.4137805309428058, "learning_rate": 3.889825382957935e-06, "loss": 0.0381, "step": 5530 }, { "epoch": 2.4565844992227404, "grad_norm": 0.48510449046292115, "learning_rate": 3.887935318067665e-06, "loss": 0.0369, "step": 5531 }, { "epoch": 2.4570286475682876, "grad_norm": 0.4301148483997256, "learning_rate": 3.886045420339608e-06, "loss": 0.0288, "step": 5532 }, { "epoch": 2.4574727959138354, "grad_norm": 0.42246466717722975, "learning_rate": 3.884155690057849e-06, "loss": 0.0301, "step": 5533 }, { "epoch": 2.4579169442593827, "grad_norm": 0.4345894945232888, "learning_rate": 3.882266127506444e-06, "loss": 0.0327, "step": 5534 }, { "epoch": 2.45836109260493, "grad_norm": 0.47014262773033944, "learning_rate": 3.880376732969427e-06, "loss": 0.0406, "step": 5535 }, { "epoch": 2.4588052409504773, "grad_norm": 0.5529245797017762, "learning_rate": 3.8784875067308035e-06, "loss": 0.0428, "step": 5536 }, { "epoch": 2.459249389296025, "grad_norm": 0.5853637583441337, "learning_rate": 3.876598449074561e-06, "loss": 0.0484, "step": 5537 }, { "epoch": 2.4596935376415723, "grad_norm": 0.4308021015437979, "learning_rate": 3.874709560284655e-06, "loss": 0.0331, "step": 5538 }, { "epoch": 2.4601376859871196, "grad_norm": 0.41105828222307955, "learning_rate": 3.872820840645017e-06, "loss": 0.0291, "step": 5539 }, { "epoch": 2.4605818343326673, "grad_norm": 0.4669061473028007, "learning_rate": 3.8709322904395556e-06, "loss": 0.0275, "step": 5540 }, { "epoch": 2.4610259826782146, "grad_norm": 0.4419324556460313, "learning_rate": 3.869043909952149e-06, "loss": 0.046, "step": 5541 }, { "epoch": 2.461470131023762, "grad_norm": 0.4298170656814399, "learning_rate": 3.867155699466653e-06, "loss": 0.0376, "step": 5542 }, { "epoch": 2.461914279369309, "grad_norm": 0.9256029510191909, "learning_rate": 3.865267659266901e-06, "loss": 0.0365, "step": 5543 }, { "epoch": 2.462358427714857, "grad_norm": 0.4535797528392373, "learning_rate": 3.863379789636696e-06, "loss": 0.0302, "step": 5544 }, { "epoch": 2.462802576060404, "grad_norm": 0.5482760330729667, "learning_rate": 3.861492090859816e-06, "loss": 0.0365, "step": 5545 }, { "epoch": 2.4632467244059515, "grad_norm": 0.36157460865248275, "learning_rate": 3.8596045632200126e-06, "loss": 0.0266, "step": 5546 }, { "epoch": 2.4636908727514992, "grad_norm": 0.4873814379100533, "learning_rate": 3.857717207001017e-06, "loss": 0.037, "step": 5547 }, { "epoch": 2.4641350210970465, "grad_norm": 0.7370873255745318, "learning_rate": 3.855830022486528e-06, "loss": 0.0415, "step": 5548 }, { "epoch": 2.464579169442594, "grad_norm": 0.493249372834125, "learning_rate": 3.853943009960225e-06, "loss": 0.0386, "step": 5549 }, { "epoch": 2.465023317788141, "grad_norm": 0.6919203726918821, "learning_rate": 3.852056169705753e-06, "loss": 0.0389, "step": 5550 }, { "epoch": 2.465467466133689, "grad_norm": 0.4116990565345076, "learning_rate": 3.850169502006741e-06, "loss": 0.0302, "step": 5551 }, { "epoch": 2.465911614479236, "grad_norm": 0.34358727354238183, "learning_rate": 3.848283007146784e-06, "loss": 0.0325, "step": 5552 }, { "epoch": 2.4663557628247834, "grad_norm": 0.35614401203480484, "learning_rate": 3.846396685409455e-06, "loss": 0.0322, "step": 5553 }, { "epoch": 2.4667999111703307, "grad_norm": 0.25982925770375787, "learning_rate": 3.8445105370782995e-06, "loss": 0.0262, "step": 5554 }, { "epoch": 2.4672440595158784, "grad_norm": 0.3510865325198374, "learning_rate": 3.842624562436841e-06, "loss": 0.0386, "step": 5555 }, { "epoch": 2.4676882078614257, "grad_norm": 0.4049390820424808, "learning_rate": 3.8407387617685696e-06, "loss": 0.0247, "step": 5556 }, { "epoch": 2.468132356206973, "grad_norm": 0.42859540412907965, "learning_rate": 3.838853135356956e-06, "loss": 0.0368, "step": 5557 }, { "epoch": 2.4685765045525203, "grad_norm": 0.35853923916250774, "learning_rate": 3.836967683485441e-06, "loss": 0.0329, "step": 5558 }, { "epoch": 2.469020652898068, "grad_norm": 0.4278309028616722, "learning_rate": 3.835082406437437e-06, "loss": 0.0362, "step": 5559 }, { "epoch": 2.4694648012436153, "grad_norm": 1.0923622589898494, "learning_rate": 3.833197304496336e-06, "loss": 0.044, "step": 5560 }, { "epoch": 2.4699089495891626, "grad_norm": 0.37654012560879, "learning_rate": 3.8313123779455035e-06, "loss": 0.0262, "step": 5561 }, { "epoch": 2.4703530979347104, "grad_norm": 0.41898160595248646, "learning_rate": 3.829427627068272e-06, "loss": 0.0344, "step": 5562 }, { "epoch": 2.4707972462802577, "grad_norm": 0.3844261706628325, "learning_rate": 3.827543052147952e-06, "loss": 0.0273, "step": 5563 }, { "epoch": 2.471241394625805, "grad_norm": 0.4489319056107684, "learning_rate": 3.8256586534678285e-06, "loss": 0.0318, "step": 5564 }, { "epoch": 2.4716855429713522, "grad_norm": 0.6833288857085646, "learning_rate": 3.8237744313111565e-06, "loss": 0.0404, "step": 5565 }, { "epoch": 2.4721296913169, "grad_norm": 0.430664114118146, "learning_rate": 3.82189038596117e-06, "loss": 0.0352, "step": 5566 }, { "epoch": 2.4725738396624473, "grad_norm": 0.4030254224585019, "learning_rate": 3.820006517701069e-06, "loss": 0.0321, "step": 5567 }, { "epoch": 2.4730179880079945, "grad_norm": 0.5685844819493571, "learning_rate": 3.8181228268140354e-06, "loss": 0.0507, "step": 5568 }, { "epoch": 2.4734621363535423, "grad_norm": 0.44264389289592554, "learning_rate": 3.816239313583217e-06, "loss": 0.0376, "step": 5569 }, { "epoch": 2.4739062846990896, "grad_norm": 0.3377904339220037, "learning_rate": 3.814355978291736e-06, "loss": 0.0327, "step": 5570 }, { "epoch": 2.474350433044637, "grad_norm": 0.630683589245786, "learning_rate": 3.8124728212226938e-06, "loss": 0.0471, "step": 5571 }, { "epoch": 2.474794581390184, "grad_norm": 0.42602963126310434, "learning_rate": 3.810589842659159e-06, "loss": 0.0297, "step": 5572 }, { "epoch": 2.475238729735732, "grad_norm": 0.4633136103256966, "learning_rate": 3.808707042884176e-06, "loss": 0.028, "step": 5573 }, { "epoch": 2.475682878081279, "grad_norm": 0.5782431299481687, "learning_rate": 3.8068244221807606e-06, "loss": 0.0383, "step": 5574 }, { "epoch": 2.4761270264268265, "grad_norm": 0.5372901069709559, "learning_rate": 3.8049419808319033e-06, "loss": 0.036, "step": 5575 }, { "epoch": 2.476571174772374, "grad_norm": 0.2611922701250809, "learning_rate": 3.8030597191205643e-06, "loss": 0.0217, "step": 5576 }, { "epoch": 2.4770153231179215, "grad_norm": 0.36585043972090653, "learning_rate": 3.8011776373296837e-06, "loss": 0.0286, "step": 5577 }, { "epoch": 2.477459471463469, "grad_norm": 0.5157568238144473, "learning_rate": 3.79929573574217e-06, "loss": 0.0343, "step": 5578 }, { "epoch": 2.477903619809016, "grad_norm": 0.4610943022253674, "learning_rate": 3.797414014640903e-06, "loss": 0.0318, "step": 5579 }, { "epoch": 2.478347768154564, "grad_norm": 0.47373180317876196, "learning_rate": 3.795532474308737e-06, "loss": 0.0427, "step": 5580 }, { "epoch": 2.478791916500111, "grad_norm": 0.4145736062776239, "learning_rate": 3.7936511150285014e-06, "loss": 0.038, "step": 5581 }, { "epoch": 2.4792360648456584, "grad_norm": 0.5254698644896055, "learning_rate": 3.7917699370829935e-06, "loss": 0.0349, "step": 5582 }, { "epoch": 2.4796802131912057, "grad_norm": 0.4171576347283061, "learning_rate": 3.789888940754991e-06, "loss": 0.0303, "step": 5583 }, { "epoch": 2.4801243615367534, "grad_norm": 0.39892945172204647, "learning_rate": 3.788008126327235e-06, "loss": 0.041, "step": 5584 }, { "epoch": 2.4805685098823007, "grad_norm": 0.4156599218368941, "learning_rate": 3.7861274940824473e-06, "loss": 0.0308, "step": 5585 }, { "epoch": 2.481012658227848, "grad_norm": 0.3949693643427857, "learning_rate": 3.784247044303317e-06, "loss": 0.0311, "step": 5586 }, { "epoch": 2.4814568065733953, "grad_norm": 0.3619897883910703, "learning_rate": 3.782366777272506e-06, "loss": 0.0304, "step": 5587 }, { "epoch": 2.481900954918943, "grad_norm": 0.3797535629139442, "learning_rate": 3.7804866932726535e-06, "loss": 0.0265, "step": 5588 }, { "epoch": 2.4823451032644903, "grad_norm": 0.4514704220606347, "learning_rate": 3.778606792586368e-06, "loss": 0.0242, "step": 5589 }, { "epoch": 2.4827892516100376, "grad_norm": 0.5885462347335189, "learning_rate": 3.7767270754962294e-06, "loss": 0.0462, "step": 5590 }, { "epoch": 2.4832333999555853, "grad_norm": 0.46737727413834057, "learning_rate": 3.7748475422847896e-06, "loss": 0.0476, "step": 5591 }, { "epoch": 2.4836775483011326, "grad_norm": 0.34347582817428995, "learning_rate": 3.7729681932345776e-06, "loss": 0.0309, "step": 5592 }, { "epoch": 2.48412169664668, "grad_norm": 0.5416760213809743, "learning_rate": 3.771089028628087e-06, "loss": 0.0301, "step": 5593 }, { "epoch": 2.484565844992227, "grad_norm": 0.5279294508428536, "learning_rate": 3.7692100487477936e-06, "loss": 0.0338, "step": 5594 }, { "epoch": 2.485009993337775, "grad_norm": 0.40771734537444654, "learning_rate": 3.7673312538761362e-06, "loss": 0.0244, "step": 5595 }, { "epoch": 2.4854541416833222, "grad_norm": 0.3225283085841615, "learning_rate": 3.765452644295532e-06, "loss": 0.0252, "step": 5596 }, { "epoch": 2.4858982900288695, "grad_norm": 0.4343781070994849, "learning_rate": 3.7635742202883664e-06, "loss": 0.0301, "step": 5597 }, { "epoch": 2.4863424383744173, "grad_norm": 0.48736918945639, "learning_rate": 3.761695982136997e-06, "loss": 0.0283, "step": 5598 }, { "epoch": 2.4867865867199646, "grad_norm": 0.4628216568506521, "learning_rate": 3.759817930123756e-06, "loss": 0.0325, "step": 5599 }, { "epoch": 2.487230735065512, "grad_norm": 0.4979234883742358, "learning_rate": 3.75794006453095e-06, "loss": 0.046, "step": 5600 }, { "epoch": 2.487674883411059, "grad_norm": 0.5295486655610855, "learning_rate": 3.7560623856408496e-06, "loss": 0.0421, "step": 5601 }, { "epoch": 2.488119031756607, "grad_norm": 0.3550831132877616, "learning_rate": 3.7541848937357037e-06, "loss": 0.0282, "step": 5602 }, { "epoch": 2.488563180102154, "grad_norm": 0.4820776586497428, "learning_rate": 3.7523075890977323e-06, "loss": 0.0327, "step": 5603 }, { "epoch": 2.4890073284477015, "grad_norm": 0.36069709664433147, "learning_rate": 3.7504304720091227e-06, "loss": 0.0282, "step": 5604 }, { "epoch": 2.489451476793249, "grad_norm": 0.3378590317401163, "learning_rate": 3.7485535427520393e-06, "loss": 0.0327, "step": 5605 }, { "epoch": 2.4898956251387965, "grad_norm": 0.5101551267438247, "learning_rate": 3.7466768016086187e-06, "loss": 0.0426, "step": 5606 }, { "epoch": 2.4903397734843438, "grad_norm": 0.4554277064950327, "learning_rate": 3.7448002488609647e-06, "loss": 0.0363, "step": 5607 }, { "epoch": 2.490783921829891, "grad_norm": 0.5190026980108126, "learning_rate": 3.7429238847911555e-06, "loss": 0.0303, "step": 5608 }, { "epoch": 2.4912280701754383, "grad_norm": 0.4937954972214749, "learning_rate": 3.7410477096812402e-06, "loss": 0.0489, "step": 5609 }, { "epoch": 2.491672218520986, "grad_norm": 0.36659365050297416, "learning_rate": 3.7391717238132386e-06, "loss": 0.0266, "step": 5610 }, { "epoch": 2.4921163668665334, "grad_norm": 0.391793755436005, "learning_rate": 3.737295927469146e-06, "loss": 0.0288, "step": 5611 }, { "epoch": 2.4925605152120807, "grad_norm": 0.386017966668196, "learning_rate": 3.7354203209309246e-06, "loss": 0.0347, "step": 5612 }, { "epoch": 2.4930046635576284, "grad_norm": 0.30330961970277515, "learning_rate": 3.733544904480512e-06, "loss": 0.0248, "step": 5613 }, { "epoch": 2.4934488119031757, "grad_norm": 0.4407907387683592, "learning_rate": 3.7316696783998124e-06, "loss": 0.0347, "step": 5614 }, { "epoch": 2.493892960248723, "grad_norm": 0.37646444767265064, "learning_rate": 3.7297946429707045e-06, "loss": 0.0328, "step": 5615 }, { "epoch": 2.4943371085942703, "grad_norm": 0.4575522855234119, "learning_rate": 3.727919798475038e-06, "loss": 0.0352, "step": 5616 }, { "epoch": 2.494781256939818, "grad_norm": 0.35695162304512706, "learning_rate": 3.7260451451946365e-06, "loss": 0.0316, "step": 5617 }, { "epoch": 2.4952254052853653, "grad_norm": 0.43360958825611745, "learning_rate": 3.724170683411291e-06, "loss": 0.0332, "step": 5618 }, { "epoch": 2.4956695536309126, "grad_norm": 0.4262372003773419, "learning_rate": 3.722296413406763e-06, "loss": 0.0255, "step": 5619 }, { "epoch": 2.4961137019764603, "grad_norm": 0.43363995486396417, "learning_rate": 3.7204223354627894e-06, "loss": 0.031, "step": 5620 }, { "epoch": 2.4965578503220076, "grad_norm": 0.47249288635203707, "learning_rate": 3.718548449861074e-06, "loss": 0.0331, "step": 5621 }, { "epoch": 2.497001998667555, "grad_norm": 0.4319106936736054, "learning_rate": 3.716674756883295e-06, "loss": 0.0392, "step": 5622 }, { "epoch": 2.497446147013102, "grad_norm": 0.5119693456166591, "learning_rate": 3.714801256811099e-06, "loss": 0.0409, "step": 5623 }, { "epoch": 2.49789029535865, "grad_norm": 0.40570657622630646, "learning_rate": 3.712927949926108e-06, "loss": 0.0266, "step": 5624 }, { "epoch": 2.498334443704197, "grad_norm": 0.40821652834924144, "learning_rate": 3.7110548365099075e-06, "loss": 0.0402, "step": 5625 }, { "epoch": 2.4987785920497445, "grad_norm": 0.3940755701591239, "learning_rate": 3.7091819168440624e-06, "loss": 0.0315, "step": 5626 }, { "epoch": 2.4992227403952922, "grad_norm": 0.495948418850682, "learning_rate": 3.7073091912101002e-06, "loss": 0.0326, "step": 5627 }, { "epoch": 2.4996668887408395, "grad_norm": 0.4435967840620372, "learning_rate": 3.705436659889527e-06, "loss": 0.0368, "step": 5628 }, { "epoch": 2.500111037086387, "grad_norm": 0.32392962929044805, "learning_rate": 3.7035643231638135e-06, "loss": 0.029, "step": 5629 }, { "epoch": 2.500555185431934, "grad_norm": 0.6179012687291638, "learning_rate": 3.7016921813144063e-06, "loss": 0.0382, "step": 5630 }, { "epoch": 2.5009993337774814, "grad_norm": 0.45744910004367445, "learning_rate": 3.6998202346227183e-06, "loss": 0.0418, "step": 5631 }, { "epoch": 2.501443482123029, "grad_norm": 0.5527549347471298, "learning_rate": 3.697948483370135e-06, "loss": 0.0257, "step": 5632 }, { "epoch": 2.5018876304685764, "grad_norm": 0.5434768947853562, "learning_rate": 3.696076927838011e-06, "loss": 0.0389, "step": 5633 }, { "epoch": 2.502331778814124, "grad_norm": 0.3410765055084389, "learning_rate": 3.6942055683076767e-06, "loss": 0.0299, "step": 5634 }, { "epoch": 2.5027759271596715, "grad_norm": 0.7036711081682665, "learning_rate": 3.692334405060427e-06, "loss": 0.0405, "step": 5635 }, { "epoch": 2.5032200755052187, "grad_norm": 0.3864467247613326, "learning_rate": 3.6904634383775283e-06, "loss": 0.0293, "step": 5636 }, { "epoch": 2.503664223850766, "grad_norm": 0.5152666553526902, "learning_rate": 3.6885926685402213e-06, "loss": 0.042, "step": 5637 }, { "epoch": 2.5041083721963133, "grad_norm": 0.3913873319828441, "learning_rate": 3.6867220958297132e-06, "loss": 0.0341, "step": 5638 }, { "epoch": 2.504552520541861, "grad_norm": 0.3712133677859356, "learning_rate": 3.6848517205271805e-06, "loss": 0.0257, "step": 5639 }, { "epoch": 2.5049966688874084, "grad_norm": 0.42947535443731244, "learning_rate": 3.682981542913776e-06, "loss": 0.0369, "step": 5640 }, { "epoch": 2.5054408172329556, "grad_norm": 0.5274708204343582, "learning_rate": 3.6811115632706185e-06, "loss": 0.0324, "step": 5641 }, { "epoch": 2.5058849655785034, "grad_norm": 0.4291181112678621, "learning_rate": 3.6792417818787972e-06, "loss": 0.0293, "step": 5642 }, { "epoch": 2.5063291139240507, "grad_norm": 0.42231381186795797, "learning_rate": 3.677372199019371e-06, "loss": 0.0366, "step": 5643 }, { "epoch": 2.506773262269598, "grad_norm": 0.4789995350453658, "learning_rate": 3.6755028149733697e-06, "loss": 0.0327, "step": 5644 }, { "epoch": 2.5072174106151452, "grad_norm": 0.40689270267790856, "learning_rate": 3.6736336300217964e-06, "loss": 0.0237, "step": 5645 }, { "epoch": 2.507661558960693, "grad_norm": 0.5913689239490344, "learning_rate": 3.6717646444456196e-06, "loss": 0.0366, "step": 5646 }, { "epoch": 2.5081057073062403, "grad_norm": 0.3914862469261757, "learning_rate": 3.669895858525778e-06, "loss": 0.025, "step": 5647 }, { "epoch": 2.5085498556517876, "grad_norm": 0.42201187516388383, "learning_rate": 3.6680272725431854e-06, "loss": 0.0336, "step": 5648 }, { "epoch": 2.5089940039973353, "grad_norm": 0.42314579939123975, "learning_rate": 3.6661588867787183e-06, "loss": 0.0368, "step": 5649 }, { "epoch": 2.5094381523428826, "grad_norm": 0.46190362474830615, "learning_rate": 3.664290701513229e-06, "loss": 0.0366, "step": 5650 }, { "epoch": 2.50988230068843, "grad_norm": 0.5151193711763113, "learning_rate": 3.662422717027536e-06, "loss": 0.0343, "step": 5651 }, { "epoch": 2.510326449033977, "grad_norm": 0.4683557027001918, "learning_rate": 3.6605549336024327e-06, "loss": 0.0502, "step": 5652 }, { "epoch": 2.510770597379525, "grad_norm": 0.3966005086612889, "learning_rate": 3.658687351518674e-06, "loss": 0.0307, "step": 5653 }, { "epoch": 2.511214745725072, "grad_norm": 0.4901749101545214, "learning_rate": 3.656819971056992e-06, "loss": 0.0514, "step": 5654 }, { "epoch": 2.5116588940706195, "grad_norm": 0.44088737564599295, "learning_rate": 3.654952792498086e-06, "loss": 0.035, "step": 5655 }, { "epoch": 2.512103042416167, "grad_norm": 0.637540920302593, "learning_rate": 3.653085816122621e-06, "loss": 0.0433, "step": 5656 }, { "epoch": 2.5125471907617145, "grad_norm": 0.3989139686742009, "learning_rate": 3.651219042211239e-06, "loss": 0.0266, "step": 5657 }, { "epoch": 2.512991339107262, "grad_norm": 0.49156765417598675, "learning_rate": 3.649352471044548e-06, "loss": 0.0232, "step": 5658 }, { "epoch": 2.513435487452809, "grad_norm": 0.3796366437427131, "learning_rate": 3.647486102903124e-06, "loss": 0.0254, "step": 5659 }, { "epoch": 2.5138796357983564, "grad_norm": 0.4428861844260197, "learning_rate": 3.6456199380675128e-06, "loss": 0.0336, "step": 5660 }, { "epoch": 2.514323784143904, "grad_norm": 0.4660014764332606, "learning_rate": 3.6437539768182305e-06, "loss": 0.0379, "step": 5661 }, { "epoch": 2.5147679324894514, "grad_norm": 0.48208527317355315, "learning_rate": 3.6418882194357662e-06, "loss": 0.0309, "step": 5662 }, { "epoch": 2.515212080834999, "grad_norm": 0.46984872075734047, "learning_rate": 3.6400226662005733e-06, "loss": 0.0281, "step": 5663 }, { "epoch": 2.5156562291805464, "grad_norm": 0.3674696428042144, "learning_rate": 3.638157317393074e-06, "loss": 0.0254, "step": 5664 }, { "epoch": 2.5161003775260937, "grad_norm": 0.3790734362034619, "learning_rate": 3.636292173293665e-06, "loss": 0.0322, "step": 5665 }, { "epoch": 2.516544525871641, "grad_norm": 0.36816453960949835, "learning_rate": 3.634427234182708e-06, "loss": 0.0339, "step": 5666 }, { "epoch": 2.5169886742171883, "grad_norm": 0.310536315755628, "learning_rate": 3.632562500340532e-06, "loss": 0.0204, "step": 5667 }, { "epoch": 2.517432822562736, "grad_norm": 0.5208883625261909, "learning_rate": 3.6306979720474424e-06, "loss": 0.0385, "step": 5668 }, { "epoch": 2.5178769709082833, "grad_norm": 0.4448683196893281, "learning_rate": 3.6288336495837085e-06, "loss": 0.0357, "step": 5669 }, { "epoch": 2.5183211192538306, "grad_norm": 0.44092160751797316, "learning_rate": 3.6269695332295697e-06, "loss": 0.0344, "step": 5670 }, { "epoch": 2.5187652675993784, "grad_norm": 0.45635940839778255, "learning_rate": 3.6251056232652327e-06, "loss": 0.0305, "step": 5671 }, { "epoch": 2.5192094159449256, "grad_norm": 0.5533263657116287, "learning_rate": 3.6232419199708764e-06, "loss": 0.0284, "step": 5672 }, { "epoch": 2.519653564290473, "grad_norm": 0.37835546841851936, "learning_rate": 3.6213784236266447e-06, "loss": 0.0326, "step": 5673 }, { "epoch": 2.5200977126360202, "grad_norm": 0.4665241718562087, "learning_rate": 3.6195151345126556e-06, "loss": 0.036, "step": 5674 }, { "epoch": 2.520541860981568, "grad_norm": 0.3945669545233313, "learning_rate": 3.6176520529089932e-06, "loss": 0.0339, "step": 5675 }, { "epoch": 2.5209860093271153, "grad_norm": 0.37055474601271443, "learning_rate": 3.6157891790957096e-06, "loss": 0.0339, "step": 5676 }, { "epoch": 2.5214301576726625, "grad_norm": 0.492060267079228, "learning_rate": 3.6139265133528246e-06, "loss": 0.0316, "step": 5677 }, { "epoch": 2.5218743060182103, "grad_norm": 0.5701337130019367, "learning_rate": 3.612064055960331e-06, "loss": 0.0464, "step": 5678 }, { "epoch": 2.5223184543637576, "grad_norm": 0.39863450477642115, "learning_rate": 3.6102018071981846e-06, "loss": 0.0334, "step": 5679 }, { "epoch": 2.522762602709305, "grad_norm": 0.515203363747466, "learning_rate": 3.6083397673463172e-06, "loss": 0.0486, "step": 5680 }, { "epoch": 2.523206751054852, "grad_norm": 0.3893233450260606, "learning_rate": 3.606477936684622e-06, "loss": 0.0359, "step": 5681 }, { "epoch": 2.5236508994004, "grad_norm": 0.45713551996484814, "learning_rate": 3.6046163154929657e-06, "loss": 0.0374, "step": 5682 }, { "epoch": 2.524095047745947, "grad_norm": 0.5641320893715582, "learning_rate": 3.6027549040511806e-06, "loss": 0.058, "step": 5683 }, { "epoch": 2.5245391960914945, "grad_norm": 0.3265494892390024, "learning_rate": 3.600893702639067e-06, "loss": 0.023, "step": 5684 }, { "epoch": 2.524983344437042, "grad_norm": 0.41936797741330234, "learning_rate": 3.5990327115363967e-06, "loss": 0.0256, "step": 5685 }, { "epoch": 2.5254274927825895, "grad_norm": 0.46801276167069494, "learning_rate": 3.5971719310229093e-06, "loss": 0.0254, "step": 5686 }, { "epoch": 2.525871641128137, "grad_norm": 0.49189838872219027, "learning_rate": 3.595311361378311e-06, "loss": 0.0364, "step": 5687 }, { "epoch": 2.526315789473684, "grad_norm": 0.5586944397026717, "learning_rate": 3.593451002882275e-06, "loss": 0.0335, "step": 5688 }, { "epoch": 2.5267599378192314, "grad_norm": 0.4422926900473203, "learning_rate": 3.5915908558144476e-06, "loss": 0.0298, "step": 5689 }, { "epoch": 2.527204086164779, "grad_norm": 0.3434949432587989, "learning_rate": 3.5897309204544375e-06, "loss": 0.0336, "step": 5690 }, { "epoch": 2.5276482345103264, "grad_norm": 0.3621563955807786, "learning_rate": 3.587871197081828e-06, "loss": 0.0254, "step": 5691 }, { "epoch": 2.528092382855874, "grad_norm": 0.4132257794043078, "learning_rate": 3.586011685976164e-06, "loss": 0.0332, "step": 5692 }, { "epoch": 2.5285365312014214, "grad_norm": 0.5011026660419655, "learning_rate": 3.5841523874169648e-06, "loss": 0.0345, "step": 5693 }, { "epoch": 2.5289806795469687, "grad_norm": 0.45248007625680975, "learning_rate": 3.582293301683713e-06, "loss": 0.0293, "step": 5694 }, { "epoch": 2.529424827892516, "grad_norm": 0.7885398440569542, "learning_rate": 3.580434429055859e-06, "loss": 0.032, "step": 5695 }, { "epoch": 2.5298689762380633, "grad_norm": 0.8455065643807562, "learning_rate": 3.578575769812824e-06, "loss": 0.0566, "step": 5696 }, { "epoch": 2.530313124583611, "grad_norm": 0.3581406871410735, "learning_rate": 3.576717324233998e-06, "loss": 0.0283, "step": 5697 }, { "epoch": 2.5307572729291583, "grad_norm": 0.37566727714871373, "learning_rate": 3.5748590925987347e-06, "loss": 0.0274, "step": 5698 }, { "epoch": 2.5312014212747056, "grad_norm": 0.40595777932831095, "learning_rate": 3.5730010751863605e-06, "loss": 0.029, "step": 5699 }, { "epoch": 2.5316455696202533, "grad_norm": 0.47532996991241144, "learning_rate": 3.571143272276164e-06, "loss": 0.0532, "step": 5700 }, { "epoch": 2.5320897179658006, "grad_norm": 0.44327227035484773, "learning_rate": 3.5692856841474045e-06, "loss": 0.0355, "step": 5701 }, { "epoch": 2.532533866311348, "grad_norm": 0.34290979724264187, "learning_rate": 3.5674283110793105e-06, "loss": 0.0219, "step": 5702 }, { "epoch": 2.532978014656895, "grad_norm": 0.6517538589913022, "learning_rate": 3.5655711533510783e-06, "loss": 0.0354, "step": 5703 }, { "epoch": 2.533422163002443, "grad_norm": 0.40477955297885887, "learning_rate": 3.5637142112418684e-06, "loss": 0.0292, "step": 5704 }, { "epoch": 2.5338663113479902, "grad_norm": 0.47637177118935925, "learning_rate": 3.5618574850308095e-06, "loss": 0.0256, "step": 5705 }, { "epoch": 2.5343104596935375, "grad_norm": 0.4050526950368258, "learning_rate": 3.560000974997001e-06, "loss": 0.0279, "step": 5706 }, { "epoch": 2.5347546080390853, "grad_norm": 0.4510491005204953, "learning_rate": 3.5581446814195054e-06, "loss": 0.039, "step": 5707 }, { "epoch": 2.5351987563846325, "grad_norm": 0.43059426602406603, "learning_rate": 3.556288604577359e-06, "loss": 0.0344, "step": 5708 }, { "epoch": 2.53564290473018, "grad_norm": 0.38353556571368813, "learning_rate": 3.5544327447495598e-06, "loss": 0.033, "step": 5709 }, { "epoch": 2.536087053075727, "grad_norm": 0.3857238286700737, "learning_rate": 3.5525771022150746e-06, "loss": 0.0282, "step": 5710 }, { "epoch": 2.536531201421275, "grad_norm": 0.48091169542814494, "learning_rate": 3.5507216772528392e-06, "loss": 0.0354, "step": 5711 }, { "epoch": 2.536975349766822, "grad_norm": 1.0810667528006974, "learning_rate": 3.548866470141753e-06, "loss": 0.0411, "step": 5712 }, { "epoch": 2.5374194981123694, "grad_norm": 0.37822644450804105, "learning_rate": 3.547011481160686e-06, "loss": 0.0287, "step": 5713 }, { "epoch": 2.537863646457917, "grad_norm": 0.44547431389156866, "learning_rate": 3.5451567105884777e-06, "loss": 0.0446, "step": 5714 }, { "epoch": 2.5383077948034645, "grad_norm": 0.33403093136486245, "learning_rate": 3.543302158703929e-06, "loss": 0.0304, "step": 5715 }, { "epoch": 2.5387519431490118, "grad_norm": 0.3853720547985273, "learning_rate": 3.5414478257858097e-06, "loss": 0.0316, "step": 5716 }, { "epoch": 2.539196091494559, "grad_norm": 0.4078086186517431, "learning_rate": 3.53959371211286e-06, "loss": 0.0359, "step": 5717 }, { "epoch": 2.5396402398401063, "grad_norm": 0.3855600056546902, "learning_rate": 3.5377398179637807e-06, "loss": 0.0201, "step": 5718 }, { "epoch": 2.540084388185654, "grad_norm": 0.3712283572301996, "learning_rate": 3.5358861436172487e-06, "loss": 0.0327, "step": 5719 }, { "epoch": 2.5405285365312014, "grad_norm": 0.4121457515255565, "learning_rate": 3.5340326893518993e-06, "loss": 0.0337, "step": 5720 }, { "epoch": 2.540972684876749, "grad_norm": 0.4589358591248568, "learning_rate": 3.5321794554463397e-06, "loss": 0.0282, "step": 5721 }, { "epoch": 2.5414168332222964, "grad_norm": 0.41283412784679185, "learning_rate": 3.530326442179142e-06, "loss": 0.0433, "step": 5722 }, { "epoch": 2.5418609815678437, "grad_norm": 0.3832687248919424, "learning_rate": 3.5284736498288452e-06, "loss": 0.0275, "step": 5723 }, { "epoch": 2.542305129913391, "grad_norm": 0.4017937297419123, "learning_rate": 3.526621078673954e-06, "loss": 0.0301, "step": 5724 }, { "epoch": 2.5427492782589383, "grad_norm": 0.3799788724726438, "learning_rate": 3.5247687289929443e-06, "loss": 0.0352, "step": 5725 }, { "epoch": 2.543193426604486, "grad_norm": 0.3381062594883136, "learning_rate": 3.5229166010642544e-06, "loss": 0.0237, "step": 5726 }, { "epoch": 2.5436375749500333, "grad_norm": 0.9370246012561935, "learning_rate": 3.521064695166292e-06, "loss": 0.0417, "step": 5727 }, { "epoch": 2.5440817232955806, "grad_norm": 0.5170089647125808, "learning_rate": 3.5192130115774283e-06, "loss": 0.0379, "step": 5728 }, { "epoch": 2.5445258716411283, "grad_norm": 0.5478100844231081, "learning_rate": 3.5173615505760015e-06, "loss": 0.0327, "step": 5729 }, { "epoch": 2.5449700199866756, "grad_norm": 0.3263956967060353, "learning_rate": 3.5155103124403184e-06, "loss": 0.0235, "step": 5730 }, { "epoch": 2.545414168332223, "grad_norm": 0.4151765481340016, "learning_rate": 3.513659297448655e-06, "loss": 0.0315, "step": 5731 }, { "epoch": 2.54585831667777, "grad_norm": 0.4249849360391111, "learning_rate": 3.511808505879247e-06, "loss": 0.037, "step": 5732 }, { "epoch": 2.546302465023318, "grad_norm": 0.430816357070103, "learning_rate": 3.5099579380103e-06, "loss": 0.0313, "step": 5733 }, { "epoch": 2.546746613368865, "grad_norm": 0.36068218836576676, "learning_rate": 3.508107594119987e-06, "loss": 0.0248, "step": 5734 }, { "epoch": 2.5471907617144125, "grad_norm": 0.5603158518242672, "learning_rate": 3.506257474486444e-06, "loss": 0.0407, "step": 5735 }, { "epoch": 2.5476349100599602, "grad_norm": 0.457105631896688, "learning_rate": 3.5044075793877784e-06, "loss": 0.0285, "step": 5736 }, { "epoch": 2.5480790584055075, "grad_norm": 0.4513751440943639, "learning_rate": 3.5025579091020584e-06, "loss": 0.0295, "step": 5737 }, { "epoch": 2.548523206751055, "grad_norm": 0.4467104682114999, "learning_rate": 3.500708463907323e-06, "loss": 0.0297, "step": 5738 }, { "epoch": 2.548967355096602, "grad_norm": 0.3775580565507929, "learning_rate": 3.498859244081573e-06, "loss": 0.0302, "step": 5739 }, { "epoch": 2.54941150344215, "grad_norm": 0.4736436006509606, "learning_rate": 3.4970102499027787e-06, "loss": 0.0324, "step": 5740 }, { "epoch": 2.549855651787697, "grad_norm": 0.38970978243484516, "learning_rate": 3.4951614816488733e-06, "loss": 0.0239, "step": 5741 }, { "epoch": 2.5502998001332444, "grad_norm": 0.3858729170075839, "learning_rate": 3.4933129395977627e-06, "loss": 0.0282, "step": 5742 }, { "epoch": 2.550743948478792, "grad_norm": 0.46868923999899664, "learning_rate": 3.491464624027311e-06, "loss": 0.0309, "step": 5743 }, { "epoch": 2.5511880968243394, "grad_norm": 0.42676283372780777, "learning_rate": 3.489616535215351e-06, "loss": 0.0288, "step": 5744 }, { "epoch": 2.5516322451698867, "grad_norm": 0.47537306808660484, "learning_rate": 3.487768673439684e-06, "loss": 0.0409, "step": 5745 }, { "epoch": 2.552076393515434, "grad_norm": 0.5509640851909691, "learning_rate": 3.4859210389780717e-06, "loss": 0.0537, "step": 5746 }, { "epoch": 2.5525205418609813, "grad_norm": 0.4532699668327261, "learning_rate": 3.484073632108248e-06, "loss": 0.0374, "step": 5747 }, { "epoch": 2.552964690206529, "grad_norm": 0.30162200474796635, "learning_rate": 3.4822264531079074e-06, "loss": 0.0346, "step": 5748 }, { "epoch": 2.5534088385520763, "grad_norm": 0.41586878305661834, "learning_rate": 3.4803795022547152e-06, "loss": 0.0385, "step": 5749 }, { "epoch": 2.553852986897624, "grad_norm": 0.5284306714224316, "learning_rate": 3.478532779826297e-06, "loss": 0.0329, "step": 5750 }, { "epoch": 2.5542971352431714, "grad_norm": 0.3711615788486314, "learning_rate": 3.476686286100247e-06, "loss": 0.0277, "step": 5751 }, { "epoch": 2.5547412835887187, "grad_norm": 0.4470649615235087, "learning_rate": 3.4748400213541233e-06, "loss": 0.0463, "step": 5752 }, { "epoch": 2.555185431934266, "grad_norm": 0.40717240259190046, "learning_rate": 3.4729939858654548e-06, "loss": 0.0296, "step": 5753 }, { "epoch": 2.5556295802798132, "grad_norm": 0.4846910790786083, "learning_rate": 3.471148179911728e-06, "loss": 0.0404, "step": 5754 }, { "epoch": 2.556073728625361, "grad_norm": 0.37627232321927323, "learning_rate": 3.4693026037704012e-06, "loss": 0.0334, "step": 5755 }, { "epoch": 2.5565178769709083, "grad_norm": 0.34629139309877494, "learning_rate": 3.467457257718896e-06, "loss": 0.0272, "step": 5756 }, { "epoch": 2.5569620253164556, "grad_norm": 0.3421995312642631, "learning_rate": 3.4656121420345968e-06, "loss": 0.0317, "step": 5757 }, { "epoch": 2.5574061736620033, "grad_norm": 0.44077802939094335, "learning_rate": 3.463767256994856e-06, "loss": 0.0549, "step": 5758 }, { "epoch": 2.5578503220075506, "grad_norm": 0.3954224399718614, "learning_rate": 3.461922602876995e-06, "loss": 0.0334, "step": 5759 }, { "epoch": 2.558294470353098, "grad_norm": 0.41169694619074404, "learning_rate": 3.460078179958294e-06, "loss": 0.0341, "step": 5760 }, { "epoch": 2.558738618698645, "grad_norm": 0.4812985244340657, "learning_rate": 3.458233988516e-06, "loss": 0.0407, "step": 5761 }, { "epoch": 2.559182767044193, "grad_norm": 0.3331989432004887, "learning_rate": 3.4563900288273287e-06, "loss": 0.0247, "step": 5762 }, { "epoch": 2.55962691538974, "grad_norm": 0.3885261825147752, "learning_rate": 3.454546301169458e-06, "loss": 0.0294, "step": 5763 }, { "epoch": 2.5600710637352875, "grad_norm": 0.43455546109798127, "learning_rate": 3.4527028058195276e-06, "loss": 0.0276, "step": 5764 }, { "epoch": 2.560515212080835, "grad_norm": 0.41019774224604005, "learning_rate": 3.4508595430546516e-06, "loss": 0.0376, "step": 5765 }, { "epoch": 2.5609593604263825, "grad_norm": 0.5340205931151287, "learning_rate": 3.4490165131519027e-06, "loss": 0.0363, "step": 5766 }, { "epoch": 2.56140350877193, "grad_norm": 0.32981175279651753, "learning_rate": 3.4471737163883178e-06, "loss": 0.0341, "step": 5767 }, { "epoch": 2.561847657117477, "grad_norm": 0.4191900996748185, "learning_rate": 3.4453311530409008e-06, "loss": 0.0336, "step": 5768 }, { "epoch": 2.5622918054630244, "grad_norm": 0.4533154388512584, "learning_rate": 3.4434888233866205e-06, "loss": 0.0379, "step": 5769 }, { "epoch": 2.562735953808572, "grad_norm": 0.4083356757917847, "learning_rate": 3.4416467277024097e-06, "loss": 0.0256, "step": 5770 }, { "epoch": 2.5631801021541194, "grad_norm": 0.46425748132873407, "learning_rate": 3.4398048662651693e-06, "loss": 0.0455, "step": 5771 }, { "epoch": 2.563624250499667, "grad_norm": 0.3043869275862561, "learning_rate": 3.4379632393517593e-06, "loss": 0.022, "step": 5772 }, { "epoch": 2.5640683988452144, "grad_norm": 0.7475044005307006, "learning_rate": 3.43612184723901e-06, "loss": 0.0502, "step": 5773 }, { "epoch": 2.5645125471907617, "grad_norm": 0.32220506593319287, "learning_rate": 3.4342806902037118e-06, "loss": 0.0271, "step": 5774 }, { "epoch": 2.564956695536309, "grad_norm": 0.36616059681338226, "learning_rate": 3.4324397685226217e-06, "loss": 0.0327, "step": 5775 }, { "epoch": 2.5654008438818563, "grad_norm": 0.4598617313466593, "learning_rate": 3.4305990824724645e-06, "loss": 0.0255, "step": 5776 }, { "epoch": 2.565844992227404, "grad_norm": 0.4169024524406747, "learning_rate": 3.428758632329925e-06, "loss": 0.0261, "step": 5777 }, { "epoch": 2.5662891405729513, "grad_norm": 0.46153415454886376, "learning_rate": 3.426918418371652e-06, "loss": 0.0313, "step": 5778 }, { "epoch": 2.5667332889184986, "grad_norm": 0.5358677284520018, "learning_rate": 3.4250784408742644e-06, "loss": 0.045, "step": 5779 }, { "epoch": 2.5671774372640463, "grad_norm": 0.4437829933438522, "learning_rate": 3.4232387001143396e-06, "loss": 0.0423, "step": 5780 }, { "epoch": 2.5676215856095936, "grad_norm": 0.39105850673238723, "learning_rate": 3.4213991963684212e-06, "loss": 0.0353, "step": 5781 }, { "epoch": 2.568065733955141, "grad_norm": 0.5049529429986211, "learning_rate": 3.419559929913021e-06, "loss": 0.0345, "step": 5782 }, { "epoch": 2.568509882300688, "grad_norm": 0.4292519880189004, "learning_rate": 3.4177209010246104e-06, "loss": 0.0372, "step": 5783 }, { "epoch": 2.568954030646236, "grad_norm": 0.373978887044069, "learning_rate": 3.415882109979627e-06, "loss": 0.0428, "step": 5784 }, { "epoch": 2.5693981789917832, "grad_norm": 0.500647171885995, "learning_rate": 3.4140435570544708e-06, "loss": 0.036, "step": 5785 }, { "epoch": 2.5698423273373305, "grad_norm": 0.5585745761796531, "learning_rate": 3.4122052425255097e-06, "loss": 0.0371, "step": 5786 }, { "epoch": 2.5702864756828783, "grad_norm": 0.40623789381422887, "learning_rate": 3.4103671666690706e-06, "loss": 0.0346, "step": 5787 }, { "epoch": 2.5707306240284256, "grad_norm": 0.5204708462096757, "learning_rate": 3.4085293297614513e-06, "loss": 0.0341, "step": 5788 }, { "epoch": 2.571174772373973, "grad_norm": 0.38514191058110864, "learning_rate": 3.406691732078907e-06, "loss": 0.0247, "step": 5789 }, { "epoch": 2.57161892071952, "grad_norm": 0.47415946915349233, "learning_rate": 3.4048543738976624e-06, "loss": 0.03, "step": 5790 }, { "epoch": 2.572063069065068, "grad_norm": 0.35195512662300593, "learning_rate": 3.4030172554939022e-06, "loss": 0.0179, "step": 5791 }, { "epoch": 2.572507217410615, "grad_norm": 0.455360916505429, "learning_rate": 3.401180377143774e-06, "loss": 0.0355, "step": 5792 }, { "epoch": 2.5729513657561625, "grad_norm": 0.6594611426447926, "learning_rate": 3.399343739123395e-06, "loss": 0.04, "step": 5793 }, { "epoch": 2.57339551410171, "grad_norm": 0.30764478468780626, "learning_rate": 3.3975073417088445e-06, "loss": 0.024, "step": 5794 }, { "epoch": 2.5738396624472575, "grad_norm": 0.5139517791311652, "learning_rate": 3.3956711851761603e-06, "loss": 0.0386, "step": 5795 }, { "epoch": 2.5742838107928048, "grad_norm": 0.44276110456272694, "learning_rate": 3.393835269801351e-06, "loss": 0.0276, "step": 5796 }, { "epoch": 2.574727959138352, "grad_norm": 0.6084624820775043, "learning_rate": 3.3919995958603845e-06, "loss": 0.0349, "step": 5797 }, { "epoch": 2.5751721074838994, "grad_norm": 0.6187475898080996, "learning_rate": 3.3901641636291925e-06, "loss": 0.0369, "step": 5798 }, { "epoch": 2.575616255829447, "grad_norm": 0.4166403983258688, "learning_rate": 3.388328973383673e-06, "loss": 0.028, "step": 5799 }, { "epoch": 2.5760604041749944, "grad_norm": 0.37895711615417893, "learning_rate": 3.3864940253996885e-06, "loss": 0.0309, "step": 5800 }, { "epoch": 2.576504552520542, "grad_norm": 0.4165330638891502, "learning_rate": 3.3846593199530598e-06, "loss": 0.0324, "step": 5801 }, { "epoch": 2.5769487008660894, "grad_norm": 0.3850803256916831, "learning_rate": 3.3828248573195744e-06, "loss": 0.0243, "step": 5802 }, { "epoch": 2.5773928492116367, "grad_norm": 0.4535252370266945, "learning_rate": 3.3809906377749853e-06, "loss": 0.026, "step": 5803 }, { "epoch": 2.577836997557184, "grad_norm": 0.4473597091096733, "learning_rate": 3.3791566615950034e-06, "loss": 0.0446, "step": 5804 }, { "epoch": 2.5782811459027313, "grad_norm": 0.4461365886821962, "learning_rate": 3.37732292905531e-06, "loss": 0.0353, "step": 5805 }, { "epoch": 2.578725294248279, "grad_norm": 0.4044232712398919, "learning_rate": 3.375489440431544e-06, "loss": 0.0356, "step": 5806 }, { "epoch": 2.5791694425938263, "grad_norm": 0.4100077927626587, "learning_rate": 3.373656195999312e-06, "loss": 0.0288, "step": 5807 }, { "epoch": 2.5796135909393736, "grad_norm": 0.49681714136189903, "learning_rate": 3.3718231960341807e-06, "loss": 0.0392, "step": 5808 }, { "epoch": 2.5800577392849213, "grad_norm": 0.3508735282517198, "learning_rate": 3.3699904408116778e-06, "loss": 0.0266, "step": 5809 }, { "epoch": 2.5805018876304686, "grad_norm": 0.4685055266479743, "learning_rate": 3.368157930607303e-06, "loss": 0.0322, "step": 5810 }, { "epoch": 2.580946035976016, "grad_norm": 0.3392092135667048, "learning_rate": 3.3663256656965115e-06, "loss": 0.0266, "step": 5811 }, { "epoch": 2.581390184321563, "grad_norm": 0.3673851977352648, "learning_rate": 3.364493646354724e-06, "loss": 0.0291, "step": 5812 }, { "epoch": 2.581834332667111, "grad_norm": 0.4826180223086163, "learning_rate": 3.3626618728573233e-06, "loss": 0.0366, "step": 5813 }, { "epoch": 2.5822784810126582, "grad_norm": 0.46379192115203366, "learning_rate": 3.3608303454796578e-06, "loss": 0.0359, "step": 5814 }, { "epoch": 2.5827226293582055, "grad_norm": 0.33507634737295205, "learning_rate": 3.3589990644970325e-06, "loss": 0.0293, "step": 5815 }, { "epoch": 2.5831667777037532, "grad_norm": 0.394019845804789, "learning_rate": 3.3571680301847265e-06, "loss": 0.0287, "step": 5816 }, { "epoch": 2.5836109260493005, "grad_norm": 0.4267750495828387, "learning_rate": 3.355337242817972e-06, "loss": 0.0225, "step": 5817 }, { "epoch": 2.584055074394848, "grad_norm": 0.40408538263164145, "learning_rate": 3.3535067026719683e-06, "loss": 0.0354, "step": 5818 }, { "epoch": 2.584499222740395, "grad_norm": 0.3264537051893323, "learning_rate": 3.3516764100218744e-06, "loss": 0.0184, "step": 5819 }, { "epoch": 2.584943371085943, "grad_norm": 0.5823548169064077, "learning_rate": 3.3498463651428183e-06, "loss": 0.0555, "step": 5820 }, { "epoch": 2.58538751943149, "grad_norm": 0.6018406880891104, "learning_rate": 3.348016568309882e-06, "loss": 0.0323, "step": 5821 }, { "epoch": 2.5858316677770374, "grad_norm": 0.4499473097806689, "learning_rate": 3.3461870197981205e-06, "loss": 0.0313, "step": 5822 }, { "epoch": 2.586275816122585, "grad_norm": 0.4573988840299094, "learning_rate": 3.3443577198825416e-06, "loss": 0.0217, "step": 5823 }, { "epoch": 2.5867199644681325, "grad_norm": 0.47710512867182214, "learning_rate": 3.342528668838123e-06, "loss": 0.0305, "step": 5824 }, { "epoch": 2.5871641128136798, "grad_norm": 0.3333521539172945, "learning_rate": 3.3406998669398015e-06, "loss": 0.0276, "step": 5825 }, { "epoch": 2.587608261159227, "grad_norm": 0.4557037415155258, "learning_rate": 3.338871314462474e-06, "loss": 0.0283, "step": 5826 }, { "epoch": 2.5880524095047743, "grad_norm": 0.4444177366240355, "learning_rate": 3.337043011681007e-06, "loss": 0.0328, "step": 5827 }, { "epoch": 2.588496557850322, "grad_norm": 0.3980372800704896, "learning_rate": 3.335214958870225e-06, "loss": 0.0312, "step": 5828 }, { "epoch": 2.5889407061958694, "grad_norm": 0.38163158317470414, "learning_rate": 3.333387156304914e-06, "loss": 0.021, "step": 5829 }, { "epoch": 2.589384854541417, "grad_norm": 0.3444340204979025, "learning_rate": 3.3315596042598235e-06, "loss": 0.0283, "step": 5830 }, { "epoch": 2.5898290028869644, "grad_norm": 0.4610922606737187, "learning_rate": 3.3297323030096672e-06, "loss": 0.032, "step": 5831 }, { "epoch": 2.5902731512325117, "grad_norm": 0.41870835356631375, "learning_rate": 3.327905252829117e-06, "loss": 0.0448, "step": 5832 }, { "epoch": 2.590717299578059, "grad_norm": 0.4779584466171409, "learning_rate": 3.326078453992813e-06, "loss": 0.0242, "step": 5833 }, { "epoch": 2.5911614479236063, "grad_norm": 0.5325628313786069, "learning_rate": 3.324251906775351e-06, "loss": 0.0395, "step": 5834 }, { "epoch": 2.591605596269154, "grad_norm": 0.358759607760571, "learning_rate": 3.3224256114512953e-06, "loss": 0.0316, "step": 5835 }, { "epoch": 2.5920497446147013, "grad_norm": 0.43363197580686524, "learning_rate": 3.3205995682951666e-06, "loss": 0.0334, "step": 5836 }, { "epoch": 2.5924938929602486, "grad_norm": 0.4445842523123569, "learning_rate": 3.31877377758145e-06, "loss": 0.033, "step": 5837 }, { "epoch": 2.5929380413057963, "grad_norm": 0.603779398622737, "learning_rate": 3.316948239584592e-06, "loss": 0.0449, "step": 5838 }, { "epoch": 2.5933821896513436, "grad_norm": 0.4392799290696822, "learning_rate": 3.3151229545790066e-06, "loss": 0.0384, "step": 5839 }, { "epoch": 2.593826337996891, "grad_norm": 0.38613834204137615, "learning_rate": 3.3132979228390615e-06, "loss": 0.0248, "step": 5840 }, { "epoch": 2.594270486342438, "grad_norm": 0.31483135068991924, "learning_rate": 3.3114731446390897e-06, "loss": 0.0204, "step": 5841 }, { "epoch": 2.594714634687986, "grad_norm": 0.42067924470792134, "learning_rate": 3.3096486202533884e-06, "loss": 0.0312, "step": 5842 }, { "epoch": 2.595158783033533, "grad_norm": 0.4747651071435795, "learning_rate": 3.3078243499562126e-06, "loss": 0.0325, "step": 5843 }, { "epoch": 2.5956029313790805, "grad_norm": 0.3567548194704181, "learning_rate": 3.3060003340217822e-06, "loss": 0.0241, "step": 5844 }, { "epoch": 2.5960470797246282, "grad_norm": 0.3574932064483991, "learning_rate": 3.3041765727242773e-06, "loss": 0.0302, "step": 5845 }, { "epoch": 2.5964912280701755, "grad_norm": 0.8739424998669374, "learning_rate": 3.302353066337842e-06, "loss": 0.0458, "step": 5846 }, { "epoch": 2.596935376415723, "grad_norm": 0.4781713811806308, "learning_rate": 3.300529815136577e-06, "loss": 0.0328, "step": 5847 }, { "epoch": 2.59737952476127, "grad_norm": 0.4686560316809108, "learning_rate": 3.2987068193945515e-06, "loss": 0.0392, "step": 5848 }, { "epoch": 2.597823673106818, "grad_norm": 0.7682784843583998, "learning_rate": 3.296884079385789e-06, "loss": 0.0432, "step": 5849 }, { "epoch": 2.598267821452365, "grad_norm": 0.43912852313892203, "learning_rate": 3.2950615953842816e-06, "loss": 0.0357, "step": 5850 }, { "epoch": 2.5987119697979124, "grad_norm": 0.4363862214055466, "learning_rate": 3.293239367663978e-06, "loss": 0.0355, "step": 5851 }, { "epoch": 2.59915611814346, "grad_norm": 0.4194478678628829, "learning_rate": 3.2914173964987905e-06, "loss": 0.0376, "step": 5852 }, { "epoch": 2.5996002664890074, "grad_norm": 0.3939580447848396, "learning_rate": 3.289595682162593e-06, "loss": 0.0351, "step": 5853 }, { "epoch": 2.6000444148345547, "grad_norm": 0.3563672060439901, "learning_rate": 3.2877742249292174e-06, "loss": 0.0264, "step": 5854 }, { "epoch": 2.600488563180102, "grad_norm": 0.45772110095911156, "learning_rate": 3.2859530250724604e-06, "loss": 0.0286, "step": 5855 }, { "epoch": 2.6009327115256493, "grad_norm": 0.3788943790281068, "learning_rate": 3.284132082866083e-06, "loss": 0.0231, "step": 5856 }, { "epoch": 2.601376859871197, "grad_norm": 0.42056256980059764, "learning_rate": 3.2823113985837996e-06, "loss": 0.0301, "step": 5857 }, { "epoch": 2.6018210082167443, "grad_norm": 0.4731371391351043, "learning_rate": 3.2804909724992917e-06, "loss": 0.0421, "step": 5858 }, { "epoch": 2.602265156562292, "grad_norm": 0.4171211336356501, "learning_rate": 3.2786708048862e-06, "loss": 0.0267, "step": 5859 }, { "epoch": 2.6027093049078394, "grad_norm": 0.33335158042812324, "learning_rate": 3.276850896018128e-06, "loss": 0.0287, "step": 5860 }, { "epoch": 2.6031534532533867, "grad_norm": 0.6871125152697936, "learning_rate": 3.2750312461686346e-06, "loss": 0.0253, "step": 5861 }, { "epoch": 2.603597601598934, "grad_norm": 0.38868974447379917, "learning_rate": 3.273211855611248e-06, "loss": 0.0384, "step": 5862 }, { "epoch": 2.6040417499444812, "grad_norm": 0.41427271665417736, "learning_rate": 3.271392724619454e-06, "loss": 0.0283, "step": 5863 }, { "epoch": 2.604485898290029, "grad_norm": 0.47635109894278754, "learning_rate": 3.2695738534666964e-06, "loss": 0.0307, "step": 5864 }, { "epoch": 2.6049300466355763, "grad_norm": 0.43305461847503646, "learning_rate": 3.2677552424263836e-06, "loss": 0.0323, "step": 5865 }, { "epoch": 2.6053741949811235, "grad_norm": 0.5925027302243593, "learning_rate": 3.2659368917718813e-06, "loss": 0.0349, "step": 5866 }, { "epoch": 2.6058183433266713, "grad_norm": 0.5670868601287159, "learning_rate": 3.264118801776524e-06, "loss": 0.0272, "step": 5867 }, { "epoch": 2.6062624916722186, "grad_norm": 0.4639294675991596, "learning_rate": 3.262300972713598e-06, "loss": 0.0281, "step": 5868 }, { "epoch": 2.606706640017766, "grad_norm": 0.5113196767933541, "learning_rate": 3.2604834048563527e-06, "loss": 0.0258, "step": 5869 }, { "epoch": 2.607150788363313, "grad_norm": 0.37055184296693755, "learning_rate": 3.2586660984780017e-06, "loss": 0.0292, "step": 5870 }, { "epoch": 2.607594936708861, "grad_norm": 0.5340070208292783, "learning_rate": 3.256849053851716e-06, "loss": 0.0362, "step": 5871 }, { "epoch": 2.608039085054408, "grad_norm": 0.6461515771963295, "learning_rate": 3.2550322712506265e-06, "loss": 0.0378, "step": 5872 }, { "epoch": 2.6084832333999555, "grad_norm": 0.43751867677053585, "learning_rate": 3.2532157509478313e-06, "loss": 0.0297, "step": 5873 }, { "epoch": 2.608927381745503, "grad_norm": 0.4731032790298419, "learning_rate": 3.2513994932163806e-06, "loss": 0.0426, "step": 5874 }, { "epoch": 2.6093715300910505, "grad_norm": 0.40443116847842614, "learning_rate": 3.2495834983292894e-06, "loss": 0.0315, "step": 5875 }, { "epoch": 2.609815678436598, "grad_norm": 0.4867165911113075, "learning_rate": 3.2477677665595333e-06, "loss": 0.0319, "step": 5876 }, { "epoch": 2.610259826782145, "grad_norm": 0.42932234052521123, "learning_rate": 3.2459522981800473e-06, "loss": 0.0465, "step": 5877 }, { "epoch": 2.610703975127693, "grad_norm": 0.35081568597263124, "learning_rate": 3.244137093463725e-06, "loss": 0.0194, "step": 5878 }, { "epoch": 2.61114812347324, "grad_norm": 0.5690602488132523, "learning_rate": 3.2423221526834253e-06, "loss": 0.0381, "step": 5879 }, { "epoch": 2.6115922718187874, "grad_norm": 0.36352934356142463, "learning_rate": 3.2405074761119648e-06, "loss": 0.0282, "step": 5880 }, { "epoch": 2.612036420164335, "grad_norm": 0.3774759920653436, "learning_rate": 3.2386930640221193e-06, "loss": 0.0344, "step": 5881 }, { "epoch": 2.6124805685098824, "grad_norm": 0.5371544305687926, "learning_rate": 3.2368789166866244e-06, "loss": 0.0497, "step": 5882 }, { "epoch": 2.6129247168554297, "grad_norm": 0.38943009927535477, "learning_rate": 3.2350650343781775e-06, "loss": 0.0227, "step": 5883 }, { "epoch": 2.613368865200977, "grad_norm": 0.4501911900625063, "learning_rate": 3.2332514173694396e-06, "loss": 0.0395, "step": 5884 }, { "epoch": 2.6138130135465243, "grad_norm": 0.4234499617743984, "learning_rate": 3.2314380659330246e-06, "loss": 0.0301, "step": 5885 }, { "epoch": 2.614257161892072, "grad_norm": 0.4318219863554864, "learning_rate": 3.22962498034151e-06, "loss": 0.0294, "step": 5886 }, { "epoch": 2.6147013102376193, "grad_norm": 0.503946757308322, "learning_rate": 3.227812160867436e-06, "loss": 0.0394, "step": 5887 }, { "epoch": 2.615145458583167, "grad_norm": 0.3890770124231559, "learning_rate": 3.2259996077832976e-06, "loss": 0.0267, "step": 5888 }, { "epoch": 2.6155896069287143, "grad_norm": 0.3697580136655012, "learning_rate": 3.2241873213615514e-06, "loss": 0.034, "step": 5889 }, { "epoch": 2.6160337552742616, "grad_norm": 0.5649235090654835, "learning_rate": 3.2223753018746186e-06, "loss": 0.0272, "step": 5890 }, { "epoch": 2.616477903619809, "grad_norm": 0.3708345448824024, "learning_rate": 3.220563549594874e-06, "loss": 0.035, "step": 5891 }, { "epoch": 2.616922051965356, "grad_norm": 0.3432122520668072, "learning_rate": 3.2187520647946547e-06, "loss": 0.0283, "step": 5892 }, { "epoch": 2.617366200310904, "grad_norm": 0.42022838385345057, "learning_rate": 3.2169408477462594e-06, "loss": 0.0364, "step": 5893 }, { "epoch": 2.6178103486564512, "grad_norm": 0.44719094760718237, "learning_rate": 3.2151298987219437e-06, "loss": 0.0325, "step": 5894 }, { "epoch": 2.6182544970019985, "grad_norm": 0.35618319174657215, "learning_rate": 3.2133192179939215e-06, "loss": 0.0256, "step": 5895 }, { "epoch": 2.6186986453475463, "grad_norm": 0.3925862521483163, "learning_rate": 3.2115088058343725e-06, "loss": 0.0282, "step": 5896 }, { "epoch": 2.6191427936930936, "grad_norm": 0.45474329229543986, "learning_rate": 3.209698662515432e-06, "loss": 0.0296, "step": 5897 }, { "epoch": 2.619586942038641, "grad_norm": 0.3953757491585112, "learning_rate": 3.2078887883091948e-06, "loss": 0.0422, "step": 5898 }, { "epoch": 2.620031090384188, "grad_norm": 0.36604722451208876, "learning_rate": 3.2060791834877136e-06, "loss": 0.029, "step": 5899 }, { "epoch": 2.620475238729736, "grad_norm": 0.32800664474353197, "learning_rate": 3.204269848323004e-06, "loss": 0.0295, "step": 5900 }, { "epoch": 2.620919387075283, "grad_norm": 0.5458287273191036, "learning_rate": 3.2024607830870424e-06, "loss": 0.0475, "step": 5901 }, { "epoch": 2.6213635354208304, "grad_norm": 0.43819732935782096, "learning_rate": 3.2006519880517597e-06, "loss": 0.0452, "step": 5902 }, { "epoch": 2.621807683766378, "grad_norm": 0.36142275590555617, "learning_rate": 3.1988434634890476e-06, "loss": 0.0277, "step": 5903 }, { "epoch": 2.6222518321119255, "grad_norm": 0.4477755001983317, "learning_rate": 3.197035209670761e-06, "loss": 0.0294, "step": 5904 }, { "epoch": 2.6226959804574728, "grad_norm": 0.46965829732778464, "learning_rate": 3.1952272268687083e-06, "loss": 0.0336, "step": 5905 }, { "epoch": 2.62314012880302, "grad_norm": 0.39994107522556216, "learning_rate": 3.19341951535466e-06, "loss": 0.0214, "step": 5906 }, { "epoch": 2.6235842771485673, "grad_norm": 0.4238276596672287, "learning_rate": 3.1916120754003475e-06, "loss": 0.0269, "step": 5907 }, { "epoch": 2.624028425494115, "grad_norm": 1.8594679605411257, "learning_rate": 3.1898049072774605e-06, "loss": 0.0367, "step": 5908 }, { "epoch": 2.6244725738396624, "grad_norm": 0.31000445284347417, "learning_rate": 3.1879980112576457e-06, "loss": 0.0248, "step": 5909 }, { "epoch": 2.62491672218521, "grad_norm": 0.40069203774397927, "learning_rate": 3.1861913876125093e-06, "loss": 0.0314, "step": 5910 }, { "epoch": 2.6253608705307574, "grad_norm": 0.39410237773498447, "learning_rate": 3.1843850366136198e-06, "loss": 0.0293, "step": 5911 }, { "epoch": 2.6258050188763047, "grad_norm": 0.3268432306409947, "learning_rate": 3.182578958532499e-06, "loss": 0.0266, "step": 5912 }, { "epoch": 2.626249167221852, "grad_norm": 0.45108591338261844, "learning_rate": 3.180773153640635e-06, "loss": 0.0341, "step": 5913 }, { "epoch": 2.6266933155673993, "grad_norm": 0.43982934139295143, "learning_rate": 3.178967622209469e-06, "loss": 0.0328, "step": 5914 }, { "epoch": 2.627137463912947, "grad_norm": 0.34491560520514963, "learning_rate": 3.177162364510404e-06, "loss": 0.0243, "step": 5915 }, { "epoch": 2.6275816122584943, "grad_norm": 0.404112232349346, "learning_rate": 3.175357380814799e-06, "loss": 0.0233, "step": 5916 }, { "epoch": 2.628025760604042, "grad_norm": 0.4687180614976695, "learning_rate": 3.1735526713939757e-06, "loss": 0.0337, "step": 5917 }, { "epoch": 2.6284699089495893, "grad_norm": 0.37747833327236135, "learning_rate": 3.1717482365192106e-06, "loss": 0.0263, "step": 5918 }, { "epoch": 2.6289140572951366, "grad_norm": 0.38430475009185994, "learning_rate": 3.1699440764617432e-06, "loss": 0.0398, "step": 5919 }, { "epoch": 2.629358205640684, "grad_norm": 0.39716398492985894, "learning_rate": 3.1681401914927678e-06, "loss": 0.0324, "step": 5920 }, { "epoch": 2.629802353986231, "grad_norm": 0.40985834454742587, "learning_rate": 3.1663365818834406e-06, "loss": 0.0365, "step": 5921 }, { "epoch": 2.630246502331779, "grad_norm": 0.32571897096876656, "learning_rate": 3.1645332479048734e-06, "loss": 0.03, "step": 5922 }, { "epoch": 2.630690650677326, "grad_norm": 0.41303965455433855, "learning_rate": 3.1627301898281364e-06, "loss": 0.0256, "step": 5923 }, { "epoch": 2.6311347990228735, "grad_norm": 0.4244488216427459, "learning_rate": 3.1609274079242625e-06, "loss": 0.0383, "step": 5924 }, { "epoch": 2.6315789473684212, "grad_norm": 0.5719404451011891, "learning_rate": 3.159124902464241e-06, "loss": 0.0549, "step": 5925 }, { "epoch": 2.6320230957139685, "grad_norm": 0.4271035366652883, "learning_rate": 3.1573226737190164e-06, "loss": 0.0365, "step": 5926 }, { "epoch": 2.632467244059516, "grad_norm": 0.3779054549587856, "learning_rate": 3.155520721959496e-06, "loss": 0.0338, "step": 5927 }, { "epoch": 2.632911392405063, "grad_norm": 0.3266233056591113, "learning_rate": 3.1537190474565437e-06, "loss": 0.0219, "step": 5928 }, { "epoch": 2.633355540750611, "grad_norm": 0.6026007410470748, "learning_rate": 3.151917650480979e-06, "loss": 0.0593, "step": 5929 }, { "epoch": 2.633799689096158, "grad_norm": 0.47698196221281425, "learning_rate": 3.1501165313035877e-06, "loss": 0.0393, "step": 5930 }, { "epoch": 2.6342438374417054, "grad_norm": 0.47225489877722016, "learning_rate": 3.148315690195104e-06, "loss": 0.036, "step": 5931 }, { "epoch": 2.634687985787253, "grad_norm": 0.32874590760012345, "learning_rate": 3.146515127426228e-06, "loss": 0.021, "step": 5932 }, { "epoch": 2.6351321341328005, "grad_norm": 0.4293763106482794, "learning_rate": 3.144714843267613e-06, "loss": 0.0334, "step": 5933 }, { "epoch": 2.6355762824783477, "grad_norm": 0.34918767908800713, "learning_rate": 3.142914837989873e-06, "loss": 0.0235, "step": 5934 }, { "epoch": 2.636020430823895, "grad_norm": 0.514273957879937, "learning_rate": 3.1411151118635774e-06, "loss": 0.0335, "step": 5935 }, { "epoch": 2.6364645791694423, "grad_norm": 0.42893295828776457, "learning_rate": 3.1393156651592597e-06, "loss": 0.0271, "step": 5936 }, { "epoch": 2.63690872751499, "grad_norm": 0.5880664186723723, "learning_rate": 3.137516498147405e-06, "loss": 0.0346, "step": 5937 }, { "epoch": 2.6373528758605373, "grad_norm": 0.4554614929699438, "learning_rate": 3.1357176110984578e-06, "loss": 0.0466, "step": 5938 }, { "epoch": 2.637797024206085, "grad_norm": 0.4626649260029378, "learning_rate": 3.1339190042828227e-06, "loss": 0.0325, "step": 5939 }, { "epoch": 2.6382411725516324, "grad_norm": 0.385619247407723, "learning_rate": 3.132120677970859e-06, "loss": 0.0317, "step": 5940 }, { "epoch": 2.6386853208971797, "grad_norm": 0.772804476594651, "learning_rate": 3.1303226324328896e-06, "loss": 0.041, "step": 5941 }, { "epoch": 2.639129469242727, "grad_norm": 0.7245537321538326, "learning_rate": 3.1285248679391866e-06, "loss": 0.0335, "step": 5942 }, { "epoch": 2.6395736175882742, "grad_norm": 0.4668910845302412, "learning_rate": 3.1267273847599888e-06, "loss": 0.0282, "step": 5943 }, { "epoch": 2.640017765933822, "grad_norm": 0.4437588379794779, "learning_rate": 3.1249301831654842e-06, "loss": 0.0248, "step": 5944 }, { "epoch": 2.6404619142793693, "grad_norm": 0.391809580171727, "learning_rate": 3.123133263425827e-06, "loss": 0.0218, "step": 5945 }, { "epoch": 2.6409060626249166, "grad_norm": 0.4573419094850795, "learning_rate": 3.1213366258111207e-06, "loss": 0.0377, "step": 5946 }, { "epoch": 2.6413502109704643, "grad_norm": 0.3567944286524997, "learning_rate": 3.1195402705914337e-06, "loss": 0.0244, "step": 5947 }, { "epoch": 2.6417943593160116, "grad_norm": 0.5003131202018632, "learning_rate": 3.1177441980367873e-06, "loss": 0.0389, "step": 5948 }, { "epoch": 2.642238507661559, "grad_norm": 0.5664593011176533, "learning_rate": 3.115948408417162e-06, "loss": 0.027, "step": 5949 }, { "epoch": 2.642682656007106, "grad_norm": 0.4398029849585822, "learning_rate": 3.1141529020024964e-06, "loss": 0.0306, "step": 5950 }, { "epoch": 2.643126804352654, "grad_norm": 0.3797625962528218, "learning_rate": 3.1123576790626825e-06, "loss": 0.0266, "step": 5951 }, { "epoch": 2.643570952698201, "grad_norm": 0.41053216679804944, "learning_rate": 3.1105627398675743e-06, "loss": 0.0352, "step": 5952 }, { "epoch": 2.6440151010437485, "grad_norm": 0.3836633925963885, "learning_rate": 3.1087680846869844e-06, "loss": 0.0324, "step": 5953 }, { "epoch": 2.644459249389296, "grad_norm": 0.439056518784003, "learning_rate": 3.1069737137906776e-06, "loss": 0.042, "step": 5954 }, { "epoch": 2.6449033977348435, "grad_norm": 0.45357743660503874, "learning_rate": 3.1051796274483776e-06, "loss": 0.0343, "step": 5955 }, { "epoch": 2.645347546080391, "grad_norm": 0.38982067943155413, "learning_rate": 3.1033858259297677e-06, "loss": 0.0374, "step": 5956 }, { "epoch": 2.645791694425938, "grad_norm": 0.36926658548126234, "learning_rate": 3.1015923095044844e-06, "loss": 0.0307, "step": 5957 }, { "epoch": 2.646235842771486, "grad_norm": 0.5321168036178672, "learning_rate": 3.0997990784421273e-06, "loss": 0.0571, "step": 5958 }, { "epoch": 2.646679991117033, "grad_norm": 0.46761980336540304, "learning_rate": 3.0980061330122463e-06, "loss": 0.0398, "step": 5959 }, { "epoch": 2.6471241394625804, "grad_norm": 0.41367845862555414, "learning_rate": 3.096213473484354e-06, "loss": 0.0211, "step": 5960 }, { "epoch": 2.647568287808128, "grad_norm": 0.42099626188495787, "learning_rate": 3.094421100127916e-06, "loss": 0.0365, "step": 5961 }, { "epoch": 2.6480124361536754, "grad_norm": 0.4995405826324344, "learning_rate": 3.092629013212356e-06, "loss": 0.0394, "step": 5962 }, { "epoch": 2.6484565844992227, "grad_norm": 0.4212882639660684, "learning_rate": 3.090837213007054e-06, "loss": 0.0357, "step": 5963 }, { "epoch": 2.64890073284477, "grad_norm": 0.4419706138728259, "learning_rate": 3.0890456997813534e-06, "loss": 0.0382, "step": 5964 }, { "epoch": 2.6493448811903173, "grad_norm": 0.5281379267202536, "learning_rate": 3.087254473804544e-06, "loss": 0.0442, "step": 5965 }, { "epoch": 2.649789029535865, "grad_norm": 0.9881997927190076, "learning_rate": 3.0854635353458795e-06, "loss": 0.0348, "step": 5966 }, { "epoch": 2.6502331778814123, "grad_norm": 0.4051851073824622, "learning_rate": 3.083672884674568e-06, "loss": 0.0328, "step": 5967 }, { "epoch": 2.65067732622696, "grad_norm": 0.43295642683770935, "learning_rate": 3.081882522059774e-06, "loss": 0.0445, "step": 5968 }, { "epoch": 2.6511214745725074, "grad_norm": 0.7752250617010704, "learning_rate": 3.0800924477706185e-06, "loss": 0.0366, "step": 5969 }, { "epoch": 2.6515656229180546, "grad_norm": 0.551166830656724, "learning_rate": 3.0783026620761846e-06, "loss": 0.0313, "step": 5970 }, { "epoch": 2.652009771263602, "grad_norm": 0.35152614881285826, "learning_rate": 3.076513165245504e-06, "loss": 0.0279, "step": 5971 }, { "epoch": 2.6524539196091492, "grad_norm": 0.39759843537124434, "learning_rate": 3.0747239575475674e-06, "loss": 0.0363, "step": 5972 }, { "epoch": 2.652898067954697, "grad_norm": 0.4918571550335759, "learning_rate": 3.072935039251327e-06, "loss": 0.03, "step": 5973 }, { "epoch": 2.6533422163002442, "grad_norm": 0.3903119645882834, "learning_rate": 3.071146410625682e-06, "loss": 0.0262, "step": 5974 }, { "epoch": 2.6537863646457915, "grad_norm": 0.3990290251739691, "learning_rate": 3.0693580719395e-06, "loss": 0.0457, "step": 5975 }, { "epoch": 2.6542305129913393, "grad_norm": 0.4382406546867813, "learning_rate": 3.067570023461594e-06, "loss": 0.0345, "step": 5976 }, { "epoch": 2.6546746613368866, "grad_norm": 0.4165975886070993, "learning_rate": 3.065782265460741e-06, "loss": 0.0273, "step": 5977 }, { "epoch": 2.655118809682434, "grad_norm": 0.5153364937600957, "learning_rate": 3.06399479820567e-06, "loss": 0.0386, "step": 5978 }, { "epoch": 2.655562958027981, "grad_norm": 0.5238135987655357, "learning_rate": 3.062207621965067e-06, "loss": 0.0352, "step": 5979 }, { "epoch": 2.656007106373529, "grad_norm": 0.427195897063526, "learning_rate": 3.0604207370075743e-06, "loss": 0.0252, "step": 5980 }, { "epoch": 2.656451254719076, "grad_norm": 0.46584491776786274, "learning_rate": 3.0586341436017954e-06, "loss": 0.0476, "step": 5981 }, { "epoch": 2.6568954030646235, "grad_norm": 0.43759944048446714, "learning_rate": 3.056847842016282e-06, "loss": 0.0273, "step": 5982 }, { "epoch": 2.657339551410171, "grad_norm": 0.47962982437840673, "learning_rate": 3.0550618325195457e-06, "loss": 0.0292, "step": 5983 }, { "epoch": 2.6577836997557185, "grad_norm": 0.40526954332365667, "learning_rate": 3.053276115380055e-06, "loss": 0.0249, "step": 5984 }, { "epoch": 2.6582278481012658, "grad_norm": 0.5052423442608693, "learning_rate": 3.0514906908662346e-06, "loss": 0.0247, "step": 5985 }, { "epoch": 2.658671996446813, "grad_norm": 0.5146431419630214, "learning_rate": 3.0497055592464596e-06, "loss": 0.0381, "step": 5986 }, { "epoch": 2.659116144792361, "grad_norm": 0.40107763434385424, "learning_rate": 3.04792072078907e-06, "loss": 0.0305, "step": 5987 }, { "epoch": 2.659560293137908, "grad_norm": 0.5005148653434598, "learning_rate": 3.046136175762357e-06, "loss": 0.0308, "step": 5988 }, { "epoch": 2.6600044414834554, "grad_norm": 0.5429487251371317, "learning_rate": 3.0443519244345666e-06, "loss": 0.0359, "step": 5989 }, { "epoch": 2.660448589829003, "grad_norm": 0.5062800108359077, "learning_rate": 3.0425679670739026e-06, "loss": 0.0409, "step": 5990 }, { "epoch": 2.6608927381745504, "grad_norm": 0.37298776061190037, "learning_rate": 3.040784303948523e-06, "loss": 0.0314, "step": 5991 }, { "epoch": 2.6613368865200977, "grad_norm": 0.6131690894387388, "learning_rate": 3.0390009353265458e-06, "loss": 0.0276, "step": 5992 }, { "epoch": 2.661781034865645, "grad_norm": 0.4287293989906706, "learning_rate": 3.0372178614760382e-06, "loss": 0.038, "step": 5993 }, { "epoch": 2.6622251832111923, "grad_norm": 0.494407979190499, "learning_rate": 3.035435082665029e-06, "loss": 0.0328, "step": 5994 }, { "epoch": 2.66266933155674, "grad_norm": 0.4258763284446213, "learning_rate": 3.033652599161499e-06, "loss": 0.0268, "step": 5995 }, { "epoch": 2.6631134799022873, "grad_norm": 0.4705172448160009, "learning_rate": 3.0318704112333847e-06, "loss": 0.0314, "step": 5996 }, { "epoch": 2.663557628247835, "grad_norm": 0.4749657917164724, "learning_rate": 3.0300885191485797e-06, "loss": 0.0363, "step": 5997 }, { "epoch": 2.6640017765933823, "grad_norm": 0.7383043422980394, "learning_rate": 3.0283069231749344e-06, "loss": 0.0361, "step": 5998 }, { "epoch": 2.6644459249389296, "grad_norm": 0.38814202685884, "learning_rate": 3.026525623580252e-06, "loss": 0.026, "step": 5999 }, { "epoch": 2.664890073284477, "grad_norm": 0.3764976664025451, "learning_rate": 3.0247446206322916e-06, "loss": 0.0321, "step": 6000 }, { "epoch": 2.665334221630024, "grad_norm": 0.45096374436929443, "learning_rate": 3.0229639145987687e-06, "loss": 0.034, "step": 6001 }, { "epoch": 2.665778369975572, "grad_norm": 0.36848556078596323, "learning_rate": 3.021183505747354e-06, "loss": 0.031, "step": 6002 }, { "epoch": 2.6662225183211192, "grad_norm": 0.3699352216981277, "learning_rate": 3.0194033943456696e-06, "loss": 0.028, "step": 6003 }, { "epoch": 2.6666666666666665, "grad_norm": 0.381719196592165, "learning_rate": 3.0176235806613008e-06, "loss": 0.0245, "step": 6004 }, { "epoch": 2.6671108150122143, "grad_norm": 0.4602734928315702, "learning_rate": 3.0158440649617836e-06, "loss": 0.039, "step": 6005 }, { "epoch": 2.6675549633577615, "grad_norm": 0.34646717181854414, "learning_rate": 3.014064847514609e-06, "loss": 0.0248, "step": 6006 }, { "epoch": 2.667999111703309, "grad_norm": 0.41630402405704753, "learning_rate": 3.0122859285872214e-06, "loss": 0.0364, "step": 6007 }, { "epoch": 2.668443260048856, "grad_norm": 0.39925464642353814, "learning_rate": 3.010507308447025e-06, "loss": 0.0286, "step": 6008 }, { "epoch": 2.668887408394404, "grad_norm": 0.5709915993898592, "learning_rate": 3.0087289873613746e-06, "loss": 0.0409, "step": 6009 }, { "epoch": 2.669331556739951, "grad_norm": 0.5153833490415598, "learning_rate": 3.0069509655975835e-06, "loss": 0.0388, "step": 6010 }, { "epoch": 2.6697757050854984, "grad_norm": 0.37873158972984705, "learning_rate": 3.0051732434229185e-06, "loss": 0.0254, "step": 6011 }, { "epoch": 2.670219853431046, "grad_norm": 0.4404773311355675, "learning_rate": 3.003395821104602e-06, "loss": 0.0308, "step": 6012 }, { "epoch": 2.6706640017765935, "grad_norm": 0.5123588004903873, "learning_rate": 3.001618698909809e-06, "loss": 0.032, "step": 6013 }, { "epoch": 2.6711081501221408, "grad_norm": 0.41512949784084213, "learning_rate": 2.999841877105672e-06, "loss": 0.0277, "step": 6014 }, { "epoch": 2.671552298467688, "grad_norm": 0.4025208152025212, "learning_rate": 2.9980653559592775e-06, "loss": 0.0313, "step": 6015 }, { "epoch": 2.671996446813236, "grad_norm": 0.5344242242777247, "learning_rate": 2.996289135737668e-06, "loss": 0.0351, "step": 6016 }, { "epoch": 2.672440595158783, "grad_norm": 0.4264946062568879, "learning_rate": 2.994513216707838e-06, "loss": 0.0263, "step": 6017 }, { "epoch": 2.6728847435043304, "grad_norm": 0.3811830713887272, "learning_rate": 2.992737599136739e-06, "loss": 0.0268, "step": 6018 }, { "epoch": 2.673328891849878, "grad_norm": 0.386704376318305, "learning_rate": 2.9909622832912767e-06, "loss": 0.0315, "step": 6019 }, { "epoch": 2.6737730401954254, "grad_norm": 0.4136667792416568, "learning_rate": 2.989187269438308e-06, "loss": 0.0307, "step": 6020 }, { "epoch": 2.6742171885409727, "grad_norm": 0.4174591961736545, "learning_rate": 2.987412557844653e-06, "loss": 0.0319, "step": 6021 }, { "epoch": 2.67466133688652, "grad_norm": 0.42669426474826655, "learning_rate": 2.985638148777078e-06, "loss": 0.0288, "step": 6022 }, { "epoch": 2.6751054852320673, "grad_norm": 0.3748882818930551, "learning_rate": 2.983864042502308e-06, "loss": 0.0263, "step": 6023 }, { "epoch": 2.675549633577615, "grad_norm": 0.4706253013201654, "learning_rate": 2.9820902392870197e-06, "loss": 0.0284, "step": 6024 }, { "epoch": 2.6759937819231623, "grad_norm": 0.4662239351305156, "learning_rate": 2.980316739397847e-06, "loss": 0.0358, "step": 6025 }, { "epoch": 2.67643793026871, "grad_norm": 0.314532858998878, "learning_rate": 2.9785435431013755e-06, "loss": 0.0232, "step": 6026 }, { "epoch": 2.6768820786142573, "grad_norm": 0.5337532314358315, "learning_rate": 2.97677065066415e-06, "loss": 0.0333, "step": 6027 }, { "epoch": 2.6773262269598046, "grad_norm": 0.5956371023489798, "learning_rate": 2.9749980623526633e-06, "loss": 0.0381, "step": 6028 }, { "epoch": 2.677770375305352, "grad_norm": 0.3325969522067864, "learning_rate": 2.9732257784333673e-06, "loss": 0.0249, "step": 6029 }, { "epoch": 2.678214523650899, "grad_norm": 0.44226896551023326, "learning_rate": 2.9714537991726656e-06, "loss": 0.0321, "step": 6030 }, { "epoch": 2.678658671996447, "grad_norm": 0.407289289939863, "learning_rate": 2.9696821248369152e-06, "loss": 0.0294, "step": 6031 }, { "epoch": 2.679102820341994, "grad_norm": 0.42539794841254347, "learning_rate": 2.9679107556924314e-06, "loss": 0.0312, "step": 6032 }, { "epoch": 2.6795469686875415, "grad_norm": 0.3452430100807996, "learning_rate": 2.966139692005481e-06, "loss": 0.0311, "step": 6033 }, { "epoch": 2.6799911170330892, "grad_norm": 0.47114526962385006, "learning_rate": 2.9643689340422844e-06, "loss": 0.0351, "step": 6034 }, { "epoch": 2.6804352653786365, "grad_norm": 0.3056012283706184, "learning_rate": 2.962598482069015e-06, "loss": 0.0254, "step": 6035 }, { "epoch": 2.680879413724184, "grad_norm": 0.39834900420012637, "learning_rate": 2.960828336351804e-06, "loss": 0.0329, "step": 6036 }, { "epoch": 2.681323562069731, "grad_norm": 0.4921282055431307, "learning_rate": 2.9590584971567327e-06, "loss": 0.0378, "step": 6037 }, { "epoch": 2.681767710415279, "grad_norm": 0.401942181093132, "learning_rate": 2.957288964749839e-06, "loss": 0.0282, "step": 6038 }, { "epoch": 2.682211858760826, "grad_norm": 0.5998754085660062, "learning_rate": 2.955519739397114e-06, "loss": 0.04, "step": 6039 }, { "epoch": 2.6826560071063734, "grad_norm": 0.5229301227786665, "learning_rate": 2.9537508213645026e-06, "loss": 0.0439, "step": 6040 }, { "epoch": 2.683100155451921, "grad_norm": 0.4131624612623256, "learning_rate": 2.9519822109179007e-06, "loss": 0.0324, "step": 6041 }, { "epoch": 2.6835443037974684, "grad_norm": 0.3861923836427622, "learning_rate": 2.950213908323164e-06, "loss": 0.0277, "step": 6042 }, { "epoch": 2.6839884521430157, "grad_norm": 0.4648268575218616, "learning_rate": 2.948445913846094e-06, "loss": 0.0289, "step": 6043 }, { "epoch": 2.684432600488563, "grad_norm": 0.4732042527119129, "learning_rate": 2.9466782277524554e-06, "loss": 0.0309, "step": 6044 }, { "epoch": 2.6848767488341103, "grad_norm": 0.3982899300782419, "learning_rate": 2.944910850307958e-06, "loss": 0.0314, "step": 6045 }, { "epoch": 2.685320897179658, "grad_norm": 0.4153109085037015, "learning_rate": 2.9431437817782705e-06, "loss": 0.0339, "step": 6046 }, { "epoch": 2.6857650455252053, "grad_norm": 0.43324892108850316, "learning_rate": 2.9413770224290126e-06, "loss": 0.0311, "step": 6047 }, { "epoch": 2.686209193870753, "grad_norm": 0.28406951308335854, "learning_rate": 2.9396105725257563e-06, "loss": 0.0231, "step": 6048 }, { "epoch": 2.6866533422163004, "grad_norm": 0.39298370689744644, "learning_rate": 2.9378444323340316e-06, "loss": 0.0304, "step": 6049 }, { "epoch": 2.6870974905618477, "grad_norm": 0.4485933834660012, "learning_rate": 2.9360786021193192e-06, "loss": 0.0285, "step": 6050 }, { "epoch": 2.687541638907395, "grad_norm": 0.35520292176022183, "learning_rate": 2.934313082147053e-06, "loss": 0.0226, "step": 6051 }, { "epoch": 2.6879857872529422, "grad_norm": 0.44687364826159626, "learning_rate": 2.93254787268262e-06, "loss": 0.0357, "step": 6052 }, { "epoch": 2.68842993559849, "grad_norm": 0.5271674075113904, "learning_rate": 2.930782973991362e-06, "loss": 0.0377, "step": 6053 }, { "epoch": 2.6888740839440373, "grad_norm": 0.4626973474509376, "learning_rate": 2.929018386338571e-06, "loss": 0.0502, "step": 6054 }, { "epoch": 2.689318232289585, "grad_norm": 0.4073720832807278, "learning_rate": 2.927254109989499e-06, "loss": 0.0328, "step": 6055 }, { "epoch": 2.6897623806351323, "grad_norm": 0.38757290739790723, "learning_rate": 2.9254901452093424e-06, "loss": 0.031, "step": 6056 }, { "epoch": 2.6902065289806796, "grad_norm": 0.3981460707866334, "learning_rate": 2.923726492263258e-06, "loss": 0.0329, "step": 6057 }, { "epoch": 2.690650677326227, "grad_norm": 0.34586475207518175, "learning_rate": 2.9219631514163514e-06, "loss": 0.0229, "step": 6058 }, { "epoch": 2.691094825671774, "grad_norm": 0.3814494157202918, "learning_rate": 2.9202001229336817e-06, "loss": 0.0252, "step": 6059 }, { "epoch": 2.691538974017322, "grad_norm": 0.5320910876205124, "learning_rate": 2.9184374070802633e-06, "loss": 0.0528, "step": 6060 }, { "epoch": 2.691983122362869, "grad_norm": 0.5320769049550915, "learning_rate": 2.916675004121062e-06, "loss": 0.0374, "step": 6061 }, { "epoch": 2.6924272707084165, "grad_norm": 0.46814435114343156, "learning_rate": 2.9149129143209974e-06, "loss": 0.0366, "step": 6062 }, { "epoch": 2.692871419053964, "grad_norm": 0.46820101387242424, "learning_rate": 2.9131511379449428e-06, "loss": 0.0398, "step": 6063 }, { "epoch": 2.6933155673995115, "grad_norm": 0.42299234767365035, "learning_rate": 2.9113896752577205e-06, "loss": 0.0306, "step": 6064 }, { "epoch": 2.693759715745059, "grad_norm": 0.3540454635472545, "learning_rate": 2.9096285265241063e-06, "loss": 0.0253, "step": 6065 }, { "epoch": 2.694203864090606, "grad_norm": 0.47552195678010667, "learning_rate": 2.9078676920088378e-06, "loss": 0.0304, "step": 6066 }, { "epoch": 2.694648012436154, "grad_norm": 0.41330422015572915, "learning_rate": 2.9061071719765933e-06, "loss": 0.0334, "step": 6067 }, { "epoch": 2.695092160781701, "grad_norm": 0.3903849704575043, "learning_rate": 2.9043469666920088e-06, "loss": 0.0261, "step": 6068 }, { "epoch": 2.6955363091272484, "grad_norm": 0.37706353026097844, "learning_rate": 2.902587076419676e-06, "loss": 0.0294, "step": 6069 }, { "epoch": 2.695980457472796, "grad_norm": 0.3862061240373804, "learning_rate": 2.900827501424133e-06, "loss": 0.0354, "step": 6070 }, { "epoch": 2.6964246058183434, "grad_norm": 0.4432184785361431, "learning_rate": 2.899068241969876e-06, "loss": 0.032, "step": 6071 }, { "epoch": 2.6968687541638907, "grad_norm": 0.4814495852537153, "learning_rate": 2.8973092983213493e-06, "loss": 0.0335, "step": 6072 }, { "epoch": 2.697312902509438, "grad_norm": 0.444449262745695, "learning_rate": 2.8955506707429545e-06, "loss": 0.0428, "step": 6073 }, { "epoch": 2.6977570508549853, "grad_norm": 0.38128392307642117, "learning_rate": 2.8937923594990435e-06, "loss": 0.0253, "step": 6074 }, { "epoch": 2.698201199200533, "grad_norm": 0.4767935660247025, "learning_rate": 2.8920343648539174e-06, "loss": 0.0261, "step": 6075 }, { "epoch": 2.6986453475460803, "grad_norm": 0.5621830464805478, "learning_rate": 2.8902766870718347e-06, "loss": 0.0478, "step": 6076 }, { "epoch": 2.699089495891628, "grad_norm": 0.34049032210297114, "learning_rate": 2.8885193264170036e-06, "loss": 0.0294, "step": 6077 }, { "epoch": 2.6995336442371753, "grad_norm": 0.4750456467312549, "learning_rate": 2.886762283153586e-06, "loss": 0.0371, "step": 6078 }, { "epoch": 2.6999777925827226, "grad_norm": 0.4893300140355996, "learning_rate": 2.885005557545694e-06, "loss": 0.0285, "step": 6079 }, { "epoch": 2.70042194092827, "grad_norm": 0.4397921048743652, "learning_rate": 2.8832491498573965e-06, "loss": 0.0365, "step": 6080 }, { "epoch": 2.700866089273817, "grad_norm": 0.4948308694304273, "learning_rate": 2.8814930603527067e-06, "loss": 0.0318, "step": 6081 }, { "epoch": 2.701310237619365, "grad_norm": 0.4221767743708031, "learning_rate": 2.8797372892955978e-06, "loss": 0.0355, "step": 6082 }, { "epoch": 2.7017543859649122, "grad_norm": 0.548027854837134, "learning_rate": 2.877981836949991e-06, "loss": 0.0286, "step": 6083 }, { "epoch": 2.7021985343104595, "grad_norm": 0.50981889703332, "learning_rate": 2.8762267035797607e-06, "loss": 0.0346, "step": 6084 }, { "epoch": 2.7026426826560073, "grad_norm": 0.49304824695259686, "learning_rate": 2.8744718894487345e-06, "loss": 0.0392, "step": 6085 }, { "epoch": 2.7030868310015546, "grad_norm": 0.33965710652598363, "learning_rate": 2.8727173948206905e-06, "loss": 0.0211, "step": 6086 }, { "epoch": 2.703530979347102, "grad_norm": 0.4906097802841706, "learning_rate": 2.870963219959357e-06, "loss": 0.0339, "step": 6087 }, { "epoch": 2.703975127692649, "grad_norm": 0.45131838531292917, "learning_rate": 2.869209365128417e-06, "loss": 0.0327, "step": 6088 }, { "epoch": 2.704419276038197, "grad_norm": 0.44310343452513123, "learning_rate": 2.8674558305915057e-06, "loss": 0.0413, "step": 6089 }, { "epoch": 2.704863424383744, "grad_norm": 0.4694661668866994, "learning_rate": 2.865702616612208e-06, "loss": 0.0303, "step": 6090 }, { "epoch": 2.7053075727292915, "grad_norm": 0.42182727027273254, "learning_rate": 2.8639497234540646e-06, "loss": 0.0322, "step": 6091 }, { "epoch": 2.705751721074839, "grad_norm": 0.34911280190910793, "learning_rate": 2.862197151380561e-06, "loss": 0.0297, "step": 6092 }, { "epoch": 2.7061958694203865, "grad_norm": 0.5070815814024743, "learning_rate": 2.8604449006551406e-06, "loss": 0.0535, "step": 6093 }, { "epoch": 2.7066400177659338, "grad_norm": 0.6056329049163846, "learning_rate": 2.8586929715411963e-06, "loss": 0.0589, "step": 6094 }, { "epoch": 2.707084166111481, "grad_norm": 0.42386614097773856, "learning_rate": 2.8569413643020725e-06, "loss": 0.0337, "step": 6095 }, { "epoch": 2.707528314457029, "grad_norm": 0.5481706485098858, "learning_rate": 2.855190079201067e-06, "loss": 0.0346, "step": 6096 }, { "epoch": 2.707972462802576, "grad_norm": 0.4317878891723994, "learning_rate": 2.8534391165014275e-06, "loss": 0.0447, "step": 6097 }, { "epoch": 2.7084166111481234, "grad_norm": 0.4182536684547791, "learning_rate": 2.8516884764663512e-06, "loss": 0.0381, "step": 6098 }, { "epoch": 2.708860759493671, "grad_norm": 0.39160951294185736, "learning_rate": 2.849938159358989e-06, "loss": 0.0261, "step": 6099 }, { "epoch": 2.7093049078392184, "grad_norm": 0.3551951087739855, "learning_rate": 2.848188165442446e-06, "loss": 0.0293, "step": 6100 }, { "epoch": 2.7097490561847657, "grad_norm": 0.4415214034913667, "learning_rate": 2.846438494979774e-06, "loss": 0.0374, "step": 6101 }, { "epoch": 2.710193204530313, "grad_norm": 0.6570425809867534, "learning_rate": 2.844689148233979e-06, "loss": 0.0493, "step": 6102 }, { "epoch": 2.7106373528758603, "grad_norm": 0.42365442096686196, "learning_rate": 2.842940125468019e-06, "loss": 0.0382, "step": 6103 }, { "epoch": 2.711081501221408, "grad_norm": 0.4801471324952032, "learning_rate": 2.8411914269447984e-06, "loss": 0.0252, "step": 6104 }, { "epoch": 2.7115256495669553, "grad_norm": 0.3451437825598062, "learning_rate": 2.8394430529271777e-06, "loss": 0.0299, "step": 6105 }, { "epoch": 2.711969797912503, "grad_norm": 0.5219358142266073, "learning_rate": 2.8376950036779683e-06, "loss": 0.0295, "step": 6106 }, { "epoch": 2.7124139462580503, "grad_norm": 0.3693933163750994, "learning_rate": 2.8359472794599307e-06, "loss": 0.0277, "step": 6107 }, { "epoch": 2.7128580946035976, "grad_norm": 0.4033904609462883, "learning_rate": 2.8341998805357796e-06, "loss": 0.0265, "step": 6108 }, { "epoch": 2.713302242949145, "grad_norm": 0.4144874366737247, "learning_rate": 2.832452807168175e-06, "loss": 0.0268, "step": 6109 }, { "epoch": 2.713746391294692, "grad_norm": 0.5838661054136332, "learning_rate": 2.8307060596197337e-06, "loss": 0.0384, "step": 6110 }, { "epoch": 2.71419053964024, "grad_norm": 0.5420801857113182, "learning_rate": 2.8289596381530214e-06, "loss": 0.0305, "step": 6111 }, { "epoch": 2.714634687985787, "grad_norm": 0.42444475722005587, "learning_rate": 2.8272135430305558e-06, "loss": 0.041, "step": 6112 }, { "epoch": 2.7150788363313345, "grad_norm": 0.36578048462525586, "learning_rate": 2.825467774514803e-06, "loss": 0.0224, "step": 6113 }, { "epoch": 2.7155229846768822, "grad_norm": 0.33007914592544324, "learning_rate": 2.823722332868185e-06, "loss": 0.0252, "step": 6114 }, { "epoch": 2.7159671330224295, "grad_norm": 0.5564364593461901, "learning_rate": 2.821977218353067e-06, "loss": 0.0304, "step": 6115 }, { "epoch": 2.716411281367977, "grad_norm": 0.539249473006622, "learning_rate": 2.820232431231771e-06, "loss": 0.0417, "step": 6116 }, { "epoch": 2.716855429713524, "grad_norm": 0.39638502515608776, "learning_rate": 2.818487971766568e-06, "loss": 0.0276, "step": 6117 }, { "epoch": 2.717299578059072, "grad_norm": 0.3923969070351993, "learning_rate": 2.816743840219681e-06, "loss": 0.0345, "step": 6118 }, { "epoch": 2.717743726404619, "grad_norm": 0.7090345157048296, "learning_rate": 2.8150000368532826e-06, "loss": 0.0459, "step": 6119 }, { "epoch": 2.7181878747501664, "grad_norm": 0.49769141697815394, "learning_rate": 2.8132565619294943e-06, "loss": 0.0312, "step": 6120 }, { "epoch": 2.718632023095714, "grad_norm": 0.3567290862892367, "learning_rate": 2.8115134157103906e-06, "loss": 0.0427, "step": 6121 }, { "epoch": 2.7190761714412615, "grad_norm": 0.41308801601010114, "learning_rate": 2.809770598457997e-06, "loss": 0.0308, "step": 6122 }, { "epoch": 2.7195203197868087, "grad_norm": 0.42911807998450446, "learning_rate": 2.8080281104342875e-06, "loss": 0.0389, "step": 6123 }, { "epoch": 2.719964468132356, "grad_norm": 0.3668652303738415, "learning_rate": 2.8062859519011885e-06, "loss": 0.031, "step": 6124 }, { "epoch": 2.7204086164779038, "grad_norm": 0.4729128881363199, "learning_rate": 2.8045441231205773e-06, "loss": 0.0359, "step": 6125 }, { "epoch": 2.720852764823451, "grad_norm": 0.49961860986096596, "learning_rate": 2.802802624354276e-06, "loss": 0.0284, "step": 6126 }, { "epoch": 2.7212969131689984, "grad_norm": 0.38704332861461993, "learning_rate": 2.8010614558640653e-06, "loss": 0.0306, "step": 6127 }, { "epoch": 2.721741061514546, "grad_norm": 0.36762472796404744, "learning_rate": 2.7993206179116706e-06, "loss": 0.0297, "step": 6128 }, { "epoch": 2.7221852098600934, "grad_norm": 0.40237829108230383, "learning_rate": 2.79758011075877e-06, "loss": 0.0379, "step": 6129 }, { "epoch": 2.7226293582056407, "grad_norm": 0.39422460053368863, "learning_rate": 2.7958399346669916e-06, "loss": 0.0324, "step": 6130 }, { "epoch": 2.723073506551188, "grad_norm": 0.4944769619389698, "learning_rate": 2.7941000898979153e-06, "loss": 0.0365, "step": 6131 }, { "epoch": 2.7235176548967353, "grad_norm": 0.43625796235808223, "learning_rate": 2.7923605767130644e-06, "loss": 0.0385, "step": 6132 }, { "epoch": 2.723961803242283, "grad_norm": 0.39500536045009, "learning_rate": 2.790621395373921e-06, "loss": 0.0247, "step": 6133 }, { "epoch": 2.7244059515878303, "grad_norm": 0.4784483734678597, "learning_rate": 2.7888825461419124e-06, "loss": 0.0226, "step": 6134 }, { "epoch": 2.724850099933378, "grad_norm": 0.5298205495609496, "learning_rate": 2.7871440292784167e-06, "loss": 0.0407, "step": 6135 }, { "epoch": 2.7252942482789253, "grad_norm": 0.46575202716742126, "learning_rate": 2.7854058450447657e-06, "loss": 0.0412, "step": 6136 }, { "epoch": 2.7257383966244726, "grad_norm": 0.4738047535073062, "learning_rate": 2.783667993702234e-06, "loss": 0.0385, "step": 6137 }, { "epoch": 2.72618254497002, "grad_norm": 0.3869731324535632, "learning_rate": 2.7819304755120514e-06, "loss": 0.0295, "step": 6138 }, { "epoch": 2.726626693315567, "grad_norm": 0.3652035596631365, "learning_rate": 2.7801932907353966e-06, "loss": 0.0295, "step": 6139 }, { "epoch": 2.727070841661115, "grad_norm": 0.41533452786136, "learning_rate": 2.778456439633398e-06, "loss": 0.0242, "step": 6140 }, { "epoch": 2.727514990006662, "grad_norm": 0.35015673870621206, "learning_rate": 2.776719922467135e-06, "loss": 0.0313, "step": 6141 }, { "epoch": 2.7279591383522095, "grad_norm": 0.48649742529511003, "learning_rate": 2.7749837394976353e-06, "loss": 0.0403, "step": 6142 }, { "epoch": 2.7284032866977572, "grad_norm": 0.4158217744500075, "learning_rate": 2.773247890985874e-06, "loss": 0.0294, "step": 6143 }, { "epoch": 2.7288474350433045, "grad_norm": 0.3886529547809444, "learning_rate": 2.7715123771927817e-06, "loss": 0.0285, "step": 6144 }, { "epoch": 2.729291583388852, "grad_norm": 0.48618253354322, "learning_rate": 2.7697771983792334e-06, "loss": 0.0381, "step": 6145 }, { "epoch": 2.729735731734399, "grad_norm": 0.4239190944164018, "learning_rate": 2.7680423548060574e-06, "loss": 0.028, "step": 6146 }, { "epoch": 2.730179880079947, "grad_norm": 0.6790565769570187, "learning_rate": 2.766307846734032e-06, "loss": 0.0344, "step": 6147 }, { "epoch": 2.730624028425494, "grad_norm": 0.4083045746295625, "learning_rate": 2.764573674423879e-06, "loss": 0.0292, "step": 6148 }, { "epoch": 2.7310681767710414, "grad_norm": 0.39741625214319276, "learning_rate": 2.7628398381362765e-06, "loss": 0.0369, "step": 6149 }, { "epoch": 2.731512325116589, "grad_norm": 0.3364754032489686, "learning_rate": 2.7611063381318483e-06, "loss": 0.0223, "step": 6150 }, { "epoch": 2.7319564734621364, "grad_norm": 0.33915812547001584, "learning_rate": 2.7593731746711695e-06, "loss": 0.0276, "step": 6151 }, { "epoch": 2.7324006218076837, "grad_norm": 0.3730496797762164, "learning_rate": 2.757640348014764e-06, "loss": 0.0274, "step": 6152 }, { "epoch": 2.732844770153231, "grad_norm": 0.3751953423117338, "learning_rate": 2.755907858423108e-06, "loss": 0.0363, "step": 6153 }, { "epoch": 2.7332889184987788, "grad_norm": 0.42372086556075383, "learning_rate": 2.754175706156619e-06, "loss": 0.0244, "step": 6154 }, { "epoch": 2.733733066844326, "grad_norm": 0.3686003953358143, "learning_rate": 2.7524438914756714e-06, "loss": 0.0227, "step": 6155 }, { "epoch": 2.7341772151898733, "grad_norm": 0.38850199266907187, "learning_rate": 2.750712414640588e-06, "loss": 0.0318, "step": 6156 }, { "epoch": 2.734621363535421, "grad_norm": 0.4212753598660164, "learning_rate": 2.748981275911633e-06, "loss": 0.027, "step": 6157 }, { "epoch": 2.7350655118809684, "grad_norm": 0.3950070118539609, "learning_rate": 2.747250475549033e-06, "loss": 0.0347, "step": 6158 }, { "epoch": 2.7355096602265156, "grad_norm": 0.5989665019227255, "learning_rate": 2.745520013812956e-06, "loss": 0.0413, "step": 6159 }, { "epoch": 2.735953808572063, "grad_norm": 0.4076839235525957, "learning_rate": 2.743789890963516e-06, "loss": 0.0239, "step": 6160 }, { "epoch": 2.7363979569176102, "grad_norm": 0.4896550081248881, "learning_rate": 2.742060107260781e-06, "loss": 0.0284, "step": 6161 }, { "epoch": 2.736842105263158, "grad_norm": 0.5961993037262533, "learning_rate": 2.740330662964768e-06, "loss": 0.0399, "step": 6162 }, { "epoch": 2.7372862536087053, "grad_norm": 0.3528802270631205, "learning_rate": 2.7386015583354414e-06, "loss": 0.0271, "step": 6163 }, { "epoch": 2.737730401954253, "grad_norm": 0.38484518351915703, "learning_rate": 2.736872793632717e-06, "loss": 0.0369, "step": 6164 }, { "epoch": 2.7381745502998003, "grad_norm": 0.4319515735643185, "learning_rate": 2.7351443691164537e-06, "loss": 0.0228, "step": 6165 }, { "epoch": 2.7386186986453476, "grad_norm": 0.42098419937374343, "learning_rate": 2.7334162850464645e-06, "loss": 0.0292, "step": 6166 }, { "epoch": 2.739062846990895, "grad_norm": 0.35177937154537675, "learning_rate": 2.7316885416825123e-06, "loss": 0.0218, "step": 6167 }, { "epoch": 2.739506995336442, "grad_norm": 0.41712238277892294, "learning_rate": 2.7299611392843005e-06, "loss": 0.0368, "step": 6168 }, { "epoch": 2.73995114368199, "grad_norm": 0.39108614184700036, "learning_rate": 2.7282340781114926e-06, "loss": 0.0261, "step": 6169 }, { "epoch": 2.740395292027537, "grad_norm": 0.526808362535383, "learning_rate": 2.726507358423695e-06, "loss": 0.0267, "step": 6170 }, { "epoch": 2.7408394403730845, "grad_norm": 0.3749087607755556, "learning_rate": 2.7247809804804593e-06, "loss": 0.0198, "step": 6171 }, { "epoch": 2.741283588718632, "grad_norm": 0.41805354629010627, "learning_rate": 2.723054944541292e-06, "loss": 0.0358, "step": 6172 }, { "epoch": 2.7417277370641795, "grad_norm": 0.37604430696616314, "learning_rate": 2.721329250865646e-06, "loss": 0.0273, "step": 6173 }, { "epoch": 2.742171885409727, "grad_norm": 0.4611005258922287, "learning_rate": 2.719603899712919e-06, "loss": 0.031, "step": 6174 }, { "epoch": 2.742616033755274, "grad_norm": 0.41841947895126935, "learning_rate": 2.7178788913424635e-06, "loss": 0.0315, "step": 6175 }, { "epoch": 2.743060182100822, "grad_norm": 0.41801694735137535, "learning_rate": 2.7161542260135797e-06, "loss": 0.0336, "step": 6176 }, { "epoch": 2.743504330446369, "grad_norm": 0.4765481557698237, "learning_rate": 2.7144299039855105e-06, "loss": 0.0375, "step": 6177 }, { "epoch": 2.7439484787919164, "grad_norm": 0.3580598459132636, "learning_rate": 2.7127059255174504e-06, "loss": 0.0286, "step": 6178 }, { "epoch": 2.744392627137464, "grad_norm": 0.4101523804697921, "learning_rate": 2.7109822908685445e-06, "loss": 0.0274, "step": 6179 }, { "epoch": 2.7448367754830114, "grad_norm": 0.4715955568359928, "learning_rate": 2.7092590002978837e-06, "loss": 0.0345, "step": 6180 }, { "epoch": 2.7452809238285587, "grad_norm": 0.6089181044007468, "learning_rate": 2.70753605406451e-06, "loss": 0.0343, "step": 6181 }, { "epoch": 2.745725072174106, "grad_norm": 0.5806324207660223, "learning_rate": 2.7058134524274083e-06, "loss": 0.0346, "step": 6182 }, { "epoch": 2.7461692205196533, "grad_norm": 0.40045495221538185, "learning_rate": 2.7040911956455153e-06, "loss": 0.0278, "step": 6183 }, { "epoch": 2.746613368865201, "grad_norm": 0.45888251149836196, "learning_rate": 2.702369283977718e-06, "loss": 0.0321, "step": 6184 }, { "epoch": 2.7470575172107483, "grad_norm": 0.39535983779938544, "learning_rate": 2.7006477176828443e-06, "loss": 0.0291, "step": 6185 }, { "epoch": 2.747501665556296, "grad_norm": 0.40307288228268773, "learning_rate": 2.6989264970196795e-06, "loss": 0.031, "step": 6186 }, { "epoch": 2.7479458139018433, "grad_norm": 0.42537830184812087, "learning_rate": 2.697205622246952e-06, "loss": 0.0379, "step": 6187 }, { "epoch": 2.7483899622473906, "grad_norm": 0.7384045353118717, "learning_rate": 2.6954850936233357e-06, "loss": 0.0388, "step": 6188 }, { "epoch": 2.748834110592938, "grad_norm": 0.49237325688101374, "learning_rate": 2.693764911407456e-06, "loss": 0.0357, "step": 6189 }, { "epoch": 2.749278258938485, "grad_norm": 0.43303507384385986, "learning_rate": 2.6920450758578885e-06, "loss": 0.0305, "step": 6190 }, { "epoch": 2.749722407284033, "grad_norm": 0.5198323318463377, "learning_rate": 2.690325587233148e-06, "loss": 0.042, "step": 6191 }, { "epoch": 2.7501665556295802, "grad_norm": 0.5579062089534479, "learning_rate": 2.6886064457917094e-06, "loss": 0.0442, "step": 6192 }, { "epoch": 2.750610703975128, "grad_norm": 0.2992794413549194, "learning_rate": 2.6868876517919845e-06, "loss": 0.0214, "step": 6193 }, { "epoch": 2.7510548523206753, "grad_norm": 0.5197412003207367, "learning_rate": 2.6851692054923385e-06, "loss": 0.0447, "step": 6194 }, { "epoch": 2.7514990006662225, "grad_norm": 0.45629204802490403, "learning_rate": 2.6834511071510823e-06, "loss": 0.0335, "step": 6195 }, { "epoch": 2.75194314901177, "grad_norm": 0.3939710962212484, "learning_rate": 2.681733357026476e-06, "loss": 0.0334, "step": 6196 }, { "epoch": 2.752387297357317, "grad_norm": 0.4395100384242003, "learning_rate": 2.680015955376727e-06, "loss": 0.0269, "step": 6197 }, { "epoch": 2.752831445702865, "grad_norm": 0.35669081976582817, "learning_rate": 2.6782989024599913e-06, "loss": 0.0239, "step": 6198 }, { "epoch": 2.753275594048412, "grad_norm": 0.42101464193049454, "learning_rate": 2.6765821985343676e-06, "loss": 0.0297, "step": 6199 }, { "epoch": 2.7537197423939594, "grad_norm": 0.4443116484682141, "learning_rate": 2.6748658438579075e-06, "loss": 0.0367, "step": 6200 }, { "epoch": 2.754163890739507, "grad_norm": 0.3576485677615678, "learning_rate": 2.6731498386886094e-06, "loss": 0.0269, "step": 6201 }, { "epoch": 2.7546080390850545, "grad_norm": 0.8029786113817223, "learning_rate": 2.6714341832844137e-06, "loss": 0.0366, "step": 6202 }, { "epoch": 2.7550521874306018, "grad_norm": 0.4869826644782611, "learning_rate": 2.6697188779032173e-06, "loss": 0.0293, "step": 6203 }, { "epoch": 2.755496335776149, "grad_norm": 0.43061987045181044, "learning_rate": 2.6680039228028603e-06, "loss": 0.0352, "step": 6204 }, { "epoch": 2.755940484121697, "grad_norm": 0.3632393699444563, "learning_rate": 2.6662893182411255e-06, "loss": 0.0256, "step": 6205 }, { "epoch": 2.756384632467244, "grad_norm": 0.37775571779218736, "learning_rate": 2.6645750644757484e-06, "loss": 0.0342, "step": 6206 }, { "epoch": 2.7568287808127914, "grad_norm": 0.31327481469952234, "learning_rate": 2.6628611617644133e-06, "loss": 0.0209, "step": 6207 }, { "epoch": 2.757272929158339, "grad_norm": 0.5059332837930003, "learning_rate": 2.6611476103647425e-06, "loss": 0.0286, "step": 6208 }, { "epoch": 2.7577170775038864, "grad_norm": 0.39890166964039236, "learning_rate": 2.6594344105343207e-06, "loss": 0.0335, "step": 6209 }, { "epoch": 2.7581612258494337, "grad_norm": 0.40337347616568126, "learning_rate": 2.657721562530664e-06, "loss": 0.0206, "step": 6210 }, { "epoch": 2.758605374194981, "grad_norm": 0.44935773733297946, "learning_rate": 2.656009066611244e-06, "loss": 0.03, "step": 6211 }, { "epoch": 2.7590495225405283, "grad_norm": 0.417217650015585, "learning_rate": 2.654296923033481e-06, "loss": 0.0354, "step": 6212 }, { "epoch": 2.759493670886076, "grad_norm": 0.39156228698655793, "learning_rate": 2.652585132054734e-06, "loss": 0.0301, "step": 6213 }, { "epoch": 2.7599378192316233, "grad_norm": 0.4181623279832991, "learning_rate": 2.6508736939323187e-06, "loss": 0.0416, "step": 6214 }, { "epoch": 2.760381967577171, "grad_norm": 0.38946142096359226, "learning_rate": 2.649162608923493e-06, "loss": 0.022, "step": 6215 }, { "epoch": 2.7608261159227183, "grad_norm": 0.28459219563440136, "learning_rate": 2.6474518772854606e-06, "loss": 0.0211, "step": 6216 }, { "epoch": 2.7612702642682656, "grad_norm": 0.39516529374066633, "learning_rate": 2.6457414992753728e-06, "loss": 0.031, "step": 6217 }, { "epoch": 2.761714412613813, "grad_norm": 0.6507735193198333, "learning_rate": 2.6440314751503314e-06, "loss": 0.0405, "step": 6218 }, { "epoch": 2.76215856095936, "grad_norm": 0.39863052089616485, "learning_rate": 2.6423218051673766e-06, "loss": 0.0257, "step": 6219 }, { "epoch": 2.762602709304908, "grad_norm": 0.45747750899314005, "learning_rate": 2.6406124895835084e-06, "loss": 0.0309, "step": 6220 }, { "epoch": 2.763046857650455, "grad_norm": 0.39469825591698227, "learning_rate": 2.6389035286556598e-06, "loss": 0.0301, "step": 6221 }, { "epoch": 2.7634910059960025, "grad_norm": 0.4493000737157054, "learning_rate": 2.637194922640719e-06, "loss": 0.0336, "step": 6222 }, { "epoch": 2.7639351543415502, "grad_norm": 0.3609277682343467, "learning_rate": 2.6354866717955186e-06, "loss": 0.0303, "step": 6223 }, { "epoch": 2.7643793026870975, "grad_norm": 0.3650445917615703, "learning_rate": 2.6337787763768384e-06, "loss": 0.0242, "step": 6224 }, { "epoch": 2.764823451032645, "grad_norm": 0.48546314397013385, "learning_rate": 2.6320712366414005e-06, "loss": 0.0294, "step": 6225 }, { "epoch": 2.765267599378192, "grad_norm": 0.32486540563930044, "learning_rate": 2.6303640528458834e-06, "loss": 0.0223, "step": 6226 }, { "epoch": 2.76571174772374, "grad_norm": 0.3434852926148082, "learning_rate": 2.6286572252469e-06, "loss": 0.0316, "step": 6227 }, { "epoch": 2.766155896069287, "grad_norm": 0.7142093600067161, "learning_rate": 2.626950754101018e-06, "loss": 0.056, "step": 6228 }, { "epoch": 2.7666000444148344, "grad_norm": 0.6028670535609708, "learning_rate": 2.6252446396647503e-06, "loss": 0.0323, "step": 6229 }, { "epoch": 2.767044192760382, "grad_norm": 0.42675838339191474, "learning_rate": 2.6235388821945497e-06, "loss": 0.0297, "step": 6230 }, { "epoch": 2.7674883411059295, "grad_norm": 0.7192216736608809, "learning_rate": 2.621833481946826e-06, "loss": 0.0357, "step": 6231 }, { "epoch": 2.7679324894514767, "grad_norm": 0.4766794360092108, "learning_rate": 2.6201284391779303e-06, "loss": 0.0331, "step": 6232 }, { "epoch": 2.768376637797024, "grad_norm": 0.4250368407242259, "learning_rate": 2.618423754144155e-06, "loss": 0.0321, "step": 6233 }, { "epoch": 2.7688207861425718, "grad_norm": 0.5212475478820174, "learning_rate": 2.616719427101745e-06, "loss": 0.0324, "step": 6234 }, { "epoch": 2.769264934488119, "grad_norm": 0.3008339483542143, "learning_rate": 2.6150154583068922e-06, "loss": 0.0226, "step": 6235 }, { "epoch": 2.7697090828336663, "grad_norm": 0.3816969610714372, "learning_rate": 2.613311848015725e-06, "loss": 0.0259, "step": 6236 }, { "epoch": 2.770153231179214, "grad_norm": 0.38596290627054197, "learning_rate": 2.611608596484335e-06, "loss": 0.0288, "step": 6237 }, { "epoch": 2.7705973795247614, "grad_norm": 0.4303882315778446, "learning_rate": 2.609905703968742e-06, "loss": 0.0223, "step": 6238 }, { "epoch": 2.7710415278703087, "grad_norm": 0.4053342617572821, "learning_rate": 2.6082031707249223e-06, "loss": 0.03, "step": 6239 }, { "epoch": 2.771485676215856, "grad_norm": 0.7823903528274498, "learning_rate": 2.6065009970087974e-06, "loss": 0.0324, "step": 6240 }, { "epoch": 2.7719298245614032, "grad_norm": 0.42522914693804564, "learning_rate": 2.6047991830762297e-06, "loss": 0.038, "step": 6241 }, { "epoch": 2.772373972906951, "grad_norm": 0.3532987358681675, "learning_rate": 2.60309772918303e-06, "loss": 0.0235, "step": 6242 }, { "epoch": 2.7728181212524983, "grad_norm": 0.33902634374584634, "learning_rate": 2.6013966355849618e-06, "loss": 0.0327, "step": 6243 }, { "epoch": 2.773262269598046, "grad_norm": 0.44149160136100024, "learning_rate": 2.5996959025377224e-06, "loss": 0.0492, "step": 6244 }, { "epoch": 2.7737064179435933, "grad_norm": 0.48103215081730055, "learning_rate": 2.597995530296963e-06, "loss": 0.0267, "step": 6245 }, { "epoch": 2.7741505662891406, "grad_norm": 0.3235934136296388, "learning_rate": 2.5962955191182792e-06, "loss": 0.0183, "step": 6246 }, { "epoch": 2.774594714634688, "grad_norm": 0.3462677290866656, "learning_rate": 2.59459586925721e-06, "loss": 0.0272, "step": 6247 }, { "epoch": 2.775038862980235, "grad_norm": 0.32294840719861506, "learning_rate": 2.592896580969242e-06, "loss": 0.0281, "step": 6248 }, { "epoch": 2.775483011325783, "grad_norm": 0.3499956168537934, "learning_rate": 2.591197654509807e-06, "loss": 0.0309, "step": 6249 }, { "epoch": 2.77592715967133, "grad_norm": 0.40008372827541966, "learning_rate": 2.5894990901342833e-06, "loss": 0.0268, "step": 6250 }, { "epoch": 2.7763713080168775, "grad_norm": 0.40878589097636386, "learning_rate": 2.587800888097993e-06, "loss": 0.0313, "step": 6251 }, { "epoch": 2.776815456362425, "grad_norm": 0.7688538094871912, "learning_rate": 2.5861030486562084e-06, "loss": 0.0479, "step": 6252 }, { "epoch": 2.7772596047079725, "grad_norm": 0.40809425302097857, "learning_rate": 2.5844055720641357e-06, "loss": 0.0311, "step": 6253 }, { "epoch": 2.77770375305352, "grad_norm": 0.37450235777366314, "learning_rate": 2.5827084585769436e-06, "loss": 0.0286, "step": 6254 }, { "epoch": 2.778147901399067, "grad_norm": 0.3661400531713744, "learning_rate": 2.581011708449731e-06, "loss": 0.0271, "step": 6255 }, { "epoch": 2.778592049744615, "grad_norm": 0.37170138943829045, "learning_rate": 2.57931532193755e-06, "loss": 0.0346, "step": 6256 }, { "epoch": 2.779036198090162, "grad_norm": 0.30626642522716074, "learning_rate": 2.577619299295398e-06, "loss": 0.026, "step": 6257 }, { "epoch": 2.7794803464357094, "grad_norm": 0.3007717640940463, "learning_rate": 2.5759236407782128e-06, "loss": 0.0247, "step": 6258 }, { "epoch": 2.779924494781257, "grad_norm": 0.5105689813889596, "learning_rate": 2.5742283466408803e-06, "loss": 0.0477, "step": 6259 }, { "epoch": 2.7803686431268044, "grad_norm": 0.49116305516555464, "learning_rate": 2.572533417138237e-06, "loss": 0.0257, "step": 6260 }, { "epoch": 2.7808127914723517, "grad_norm": 0.356325832140254, "learning_rate": 2.570838852525055e-06, "loss": 0.0263, "step": 6261 }, { "epoch": 2.781256939817899, "grad_norm": 0.4435162531660302, "learning_rate": 2.569144653056058e-06, "loss": 0.0435, "step": 6262 }, { "epoch": 2.7817010881634467, "grad_norm": 0.5419805855499744, "learning_rate": 2.5674508189859147e-06, "loss": 0.0314, "step": 6263 }, { "epoch": 2.782145236508994, "grad_norm": 0.3656656264231573, "learning_rate": 2.565757350569233e-06, "loss": 0.0241, "step": 6264 }, { "epoch": 2.7825893848545413, "grad_norm": 0.43315585203736967, "learning_rate": 2.5640642480605722e-06, "loss": 0.0372, "step": 6265 }, { "epoch": 2.783033533200089, "grad_norm": 0.6118620035354352, "learning_rate": 2.5623715117144337e-06, "loss": 0.0389, "step": 6266 }, { "epoch": 2.7834776815456364, "grad_norm": 1.003007529298919, "learning_rate": 2.5606791417852655e-06, "loss": 0.0489, "step": 6267 }, { "epoch": 2.7839218298911836, "grad_norm": 0.335632805390222, "learning_rate": 2.558987138527461e-06, "loss": 0.0298, "step": 6268 }, { "epoch": 2.784365978236731, "grad_norm": 0.49218374523384917, "learning_rate": 2.5572955021953525e-06, "loss": 0.0504, "step": 6269 }, { "epoch": 2.7848101265822782, "grad_norm": 0.3662564117457857, "learning_rate": 2.555604233043224e-06, "loss": 0.024, "step": 6270 }, { "epoch": 2.785254274927826, "grad_norm": 0.4410048606518427, "learning_rate": 2.553913331325305e-06, "loss": 0.0308, "step": 6271 }, { "epoch": 2.7856984232733732, "grad_norm": 0.6748277215653389, "learning_rate": 2.5522227972957626e-06, "loss": 0.0353, "step": 6272 }, { "epoch": 2.786142571618921, "grad_norm": 0.4988731713313121, "learning_rate": 2.550532631208713e-06, "loss": 0.0262, "step": 6273 }, { "epoch": 2.7865867199644683, "grad_norm": 0.3910346837842304, "learning_rate": 2.5488428333182213e-06, "loss": 0.0369, "step": 6274 }, { "epoch": 2.7870308683100156, "grad_norm": 0.5189622685954451, "learning_rate": 2.5471534038782876e-06, "loss": 0.0365, "step": 6275 }, { "epoch": 2.787475016655563, "grad_norm": 0.4108944491644082, "learning_rate": 2.545464343142862e-06, "loss": 0.0322, "step": 6276 }, { "epoch": 2.78791916500111, "grad_norm": 0.3860699292761844, "learning_rate": 2.543775651365844e-06, "loss": 0.0232, "step": 6277 }, { "epoch": 2.788363313346658, "grad_norm": 0.47557488338460535, "learning_rate": 2.5420873288010682e-06, "loss": 0.0387, "step": 6278 }, { "epoch": 2.788807461692205, "grad_norm": 0.3839192710393702, "learning_rate": 2.5403993757023193e-06, "loss": 0.0261, "step": 6279 }, { "epoch": 2.7892516100377525, "grad_norm": 0.4089956846684774, "learning_rate": 2.538711792323328e-06, "loss": 0.0393, "step": 6280 }, { "epoch": 2.7896957583833, "grad_norm": 0.3534578778564665, "learning_rate": 2.5370245789177615e-06, "loss": 0.0234, "step": 6281 }, { "epoch": 2.7901399067288475, "grad_norm": 0.38317967580423545, "learning_rate": 2.53533773573924e-06, "loss": 0.0264, "step": 6282 }, { "epoch": 2.7905840550743948, "grad_norm": 0.42936934153083767, "learning_rate": 2.533651263041324e-06, "loss": 0.0393, "step": 6283 }, { "epoch": 2.791028203419942, "grad_norm": 0.554120368450601, "learning_rate": 2.5319651610775194e-06, "loss": 0.0317, "step": 6284 }, { "epoch": 2.79147235176549, "grad_norm": 0.4081200054049489, "learning_rate": 2.5302794301012766e-06, "loss": 0.0297, "step": 6285 }, { "epoch": 2.791916500111037, "grad_norm": 0.7070187464980049, "learning_rate": 2.528594070365988e-06, "loss": 0.0356, "step": 6286 }, { "epoch": 2.7923606484565844, "grad_norm": 0.45395646778800836, "learning_rate": 2.52690908212499e-06, "loss": 0.0344, "step": 6287 }, { "epoch": 2.792804796802132, "grad_norm": 0.7713383991827947, "learning_rate": 2.525224465631571e-06, "loss": 0.0323, "step": 6288 }, { "epoch": 2.7932489451476794, "grad_norm": 0.43262436851175007, "learning_rate": 2.5235402211389525e-06, "loss": 0.0311, "step": 6289 }, { "epoch": 2.7936930934932267, "grad_norm": 0.9524822174469669, "learning_rate": 2.5218563489003062e-06, "loss": 0.0307, "step": 6290 }, { "epoch": 2.794137241838774, "grad_norm": 0.35257313740001833, "learning_rate": 2.520172849168749e-06, "loss": 0.0321, "step": 6291 }, { "epoch": 2.7945813901843217, "grad_norm": 0.4825645734393702, "learning_rate": 2.518489722197335e-06, "loss": 0.0374, "step": 6292 }, { "epoch": 2.795025538529869, "grad_norm": 0.7297573078351898, "learning_rate": 2.51680696823907e-06, "loss": 0.041, "step": 6293 }, { "epoch": 2.7954696868754163, "grad_norm": 0.4555687972584318, "learning_rate": 2.5151245875468993e-06, "loss": 0.0386, "step": 6294 }, { "epoch": 2.795913835220964, "grad_norm": 0.4970797678676715, "learning_rate": 2.5134425803737137e-06, "loss": 0.0418, "step": 6295 }, { "epoch": 2.7963579835665113, "grad_norm": 0.6816809012387476, "learning_rate": 2.511760946972348e-06, "loss": 0.0409, "step": 6296 }, { "epoch": 2.7968021319120586, "grad_norm": 0.5264337163942395, "learning_rate": 2.5100796875955815e-06, "loss": 0.0473, "step": 6297 }, { "epoch": 2.797246280257606, "grad_norm": 0.5625709988931173, "learning_rate": 2.508398802496132e-06, "loss": 0.0372, "step": 6298 }, { "epoch": 2.797690428603153, "grad_norm": 0.5237928838120754, "learning_rate": 2.5067182919266676e-06, "loss": 0.0311, "step": 6299 }, { "epoch": 2.798134576948701, "grad_norm": 0.4778467003244999, "learning_rate": 2.5050381561397974e-06, "loss": 0.0429, "step": 6300 }, { "epoch": 2.7985787252942482, "grad_norm": 0.43069407371279944, "learning_rate": 2.503358395388074e-06, "loss": 0.0435, "step": 6301 }, { "epoch": 2.799022873639796, "grad_norm": 0.35842408526989356, "learning_rate": 2.501679009923997e-06, "loss": 0.0279, "step": 6302 }, { "epoch": 2.7994670219853433, "grad_norm": 0.3642207986414275, "learning_rate": 2.5000000000000015e-06, "loss": 0.0311, "step": 6303 }, { "epoch": 2.7999111703308905, "grad_norm": 0.655911709523861, "learning_rate": 2.498321365868471e-06, "loss": 0.036, "step": 6304 }, { "epoch": 2.800355318676438, "grad_norm": 0.3959150499907849, "learning_rate": 2.49664310778174e-06, "loss": 0.0253, "step": 6305 }, { "epoch": 2.800799467021985, "grad_norm": 0.5170912208789272, "learning_rate": 2.4949652259920727e-06, "loss": 0.0279, "step": 6306 }, { "epoch": 2.801243615367533, "grad_norm": 0.4192727714282743, "learning_rate": 2.4932877207516844e-06, "loss": 0.0288, "step": 6307 }, { "epoch": 2.80168776371308, "grad_norm": 0.4736817072181479, "learning_rate": 2.4916105923127355e-06, "loss": 0.0395, "step": 6308 }, { "epoch": 2.8021319120586274, "grad_norm": 0.4164208273038199, "learning_rate": 2.489933840927323e-06, "loss": 0.025, "step": 6309 }, { "epoch": 2.802576060404175, "grad_norm": 0.4237935178364351, "learning_rate": 2.4882574668474925e-06, "loss": 0.0326, "step": 6310 }, { "epoch": 2.8030202087497225, "grad_norm": 0.3766334421419762, "learning_rate": 2.486581470325232e-06, "loss": 0.0228, "step": 6311 }, { "epoch": 2.8034643570952698, "grad_norm": 0.41806136620537815, "learning_rate": 2.484905851612471e-06, "loss": 0.0364, "step": 6312 }, { "epoch": 2.803908505440817, "grad_norm": 0.5567970380481517, "learning_rate": 2.4832306109610877e-06, "loss": 0.0575, "step": 6313 }, { "epoch": 2.804352653786365, "grad_norm": 0.38489435628632473, "learning_rate": 2.4815557486228937e-06, "loss": 0.0243, "step": 6314 }, { "epoch": 2.804796802131912, "grad_norm": 0.33130559466084525, "learning_rate": 2.479881264849651e-06, "loss": 0.0237, "step": 6315 }, { "epoch": 2.8052409504774594, "grad_norm": 1.680243538004499, "learning_rate": 2.478207159893064e-06, "loss": 0.032, "step": 6316 }, { "epoch": 2.805685098823007, "grad_norm": 0.48052932859070213, "learning_rate": 2.476533434004779e-06, "loss": 0.0311, "step": 6317 }, { "epoch": 2.8061292471685544, "grad_norm": 0.36896833958182274, "learning_rate": 2.474860087436384e-06, "loss": 0.0261, "step": 6318 }, { "epoch": 2.8065733955141017, "grad_norm": 0.633026238867653, "learning_rate": 2.4731871204394155e-06, "loss": 0.048, "step": 6319 }, { "epoch": 2.807017543859649, "grad_norm": 0.4418796227317967, "learning_rate": 2.4715145332653433e-06, "loss": 0.0359, "step": 6320 }, { "epoch": 2.8074616922051967, "grad_norm": 0.4502178155781612, "learning_rate": 2.4698423261655887e-06, "loss": 0.0306, "step": 6321 }, { "epoch": 2.807905840550744, "grad_norm": 0.4566900893012358, "learning_rate": 2.468170499391512e-06, "loss": 0.0362, "step": 6322 }, { "epoch": 2.8083499888962913, "grad_norm": 0.38715597816758157, "learning_rate": 2.4664990531944176e-06, "loss": 0.0404, "step": 6323 }, { "epoch": 2.808794137241839, "grad_norm": 0.43966470207668296, "learning_rate": 2.4648279878255523e-06, "loss": 0.0258, "step": 6324 }, { "epoch": 2.8092382855873863, "grad_norm": 0.3596530155746306, "learning_rate": 2.4631573035361073e-06, "loss": 0.0294, "step": 6325 }, { "epoch": 2.8096824339329336, "grad_norm": 0.44583220946986085, "learning_rate": 2.4614870005772105e-06, "loss": 0.0223, "step": 6326 }, { "epoch": 2.810126582278481, "grad_norm": 0.5911442996714825, "learning_rate": 2.45981707919994e-06, "loss": 0.0393, "step": 6327 }, { "epoch": 2.810570730624028, "grad_norm": 0.5581799888907024, "learning_rate": 2.458147539655313e-06, "loss": 0.0326, "step": 6328 }, { "epoch": 2.811014878969576, "grad_norm": 0.39311733798131593, "learning_rate": 2.4564783821942884e-06, "loss": 0.029, "step": 6329 }, { "epoch": 2.811459027315123, "grad_norm": 0.4450796457627704, "learning_rate": 2.454809607067772e-06, "loss": 0.0369, "step": 6330 }, { "epoch": 2.811903175660671, "grad_norm": 0.43590500853966985, "learning_rate": 2.4531412145266055e-06, "loss": 0.0386, "step": 6331 }, { "epoch": 2.8123473240062182, "grad_norm": 0.457972822762452, "learning_rate": 2.4514732048215774e-06, "loss": 0.042, "step": 6332 }, { "epoch": 2.8127914723517655, "grad_norm": 0.32284714510978635, "learning_rate": 2.4498055782034187e-06, "loss": 0.0334, "step": 6333 }, { "epoch": 2.813235620697313, "grad_norm": 0.36527473928195653, "learning_rate": 2.4481383349228016e-06, "loss": 0.0253, "step": 6334 }, { "epoch": 2.81367976904286, "grad_norm": 0.3860105760601358, "learning_rate": 2.446471475230342e-06, "loss": 0.0391, "step": 6335 }, { "epoch": 2.814123917388408, "grad_norm": 0.41855117247115486, "learning_rate": 2.4448049993765975e-06, "loss": 0.0296, "step": 6336 }, { "epoch": 2.814568065733955, "grad_norm": 0.4146118074577428, "learning_rate": 2.4431389076120657e-06, "loss": 0.0325, "step": 6337 }, { "epoch": 2.8150122140795024, "grad_norm": 0.3990029724898874, "learning_rate": 2.4414732001871892e-06, "loss": 0.0337, "step": 6338 }, { "epoch": 2.81545636242505, "grad_norm": 0.5638606230897282, "learning_rate": 2.4398078773523526e-06, "loss": 0.0445, "step": 6339 }, { "epoch": 2.8159005107705974, "grad_norm": 0.45902483366087526, "learning_rate": 2.438142939357882e-06, "loss": 0.0417, "step": 6340 }, { "epoch": 2.8163446591161447, "grad_norm": 0.3183617092675609, "learning_rate": 2.4364783864540482e-06, "loss": 0.0252, "step": 6341 }, { "epoch": 2.816788807461692, "grad_norm": 0.3554967849352285, "learning_rate": 2.434814218891057e-06, "loss": 0.0253, "step": 6342 }, { "epoch": 2.8172329558072398, "grad_norm": 0.39321716328224376, "learning_rate": 2.433150436919064e-06, "loss": 0.0272, "step": 6343 }, { "epoch": 2.817677104152787, "grad_norm": 0.5605795416471407, "learning_rate": 2.4314870407881637e-06, "loss": 0.0348, "step": 6344 }, { "epoch": 2.8181212524983343, "grad_norm": 0.3732190806583137, "learning_rate": 2.4298240307483923e-06, "loss": 0.0233, "step": 6345 }, { "epoch": 2.818565400843882, "grad_norm": 0.32751466948341074, "learning_rate": 2.4281614070497282e-06, "loss": 0.0284, "step": 6346 }, { "epoch": 2.8190095491894294, "grad_norm": 0.38489431013084585, "learning_rate": 2.4264991699420953e-06, "loss": 0.0235, "step": 6347 }, { "epoch": 2.8194536975349767, "grad_norm": 0.5207677415942126, "learning_rate": 2.4248373196753512e-06, "loss": 0.0362, "step": 6348 }, { "epoch": 2.819897845880524, "grad_norm": 0.39019062549891126, "learning_rate": 2.423175856499302e-06, "loss": 0.0221, "step": 6349 }, { "epoch": 2.8203419942260712, "grad_norm": 0.4152809637919385, "learning_rate": 2.4215147806636942e-06, "loss": 0.0314, "step": 6350 }, { "epoch": 2.820786142571619, "grad_norm": 0.4000109524825699, "learning_rate": 2.4198540924182156e-06, "loss": 0.0307, "step": 6351 }, { "epoch": 2.8212302909171663, "grad_norm": 0.4388997785747502, "learning_rate": 2.4181937920124966e-06, "loss": 0.0298, "step": 6352 }, { "epoch": 2.821674439262714, "grad_norm": 0.4019919408619044, "learning_rate": 2.4165338796961093e-06, "loss": 0.033, "step": 6353 }, { "epoch": 2.8221185876082613, "grad_norm": 0.570406566396068, "learning_rate": 2.414874355718563e-06, "loss": 0.0379, "step": 6354 }, { "epoch": 2.8225627359538086, "grad_norm": 0.4407385335769093, "learning_rate": 2.413215220329315e-06, "loss": 0.0282, "step": 6355 }, { "epoch": 2.823006884299356, "grad_norm": 0.38569267409608843, "learning_rate": 2.411556473777761e-06, "loss": 0.0282, "step": 6356 }, { "epoch": 2.823451032644903, "grad_norm": 0.4606495839160357, "learning_rate": 2.4098981163132395e-06, "loss": 0.0342, "step": 6357 }, { "epoch": 2.823895180990451, "grad_norm": 0.43263213722768684, "learning_rate": 2.4082401481850306e-06, "loss": 0.0369, "step": 6358 }, { "epoch": 2.824339329335998, "grad_norm": 0.5551215018246358, "learning_rate": 2.4065825696423522e-06, "loss": 0.0327, "step": 6359 }, { "epoch": 2.8247834776815455, "grad_norm": 0.3806554615954976, "learning_rate": 2.4049253809343678e-06, "loss": 0.0333, "step": 6360 }, { "epoch": 2.825227626027093, "grad_norm": 0.4594230517569721, "learning_rate": 2.4032685823101814e-06, "loss": 0.0302, "step": 6361 }, { "epoch": 2.8256717743726405, "grad_norm": 0.49282566341591966, "learning_rate": 2.4016121740188375e-06, "loss": 0.0283, "step": 6362 }, { "epoch": 2.826115922718188, "grad_norm": 0.37575563744052526, "learning_rate": 2.3999561563093234e-06, "loss": 0.0349, "step": 6363 }, { "epoch": 2.826560071063735, "grad_norm": 0.4760513100430366, "learning_rate": 2.3983005294305673e-06, "loss": 0.0374, "step": 6364 }, { "epoch": 2.827004219409283, "grad_norm": 0.395278083467666, "learning_rate": 2.396645293631435e-06, "loss": 0.0248, "step": 6365 }, { "epoch": 2.82744836775483, "grad_norm": 0.4876360922819094, "learning_rate": 2.3949904491607384e-06, "loss": 0.0342, "step": 6366 }, { "epoch": 2.8278925161003774, "grad_norm": 0.40885930978989254, "learning_rate": 2.393335996267229e-06, "loss": 0.0325, "step": 6367 }, { "epoch": 2.828336664445925, "grad_norm": 0.44935957035165386, "learning_rate": 2.3916819351995984e-06, "loss": 0.0273, "step": 6368 }, { "epoch": 2.8287808127914724, "grad_norm": 1.506178960229344, "learning_rate": 2.3900282662064806e-06, "loss": 0.059, "step": 6369 }, { "epoch": 2.8292249611370197, "grad_norm": 0.30959880923138633, "learning_rate": 2.3883749895364523e-06, "loss": 0.0217, "step": 6370 }, { "epoch": 2.829669109482567, "grad_norm": 0.7823253783794836, "learning_rate": 2.3867221054380244e-06, "loss": 0.0329, "step": 6371 }, { "epoch": 2.8301132578281147, "grad_norm": 0.4806013956209637, "learning_rate": 2.3850696141596563e-06, "loss": 0.0455, "step": 6372 }, { "epoch": 2.830557406173662, "grad_norm": 0.6896449670646502, "learning_rate": 2.3834175159497446e-06, "loss": 0.0562, "step": 6373 }, { "epoch": 2.8310015545192093, "grad_norm": 0.4709885726846783, "learning_rate": 2.3817658110566288e-06, "loss": 0.0326, "step": 6374 }, { "epoch": 2.831445702864757, "grad_norm": 0.4631209021676538, "learning_rate": 2.380114499728589e-06, "loss": 0.0265, "step": 6375 }, { "epoch": 2.8318898512103043, "grad_norm": 0.4360372773811446, "learning_rate": 2.3784635822138424e-06, "loss": 0.0409, "step": 6376 }, { "epoch": 2.8323339995558516, "grad_norm": 0.36688129563252986, "learning_rate": 2.3768130587605513e-06, "loss": 0.0248, "step": 6377 }, { "epoch": 2.832778147901399, "grad_norm": 0.37437289245252986, "learning_rate": 2.3751629296168177e-06, "loss": 0.0305, "step": 6378 }, { "epoch": 2.833222296246946, "grad_norm": 0.531778233535108, "learning_rate": 2.3735131950306845e-06, "loss": 0.0352, "step": 6379 }, { "epoch": 2.833666444592494, "grad_norm": 0.4416928778863043, "learning_rate": 2.371863855250134e-06, "loss": 0.0356, "step": 6380 }, { "epoch": 2.8341105929380412, "grad_norm": 0.3539473400154312, "learning_rate": 2.3702149105230914e-06, "loss": 0.0329, "step": 6381 }, { "epoch": 2.834554741283589, "grad_norm": 0.8474469374112829, "learning_rate": 2.3685663610974193e-06, "loss": 0.0339, "step": 6382 }, { "epoch": 2.8349988896291363, "grad_norm": 0.42747029305832396, "learning_rate": 2.3669182072209225e-06, "loss": 0.0381, "step": 6383 }, { "epoch": 2.8354430379746836, "grad_norm": 0.7608488814785124, "learning_rate": 2.3652704491413477e-06, "loss": 0.032, "step": 6384 }, { "epoch": 2.835887186320231, "grad_norm": 0.388700958157769, "learning_rate": 2.3636230871063803e-06, "loss": 0.0273, "step": 6385 }, { "epoch": 2.836331334665778, "grad_norm": 0.5022022192841024, "learning_rate": 2.3619761213636496e-06, "loss": 0.0437, "step": 6386 }, { "epoch": 2.836775483011326, "grad_norm": 0.34657365126440165, "learning_rate": 2.360329552160718e-06, "loss": 0.0246, "step": 6387 }, { "epoch": 2.837219631356873, "grad_norm": 0.3939450249882695, "learning_rate": 2.358683379745094e-06, "loss": 0.0267, "step": 6388 }, { "epoch": 2.8376637797024205, "grad_norm": 0.41677776471298256, "learning_rate": 2.357037604364229e-06, "loss": 0.0324, "step": 6389 }, { "epoch": 2.838107928047968, "grad_norm": 0.3562978131516035, "learning_rate": 2.3553922262655045e-06, "loss": 0.0289, "step": 6390 }, { "epoch": 2.8385520763935155, "grad_norm": 0.3800097306965494, "learning_rate": 2.3537472456962536e-06, "loss": 0.0269, "step": 6391 }, { "epoch": 2.8389962247390628, "grad_norm": 0.3927488750912727, "learning_rate": 2.3521026629037456e-06, "loss": 0.032, "step": 6392 }, { "epoch": 2.83944037308461, "grad_norm": 0.6796389696750893, "learning_rate": 2.3504584781351857e-06, "loss": 0.035, "step": 6393 }, { "epoch": 2.839884521430158, "grad_norm": 0.4312865368452842, "learning_rate": 2.3488146916377246e-06, "loss": 0.024, "step": 6394 }, { "epoch": 2.840328669775705, "grad_norm": 0.5503469902463891, "learning_rate": 2.3471713036584507e-06, "loss": 0.0387, "step": 6395 }, { "epoch": 2.8407728181212524, "grad_norm": 0.39537225808331916, "learning_rate": 2.345528314444394e-06, "loss": 0.0365, "step": 6396 }, { "epoch": 2.8412169664668, "grad_norm": 0.4828036261450407, "learning_rate": 2.343885724242523e-06, "loss": 0.0359, "step": 6397 }, { "epoch": 2.8416611148123474, "grad_norm": 0.3405572301380993, "learning_rate": 2.342243533299749e-06, "loss": 0.0296, "step": 6398 }, { "epoch": 2.8421052631578947, "grad_norm": 0.3471570803326242, "learning_rate": 2.3406017418629173e-06, "loss": 0.0295, "step": 6399 }, { "epoch": 2.842549411503442, "grad_norm": 0.30534728377719983, "learning_rate": 2.3389603501788187e-06, "loss": 0.0161, "step": 6400 }, { "epoch": 2.8429935598489897, "grad_norm": 0.4261866679259168, "learning_rate": 2.3373193584941833e-06, "loss": 0.0213, "step": 6401 }, { "epoch": 2.843437708194537, "grad_norm": 0.5685316006056542, "learning_rate": 2.335678767055679e-06, "loss": 0.0434, "step": 6402 }, { "epoch": 2.8438818565400843, "grad_norm": 0.49631869614829277, "learning_rate": 2.334038576109917e-06, "loss": 0.0356, "step": 6403 }, { "epoch": 2.844326004885632, "grad_norm": 0.5515165392461051, "learning_rate": 2.332398785903442e-06, "loss": 0.0363, "step": 6404 }, { "epoch": 2.8447701532311793, "grad_norm": 0.532355259631848, "learning_rate": 2.330759396682744e-06, "loss": 0.0399, "step": 6405 }, { "epoch": 2.8452143015767266, "grad_norm": 0.5194908272702733, "learning_rate": 2.329120408694253e-06, "loss": 0.0419, "step": 6406 }, { "epoch": 2.845658449922274, "grad_norm": 0.4352160286424336, "learning_rate": 2.327481822184331e-06, "loss": 0.0323, "step": 6407 }, { "epoch": 2.846102598267821, "grad_norm": 0.40556462446399455, "learning_rate": 2.3258436373992914e-06, "loss": 0.0295, "step": 6408 }, { "epoch": 2.846546746613369, "grad_norm": 0.48488189817066807, "learning_rate": 2.3242058545853806e-06, "loss": 0.0323, "step": 6409 }, { "epoch": 2.846990894958916, "grad_norm": 0.7693171033124235, "learning_rate": 2.322568473988782e-06, "loss": 0.0304, "step": 6410 }, { "epoch": 2.847435043304464, "grad_norm": 0.6057507153145041, "learning_rate": 2.3209314958556232e-06, "loss": 0.0262, "step": 6411 }, { "epoch": 2.8478791916500112, "grad_norm": 0.4479104368079513, "learning_rate": 2.319294920431972e-06, "loss": 0.0363, "step": 6412 }, { "epoch": 2.8483233399955585, "grad_norm": 0.5011576633143359, "learning_rate": 2.317658747963828e-06, "loss": 0.0415, "step": 6413 }, { "epoch": 2.848767488341106, "grad_norm": 0.43395301260402036, "learning_rate": 2.316022978697143e-06, "loss": 0.0305, "step": 6414 }, { "epoch": 2.849211636686653, "grad_norm": 0.4161396554904535, "learning_rate": 2.314387612877795e-06, "loss": 0.031, "step": 6415 }, { "epoch": 2.849655785032201, "grad_norm": 0.453771160365058, "learning_rate": 2.312752650751609e-06, "loss": 0.0328, "step": 6416 }, { "epoch": 2.850099933377748, "grad_norm": 0.4057605385720707, "learning_rate": 2.3111180925643477e-06, "loss": 0.0308, "step": 6417 }, { "epoch": 2.8505440817232954, "grad_norm": 0.4469537485385078, "learning_rate": 2.309483938561714e-06, "loss": 0.0345, "step": 6418 }, { "epoch": 2.850988230068843, "grad_norm": 0.3300098185481985, "learning_rate": 2.3078501889893477e-06, "loss": 0.028, "step": 6419 }, { "epoch": 2.8514323784143905, "grad_norm": 0.36465566926853343, "learning_rate": 2.3062168440928324e-06, "loss": 0.0299, "step": 6420 }, { "epoch": 2.8518765267599377, "grad_norm": 0.39053368090966234, "learning_rate": 2.304583904117682e-06, "loss": 0.029, "step": 6421 }, { "epoch": 2.852320675105485, "grad_norm": 0.3264096837914728, "learning_rate": 2.302951369309358e-06, "loss": 0.0245, "step": 6422 }, { "epoch": 2.8527648234510328, "grad_norm": 0.4328229716057836, "learning_rate": 2.30131923991326e-06, "loss": 0.0383, "step": 6423 }, { "epoch": 2.85320897179658, "grad_norm": 0.4077375134882527, "learning_rate": 2.2996875161747194e-06, "loss": 0.038, "step": 6424 }, { "epoch": 2.8536531201421274, "grad_norm": 0.4094918043040497, "learning_rate": 2.298056198339017e-06, "loss": 0.0298, "step": 6425 }, { "epoch": 2.854097268487675, "grad_norm": 0.32420798744454565, "learning_rate": 2.296425286651368e-06, "loss": 0.0335, "step": 6426 }, { "epoch": 2.8545414168332224, "grad_norm": 0.4249764666477243, "learning_rate": 2.294794781356922e-06, "loss": 0.0373, "step": 6427 }, { "epoch": 2.8549855651787697, "grad_norm": 0.8704460304111548, "learning_rate": 2.293164682700774e-06, "loss": 0.0456, "step": 6428 }, { "epoch": 2.855429713524317, "grad_norm": 0.5067415787866943, "learning_rate": 2.2915349909279573e-06, "loss": 0.0348, "step": 6429 }, { "epoch": 2.8558738618698647, "grad_norm": 0.38730263257871017, "learning_rate": 2.2899057062834363e-06, "loss": 0.0508, "step": 6430 }, { "epoch": 2.856318010215412, "grad_norm": 0.426646849291928, "learning_rate": 2.2882768290121277e-06, "loss": 0.03, "step": 6431 }, { "epoch": 2.8567621585609593, "grad_norm": 0.4433931975798466, "learning_rate": 2.286648359358874e-06, "loss": 0.0361, "step": 6432 }, { "epoch": 2.857206306906507, "grad_norm": 0.41545375813689084, "learning_rate": 2.2850202975684637e-06, "loss": 0.0302, "step": 6433 }, { "epoch": 2.8576504552520543, "grad_norm": 0.3777035808530584, "learning_rate": 2.283392643885624e-06, "loss": 0.0287, "step": 6434 }, { "epoch": 2.8580946035976016, "grad_norm": 0.7240655817753611, "learning_rate": 2.2817653985550132e-06, "loss": 0.0336, "step": 6435 }, { "epoch": 2.858538751943149, "grad_norm": 0.3261261166151642, "learning_rate": 2.2801385618212395e-06, "loss": 0.0244, "step": 6436 }, { "epoch": 2.858982900288696, "grad_norm": 0.365601220986991, "learning_rate": 2.2785121339288446e-06, "loss": 0.0281, "step": 6437 }, { "epoch": 2.859427048634244, "grad_norm": 0.4479492062462993, "learning_rate": 2.276886115122304e-06, "loss": 0.0345, "step": 6438 }, { "epoch": 2.859871196979791, "grad_norm": 0.3824493182929994, "learning_rate": 2.2752605056460374e-06, "loss": 0.0263, "step": 6439 }, { "epoch": 2.860315345325339, "grad_norm": 0.444006507921262, "learning_rate": 2.2736353057444045e-06, "loss": 0.0286, "step": 6440 }, { "epoch": 2.8607594936708862, "grad_norm": 0.4135565678132424, "learning_rate": 2.272010515661694e-06, "loss": 0.031, "step": 6441 }, { "epoch": 2.8612036420164335, "grad_norm": 0.4704071890757161, "learning_rate": 2.2703861356421476e-06, "loss": 0.0374, "step": 6442 }, { "epoch": 2.861647790361981, "grad_norm": 0.5654612362616459, "learning_rate": 2.268762165929931e-06, "loss": 0.0298, "step": 6443 }, { "epoch": 2.862091938707528, "grad_norm": 0.4525755398497248, "learning_rate": 2.267138606769156e-06, "loss": 0.0373, "step": 6444 }, { "epoch": 2.862536087053076, "grad_norm": 0.3792079187603561, "learning_rate": 2.2655154584038718e-06, "loss": 0.0274, "step": 6445 }, { "epoch": 2.862980235398623, "grad_norm": 0.6472477866875229, "learning_rate": 2.263892721078067e-06, "loss": 0.0306, "step": 6446 }, { "epoch": 2.8634243837441704, "grad_norm": 0.4724763504374727, "learning_rate": 2.2622703950356607e-06, "loss": 0.0395, "step": 6447 }, { "epoch": 2.863868532089718, "grad_norm": 0.5178919025923505, "learning_rate": 2.2606484805205235e-06, "loss": 0.0329, "step": 6448 }, { "epoch": 2.8643126804352654, "grad_norm": 0.4061558909889993, "learning_rate": 2.2590269777764516e-06, "loss": 0.0299, "step": 6449 }, { "epoch": 2.8647568287808127, "grad_norm": 0.39901609043961084, "learning_rate": 2.257405887047186e-06, "loss": 0.0328, "step": 6450 }, { "epoch": 2.86520097712636, "grad_norm": 0.49608870962855456, "learning_rate": 2.2557852085764053e-06, "loss": 0.0303, "step": 6451 }, { "epoch": 2.8656451254719078, "grad_norm": 0.38259419645750875, "learning_rate": 2.254164942607721e-06, "loss": 0.0219, "step": 6452 }, { "epoch": 2.866089273817455, "grad_norm": 0.42520263669940206, "learning_rate": 2.2525450893846906e-06, "loss": 0.0248, "step": 6453 }, { "epoch": 2.8665334221630023, "grad_norm": 0.343086064868294, "learning_rate": 2.2509256491508063e-06, "loss": 0.0226, "step": 6454 }, { "epoch": 2.86697757050855, "grad_norm": 0.48422523578882104, "learning_rate": 2.249306622149494e-06, "loss": 0.0336, "step": 6455 }, { "epoch": 2.8674217188540974, "grad_norm": 0.40460550415553925, "learning_rate": 2.2476880086241225e-06, "loss": 0.0325, "step": 6456 }, { "epoch": 2.8678658671996446, "grad_norm": 0.38849375983156137, "learning_rate": 2.2460698088179985e-06, "loss": 0.034, "step": 6457 }, { "epoch": 2.868310015545192, "grad_norm": 0.3659863472898491, "learning_rate": 2.24445202297436e-06, "loss": 0.0311, "step": 6458 }, { "epoch": 2.8687541638907397, "grad_norm": 0.32032734176784455, "learning_rate": 2.242834651336394e-06, "loss": 0.0304, "step": 6459 }, { "epoch": 2.869198312236287, "grad_norm": 0.2895901124804986, "learning_rate": 2.2412176941472146e-06, "loss": 0.0252, "step": 6460 }, { "epoch": 2.8696424605818343, "grad_norm": 0.4544732858743473, "learning_rate": 2.2396011516498794e-06, "loss": 0.0378, "step": 6461 }, { "epoch": 2.870086608927382, "grad_norm": 0.33869349353641764, "learning_rate": 2.2379850240873836e-06, "loss": 0.0243, "step": 6462 }, { "epoch": 2.8705307572729293, "grad_norm": 0.39779581383793367, "learning_rate": 2.2363693117026554e-06, "loss": 0.0328, "step": 6463 }, { "epoch": 2.8709749056184766, "grad_norm": 0.5359509284893266, "learning_rate": 2.2347540147385636e-06, "loss": 0.0276, "step": 6464 }, { "epoch": 2.871419053964024, "grad_norm": 0.3901564758848709, "learning_rate": 2.2331391334379205e-06, "loss": 0.0286, "step": 6465 }, { "epoch": 2.871863202309571, "grad_norm": 0.4664762506585298, "learning_rate": 2.231524668043465e-06, "loss": 0.0421, "step": 6466 }, { "epoch": 2.872307350655119, "grad_norm": 0.4098655625284004, "learning_rate": 2.229910618797879e-06, "loss": 0.0376, "step": 6467 }, { "epoch": 2.872751499000666, "grad_norm": 0.4583633277386958, "learning_rate": 2.228296985943785e-06, "loss": 0.0383, "step": 6468 }, { "epoch": 2.873195647346214, "grad_norm": 0.37685547831341093, "learning_rate": 2.226683769723734e-06, "loss": 0.028, "step": 6469 }, { "epoch": 2.873639795691761, "grad_norm": 0.400587688868656, "learning_rate": 2.225070970380224e-06, "loss": 0.0337, "step": 6470 }, { "epoch": 2.8740839440373085, "grad_norm": 0.3702484950984152, "learning_rate": 2.2234585881556864e-06, "loss": 0.0274, "step": 6471 }, { "epoch": 2.874528092382856, "grad_norm": 0.4674975669326644, "learning_rate": 2.2218466232924867e-06, "loss": 0.03, "step": 6472 }, { "epoch": 2.874972240728403, "grad_norm": 0.37515826872200714, "learning_rate": 2.2202350760329328e-06, "loss": 0.0257, "step": 6473 }, { "epoch": 2.875416389073951, "grad_norm": 0.3257808540366288, "learning_rate": 2.2186239466192676e-06, "loss": 0.0219, "step": 6474 }, { "epoch": 2.875860537419498, "grad_norm": 0.4753925237563355, "learning_rate": 2.2170132352936675e-06, "loss": 0.0434, "step": 6475 }, { "epoch": 2.8763046857650454, "grad_norm": 0.44581196717512095, "learning_rate": 2.2154029422982563e-06, "loss": 0.0371, "step": 6476 }, { "epoch": 2.876748834110593, "grad_norm": 0.4263527151388236, "learning_rate": 2.2137930678750835e-06, "loss": 0.0376, "step": 6477 }, { "epoch": 2.8771929824561404, "grad_norm": 0.46476168558248493, "learning_rate": 2.2121836122661416e-06, "loss": 0.034, "step": 6478 }, { "epoch": 2.8776371308016877, "grad_norm": 0.34003139533262217, "learning_rate": 2.2105745757133612e-06, "loss": 0.0294, "step": 6479 }, { "epoch": 2.878081279147235, "grad_norm": 0.49849907569693336, "learning_rate": 2.2089659584586047e-06, "loss": 0.0377, "step": 6480 }, { "epoch": 2.8785254274927827, "grad_norm": 0.5382448752849822, "learning_rate": 2.2073577607436737e-06, "loss": 0.0443, "step": 6481 }, { "epoch": 2.87896957583833, "grad_norm": 0.3879946256129708, "learning_rate": 2.2057499828103142e-06, "loss": 0.0327, "step": 6482 }, { "epoch": 2.8794137241838773, "grad_norm": 0.5104922320743165, "learning_rate": 2.2041426249001955e-06, "loss": 0.0362, "step": 6483 }, { "epoch": 2.879857872529425, "grad_norm": 0.4930647183156878, "learning_rate": 2.2025356872549345e-06, "loss": 0.0256, "step": 6484 }, { "epoch": 2.8803020208749723, "grad_norm": 0.3966997553776206, "learning_rate": 2.2009291701160817e-06, "loss": 0.0271, "step": 6485 }, { "epoch": 2.8807461692205196, "grad_norm": 0.4962925195644924, "learning_rate": 2.1993230737251216e-06, "loss": 0.0371, "step": 6486 }, { "epoch": 2.881190317566067, "grad_norm": 0.42642379725740076, "learning_rate": 2.197717398323477e-06, "loss": 0.0263, "step": 6487 }, { "epoch": 2.881634465911614, "grad_norm": 0.4141444359146802, "learning_rate": 2.1961121441525113e-06, "loss": 0.0344, "step": 6488 }, { "epoch": 2.882078614257162, "grad_norm": 0.5311512207572496, "learning_rate": 2.19450731145352e-06, "loss": 0.0364, "step": 6489 }, { "epoch": 2.8825227626027092, "grad_norm": 0.4267919708323801, "learning_rate": 2.192902900467736e-06, "loss": 0.0274, "step": 6490 }, { "epoch": 2.882966910948257, "grad_norm": 0.31959536449480613, "learning_rate": 2.1912989114363326e-06, "loss": 0.0242, "step": 6491 }, { "epoch": 2.8834110592938043, "grad_norm": 0.44718270123585074, "learning_rate": 2.1896953446004104e-06, "loss": 0.0397, "step": 6492 }, { "epoch": 2.8838552076393515, "grad_norm": 0.4584663064416918, "learning_rate": 2.1880922002010208e-06, "loss": 0.0321, "step": 6493 }, { "epoch": 2.884299355984899, "grad_norm": 0.6111455889547676, "learning_rate": 2.186489478479137e-06, "loss": 0.0423, "step": 6494 }, { "epoch": 2.884743504330446, "grad_norm": 0.5635891360279379, "learning_rate": 2.1848871796756784e-06, "loss": 0.0559, "step": 6495 }, { "epoch": 2.885187652675994, "grad_norm": 0.49224878038120795, "learning_rate": 2.183285304031498e-06, "loss": 0.0381, "step": 6496 }, { "epoch": 2.885631801021541, "grad_norm": 0.412800271356595, "learning_rate": 2.1816838517873834e-06, "loss": 0.0281, "step": 6497 }, { "epoch": 2.8860759493670884, "grad_norm": 0.42819522838748797, "learning_rate": 2.1800828231840583e-06, "loss": 0.0349, "step": 6498 }, { "epoch": 2.886520097712636, "grad_norm": 0.3172148118997228, "learning_rate": 2.178482218462191e-06, "loss": 0.0198, "step": 6499 }, { "epoch": 2.8869642460581835, "grad_norm": 0.6336802661233113, "learning_rate": 2.176882037862373e-06, "loss": 0.0527, "step": 6500 }, { "epoch": 2.8874083944037308, "grad_norm": 0.6279273972202338, "learning_rate": 2.1752822816251405e-06, "loss": 0.0366, "step": 6501 }, { "epoch": 2.887852542749278, "grad_norm": 0.6022931994329644, "learning_rate": 2.173682949990968e-06, "loss": 0.0306, "step": 6502 }, { "epoch": 2.888296691094826, "grad_norm": 0.5362069754281357, "learning_rate": 2.172084043200256e-06, "loss": 0.0342, "step": 6503 }, { "epoch": 2.888740839440373, "grad_norm": 0.7929642629941647, "learning_rate": 2.17048556149335e-06, "loss": 0.0333, "step": 6504 }, { "epoch": 2.8891849877859204, "grad_norm": 0.33575039821832753, "learning_rate": 2.16888750511053e-06, "loss": 0.0242, "step": 6505 }, { "epoch": 2.889629136131468, "grad_norm": 0.3741108552741579, "learning_rate": 2.1672898742920094e-06, "loss": 0.0273, "step": 6506 }, { "epoch": 2.8900732844770154, "grad_norm": 0.3774493813914148, "learning_rate": 2.1656926692779423e-06, "loss": 0.0313, "step": 6507 }, { "epoch": 2.8905174328225627, "grad_norm": 0.9718206954815033, "learning_rate": 2.1640958903084118e-06, "loss": 0.0401, "step": 6508 }, { "epoch": 2.89096158116811, "grad_norm": 0.3694310192990195, "learning_rate": 2.1624995376234403e-06, "loss": 0.0348, "step": 6509 }, { "epoch": 2.8914057295136577, "grad_norm": 0.6520572022397477, "learning_rate": 2.1609036114629933e-06, "loss": 0.0402, "step": 6510 }, { "epoch": 2.891849877859205, "grad_norm": 0.4215772661238576, "learning_rate": 2.159308112066959e-06, "loss": 0.0293, "step": 6511 }, { "epoch": 2.8922940262047523, "grad_norm": 0.38467024272056793, "learning_rate": 2.1577130396751705e-06, "loss": 0.0325, "step": 6512 }, { "epoch": 2.8927381745503, "grad_norm": 0.38039523075207066, "learning_rate": 2.1561183945273958e-06, "loss": 0.029, "step": 6513 }, { "epoch": 2.8931823228958473, "grad_norm": 0.35012157636808233, "learning_rate": 2.154524176863334e-06, "loss": 0.0298, "step": 6514 }, { "epoch": 2.8936264712413946, "grad_norm": 0.32604790956444796, "learning_rate": 2.1529303869226244e-06, "loss": 0.0223, "step": 6515 }, { "epoch": 2.894070619586942, "grad_norm": 0.6062038168527448, "learning_rate": 2.151337024944841e-06, "loss": 0.0328, "step": 6516 }, { "epoch": 2.894514767932489, "grad_norm": 0.4616383089873512, "learning_rate": 2.149744091169493e-06, "loss": 0.0355, "step": 6517 }, { "epoch": 2.894958916278037, "grad_norm": 0.31498629669392003, "learning_rate": 2.1481515858360254e-06, "loss": 0.0223, "step": 6518 }, { "epoch": 2.895403064623584, "grad_norm": 0.3418552975452249, "learning_rate": 2.1465595091838204e-06, "loss": 0.0284, "step": 6519 }, { "epoch": 2.895847212969132, "grad_norm": 0.6349150689262983, "learning_rate": 2.144967861452191e-06, "loss": 0.0288, "step": 6520 }, { "epoch": 2.8962913613146792, "grad_norm": 0.3672718765769208, "learning_rate": 2.143376642880391e-06, "loss": 0.028, "step": 6521 }, { "epoch": 2.8967355096602265, "grad_norm": 0.6329188736427485, "learning_rate": 2.141785853707607e-06, "loss": 0.0419, "step": 6522 }, { "epoch": 2.897179658005774, "grad_norm": 0.35536621143471536, "learning_rate": 2.1401954941729614e-06, "loss": 0.0276, "step": 6523 }, { "epoch": 2.897623806351321, "grad_norm": 0.4803884960432198, "learning_rate": 2.1386055645155144e-06, "loss": 0.0334, "step": 6524 }, { "epoch": 2.898067954696869, "grad_norm": 0.7351089421454166, "learning_rate": 2.137016064974256e-06, "loss": 0.034, "step": 6525 }, { "epoch": 2.898512103042416, "grad_norm": 0.35740753737071096, "learning_rate": 2.135426995788115e-06, "loss": 0.0255, "step": 6526 }, { "epoch": 2.8989562513879634, "grad_norm": 0.40072320482835744, "learning_rate": 2.133838357195961e-06, "loss": 0.0369, "step": 6527 }, { "epoch": 2.899400399733511, "grad_norm": 0.3671291054027193, "learning_rate": 2.1322501494365873e-06, "loss": 0.0248, "step": 6528 }, { "epoch": 2.8998445480790584, "grad_norm": 0.46498735962046045, "learning_rate": 2.1306623727487306e-06, "loss": 0.0336, "step": 6529 }, { "epoch": 2.9002886964246057, "grad_norm": 0.37582439553874314, "learning_rate": 2.1290750273710625e-06, "loss": 0.0311, "step": 6530 }, { "epoch": 2.900732844770153, "grad_norm": 0.3769841744886795, "learning_rate": 2.127488113542185e-06, "loss": 0.0219, "step": 6531 }, { "epoch": 2.9011769931157008, "grad_norm": 0.5408699565947783, "learning_rate": 2.1259016315006388e-06, "loss": 0.0369, "step": 6532 }, { "epoch": 2.901621141461248, "grad_norm": 0.41566157060894093, "learning_rate": 2.1243155814849003e-06, "loss": 0.0322, "step": 6533 }, { "epoch": 2.9020652898067953, "grad_norm": 0.4200346887421879, "learning_rate": 2.1227299637333793e-06, "loss": 0.0265, "step": 6534 }, { "epoch": 2.902509438152343, "grad_norm": 0.481827142730724, "learning_rate": 2.1211447784844223e-06, "loss": 0.0332, "step": 6535 }, { "epoch": 2.9029535864978904, "grad_norm": 0.41218410599797384, "learning_rate": 2.1195600259763064e-06, "loss": 0.0344, "step": 6536 }, { "epoch": 2.9033977348434377, "grad_norm": 0.5125400394558322, "learning_rate": 2.1179757064472495e-06, "loss": 0.0311, "step": 6537 }, { "epoch": 2.903841883188985, "grad_norm": 0.5243375032164893, "learning_rate": 2.1163918201354005e-06, "loss": 0.0442, "step": 6538 }, { "epoch": 2.9042860315345327, "grad_norm": 0.36105640514776777, "learning_rate": 2.114808367278845e-06, "loss": 0.0315, "step": 6539 }, { "epoch": 2.90473017988008, "grad_norm": 0.37491287513386307, "learning_rate": 2.113225348115603e-06, "loss": 0.0273, "step": 6540 }, { "epoch": 2.9051743282256273, "grad_norm": 0.3675658951021628, "learning_rate": 2.11164276288363e-06, "loss": 0.0249, "step": 6541 }, { "epoch": 2.905618476571175, "grad_norm": 0.47441864691687574, "learning_rate": 2.110060611820813e-06, "loss": 0.0439, "step": 6542 }, { "epoch": 2.9060626249167223, "grad_norm": 0.4331767301461994, "learning_rate": 2.1084788951649753e-06, "loss": 0.0308, "step": 6543 }, { "epoch": 2.9065067732622696, "grad_norm": 0.32889369683548264, "learning_rate": 2.106897613153882e-06, "loss": 0.0269, "step": 6544 }, { "epoch": 2.906950921607817, "grad_norm": 0.46027509577610654, "learning_rate": 2.105316766025221e-06, "loss": 0.0309, "step": 6545 }, { "epoch": 2.907395069953364, "grad_norm": 0.5494880014843939, "learning_rate": 2.1037363540166224e-06, "loss": 0.0273, "step": 6546 }, { "epoch": 2.907839218298912, "grad_norm": 0.4011778001737706, "learning_rate": 2.1021563773656493e-06, "loss": 0.0352, "step": 6547 }, { "epoch": 2.908283366644459, "grad_norm": 0.45266951475580575, "learning_rate": 2.1005768363097977e-06, "loss": 0.0301, "step": 6548 }, { "epoch": 2.908727514990007, "grad_norm": 0.49868301394140047, "learning_rate": 2.0989977310865e-06, "loss": 0.0339, "step": 6549 }, { "epoch": 2.909171663335554, "grad_norm": 0.6954722852910884, "learning_rate": 2.0974190619331224e-06, "loss": 0.0349, "step": 6550 }, { "epoch": 2.9096158116811015, "grad_norm": 0.4383145753772345, "learning_rate": 2.0958408290869662e-06, "loss": 0.0356, "step": 6551 }, { "epoch": 2.910059960026649, "grad_norm": 0.3704293568574071, "learning_rate": 2.0942630327852687e-06, "loss": 0.0303, "step": 6552 }, { "epoch": 2.910504108372196, "grad_norm": 0.3498231235044318, "learning_rate": 2.092685673265195e-06, "loss": 0.0318, "step": 6553 }, { "epoch": 2.910948256717744, "grad_norm": 0.36641412131018597, "learning_rate": 2.0911087507638513e-06, "loss": 0.0237, "step": 6554 }, { "epoch": 2.911392405063291, "grad_norm": 0.4005634976837491, "learning_rate": 2.0895322655182754e-06, "loss": 0.0327, "step": 6555 }, { "epoch": 2.9118365534088384, "grad_norm": 0.36560498009667564, "learning_rate": 2.0879562177654404e-06, "loss": 0.0247, "step": 6556 }, { "epoch": 2.912280701754386, "grad_norm": 0.38952731591083495, "learning_rate": 2.0863806077422534e-06, "loss": 0.027, "step": 6557 }, { "epoch": 2.9127248500999334, "grad_norm": 0.4138708556738211, "learning_rate": 2.0848054356855557e-06, "loss": 0.0304, "step": 6558 }, { "epoch": 2.9131689984454807, "grad_norm": 0.5073707851794668, "learning_rate": 2.08323070183212e-06, "loss": 0.0353, "step": 6559 }, { "epoch": 2.913613146791028, "grad_norm": 0.5942930274632409, "learning_rate": 2.081656406418658e-06, "loss": 0.0331, "step": 6560 }, { "epoch": 2.9140572951365757, "grad_norm": 0.35640079528674945, "learning_rate": 2.080082549681811e-06, "loss": 0.0279, "step": 6561 }, { "epoch": 2.914501443482123, "grad_norm": 0.3814193495027854, "learning_rate": 2.0785091318581577e-06, "loss": 0.0305, "step": 6562 }, { "epoch": 2.9149455918276703, "grad_norm": 0.358555755728585, "learning_rate": 2.076936153184211e-06, "loss": 0.0239, "step": 6563 }, { "epoch": 2.915389740173218, "grad_norm": 0.4250693357842293, "learning_rate": 2.0753636138964134e-06, "loss": 0.0286, "step": 6564 }, { "epoch": 2.9158338885187653, "grad_norm": 0.6221059510930096, "learning_rate": 2.0737915142311454e-06, "loss": 0.0441, "step": 6565 }, { "epoch": 2.9162780368643126, "grad_norm": 0.38815256704717155, "learning_rate": 2.07221985442472e-06, "loss": 0.0292, "step": 6566 }, { "epoch": 2.91672218520986, "grad_norm": 0.420249928229787, "learning_rate": 2.0706486347133853e-06, "loss": 0.0289, "step": 6567 }, { "epoch": 2.9171663335554077, "grad_norm": 0.39867842058238456, "learning_rate": 2.0690778553333215e-06, "loss": 0.0399, "step": 6568 }, { "epoch": 2.917610481900955, "grad_norm": 0.35700246538754193, "learning_rate": 2.0675075165206456e-06, "loss": 0.0328, "step": 6569 }, { "epoch": 2.9180546302465022, "grad_norm": 0.39862639265555505, "learning_rate": 2.0659376185114024e-06, "loss": 0.0282, "step": 6570 }, { "epoch": 2.91849877859205, "grad_norm": 0.3874386180786859, "learning_rate": 2.064368161541576e-06, "loss": 0.0269, "step": 6571 }, { "epoch": 2.9189429269375973, "grad_norm": 0.35734528243866814, "learning_rate": 2.0627991458470826e-06, "loss": 0.0277, "step": 6572 }, { "epoch": 2.9193870752831446, "grad_norm": 0.43326587014764417, "learning_rate": 2.061230571663772e-06, "loss": 0.0287, "step": 6573 }, { "epoch": 2.919831223628692, "grad_norm": 0.442123671529554, "learning_rate": 2.0596624392274277e-06, "loss": 0.0348, "step": 6574 }, { "epoch": 2.920275371974239, "grad_norm": 0.41360740653208006, "learning_rate": 2.058094748773768e-06, "loss": 0.0245, "step": 6575 }, { "epoch": 2.920719520319787, "grad_norm": 0.4521622442444811, "learning_rate": 2.05652750053844e-06, "loss": 0.0272, "step": 6576 }, { "epoch": 2.921163668665334, "grad_norm": 0.6652238384193798, "learning_rate": 2.0549606947570295e-06, "loss": 0.0403, "step": 6577 }, { "epoch": 2.921607817010882, "grad_norm": 0.33561551996607286, "learning_rate": 2.053394331665054e-06, "loss": 0.023, "step": 6578 }, { "epoch": 2.922051965356429, "grad_norm": 0.4533033136781839, "learning_rate": 2.051828411497964e-06, "loss": 0.0418, "step": 6579 }, { "epoch": 2.9224961137019765, "grad_norm": 0.2933903835866281, "learning_rate": 2.0502629344911475e-06, "loss": 0.0204, "step": 6580 }, { "epoch": 2.9229402620475238, "grad_norm": 0.4807368532427772, "learning_rate": 2.0486979008799164e-06, "loss": 0.0388, "step": 6581 }, { "epoch": 2.923384410393071, "grad_norm": 0.39312159469638, "learning_rate": 2.047133310899525e-06, "loss": 0.0352, "step": 6582 }, { "epoch": 2.923828558738619, "grad_norm": 0.49386287194544953, "learning_rate": 2.045569164785157e-06, "loss": 0.0318, "step": 6583 }, { "epoch": 2.924272707084166, "grad_norm": 0.3909685186859985, "learning_rate": 2.044005462771931e-06, "loss": 0.0244, "step": 6584 }, { "epoch": 2.9247168554297134, "grad_norm": 0.4648834902668276, "learning_rate": 2.0424422050948976e-06, "loss": 0.0387, "step": 6585 }, { "epoch": 2.925161003775261, "grad_norm": 0.36487456799403395, "learning_rate": 2.0408793919890424e-06, "loss": 0.0288, "step": 6586 }, { "epoch": 2.9256051521208084, "grad_norm": 0.5349577863858896, "learning_rate": 2.0393170236892795e-06, "loss": 0.0426, "step": 6587 }, { "epoch": 2.9260493004663557, "grad_norm": 0.35432394100233194, "learning_rate": 2.0377551004304613e-06, "loss": 0.0304, "step": 6588 }, { "epoch": 2.926493448811903, "grad_norm": 0.5133489099915194, "learning_rate": 2.036193622447371e-06, "loss": 0.0436, "step": 6589 }, { "epoch": 2.9269375971574507, "grad_norm": 0.40489974505235427, "learning_rate": 2.034632589974726e-06, "loss": 0.0319, "step": 6590 }, { "epoch": 2.927381745502998, "grad_norm": 0.3994253955209103, "learning_rate": 2.033072003247175e-06, "loss": 0.0308, "step": 6591 }, { "epoch": 2.9278258938485453, "grad_norm": 0.34532953088972845, "learning_rate": 2.0315118624993035e-06, "loss": 0.0261, "step": 6592 }, { "epoch": 2.928270042194093, "grad_norm": 0.35520607631530465, "learning_rate": 2.0299521679656225e-06, "loss": 0.0228, "step": 6593 }, { "epoch": 2.9287141905396403, "grad_norm": 0.32280141712366317, "learning_rate": 2.0283929198805837e-06, "loss": 0.0189, "step": 6594 }, { "epoch": 2.9291583388851876, "grad_norm": 0.43110913730957606, "learning_rate": 2.0268341184785674e-06, "loss": 0.0378, "step": 6595 }, { "epoch": 2.929602487230735, "grad_norm": 0.354328605582052, "learning_rate": 2.025275763993888e-06, "loss": 0.0313, "step": 6596 }, { "epoch": 2.9300466355762826, "grad_norm": 0.3880427101102111, "learning_rate": 2.023717856660795e-06, "loss": 0.0283, "step": 6597 }, { "epoch": 2.93049078392183, "grad_norm": 0.37092577723252107, "learning_rate": 2.0221603967134645e-06, "loss": 0.0309, "step": 6598 }, { "epoch": 2.9309349322673772, "grad_norm": 0.3853764249821867, "learning_rate": 2.0206033843860113e-06, "loss": 0.036, "step": 6599 }, { "epoch": 2.931379080612925, "grad_norm": 0.32195066732125943, "learning_rate": 2.0190468199124804e-06, "loss": 0.0266, "step": 6600 }, { "epoch": 2.9318232289584722, "grad_norm": 0.377934707171494, "learning_rate": 2.01749070352685e-06, "loss": 0.0323, "step": 6601 }, { "epoch": 2.9322673773040195, "grad_norm": 0.4090601636781386, "learning_rate": 2.0159350354630307e-06, "loss": 0.0275, "step": 6602 }, { "epoch": 2.932711525649567, "grad_norm": 0.4773326358669816, "learning_rate": 2.0143798159548677e-06, "loss": 0.0331, "step": 6603 }, { "epoch": 2.933155673995114, "grad_norm": 0.4210797373291252, "learning_rate": 2.0128250452361334e-06, "loss": 0.0362, "step": 6604 }, { "epoch": 2.933599822340662, "grad_norm": 0.4248736186586538, "learning_rate": 2.0112707235405386e-06, "loss": 0.0309, "step": 6605 }, { "epoch": 2.934043970686209, "grad_norm": 0.3339641218919188, "learning_rate": 2.0097168511017234e-06, "loss": 0.0187, "step": 6606 }, { "epoch": 2.934488119031757, "grad_norm": 0.5167791996296286, "learning_rate": 2.0081634281532613e-06, "loss": 0.0315, "step": 6607 }, { "epoch": 2.934932267377304, "grad_norm": 0.4991803228092949, "learning_rate": 2.0066104549286602e-06, "loss": 0.0436, "step": 6608 }, { "epoch": 2.9353764157228515, "grad_norm": 0.3779855137295941, "learning_rate": 2.005057931661355e-06, "loss": 0.0273, "step": 6609 }, { "epoch": 2.9358205640683988, "grad_norm": 0.5036801733855913, "learning_rate": 2.0035058585847173e-06, "loss": 0.0328, "step": 6610 }, { "epoch": 2.936264712413946, "grad_norm": 0.3544894089098337, "learning_rate": 2.001954235932051e-06, "loss": 0.0235, "step": 6611 }, { "epoch": 2.9367088607594938, "grad_norm": 0.42819328292612396, "learning_rate": 2.0004030639365907e-06, "loss": 0.0308, "step": 6612 }, { "epoch": 2.937153009105041, "grad_norm": 0.39806520440496346, "learning_rate": 1.9988523428315045e-06, "loss": 0.0277, "step": 6613 }, { "epoch": 2.9375971574505884, "grad_norm": 0.41174193415314053, "learning_rate": 1.997302072849893e-06, "loss": 0.0282, "step": 6614 }, { "epoch": 2.938041305796136, "grad_norm": 0.4414650832704903, "learning_rate": 1.995752254224786e-06, "loss": 0.0303, "step": 6615 }, { "epoch": 2.9384854541416834, "grad_norm": 0.4432623492720395, "learning_rate": 1.994202887189148e-06, "loss": 0.0253, "step": 6616 }, { "epoch": 2.9389296024872307, "grad_norm": 0.4800911306612613, "learning_rate": 1.9926539719758747e-06, "loss": 0.0388, "step": 6617 }, { "epoch": 2.939373750832778, "grad_norm": 0.4059304636548806, "learning_rate": 1.9911055088177967e-06, "loss": 0.0276, "step": 6618 }, { "epoch": 2.9398178991783257, "grad_norm": 0.5632995645750524, "learning_rate": 1.9895574979476717e-06, "loss": 0.0364, "step": 6619 }, { "epoch": 2.940262047523873, "grad_norm": 0.3730668889292687, "learning_rate": 1.9880099395981954e-06, "loss": 0.0266, "step": 6620 }, { "epoch": 2.9407061958694203, "grad_norm": 0.4942132883415099, "learning_rate": 1.986462834001989e-06, "loss": 0.0424, "step": 6621 }, { "epoch": 2.941150344214968, "grad_norm": 0.35051082450586374, "learning_rate": 1.984916181391609e-06, "loss": 0.029, "step": 6622 }, { "epoch": 2.9415944925605153, "grad_norm": 0.49757968739541286, "learning_rate": 1.983369981999544e-06, "loss": 0.0391, "step": 6623 }, { "epoch": 2.9420386409060626, "grad_norm": 0.3866532839130346, "learning_rate": 1.9818242360582145e-06, "loss": 0.0196, "step": 6624 }, { "epoch": 2.94248278925161, "grad_norm": 0.3312297948056805, "learning_rate": 1.980278943799974e-06, "loss": 0.024, "step": 6625 }, { "epoch": 2.942926937597157, "grad_norm": 0.369854753089198, "learning_rate": 1.978734105457103e-06, "loss": 0.0261, "step": 6626 }, { "epoch": 2.943371085942705, "grad_norm": 0.40104725649008754, "learning_rate": 1.9771897212618172e-06, "loss": 0.0338, "step": 6627 }, { "epoch": 2.943815234288252, "grad_norm": 0.34952669002936676, "learning_rate": 1.9756457914462677e-06, "loss": 0.0245, "step": 6628 }, { "epoch": 2.9442593826338, "grad_norm": 0.41129129029247236, "learning_rate": 1.9741023162425265e-06, "loss": 0.0313, "step": 6629 }, { "epoch": 2.9447035309793472, "grad_norm": 0.5211003073625236, "learning_rate": 1.9725592958826102e-06, "loss": 0.0421, "step": 6630 }, { "epoch": 2.9451476793248945, "grad_norm": 0.3984417865901208, "learning_rate": 1.9710167305984607e-06, "loss": 0.0375, "step": 6631 }, { "epoch": 2.945591827670442, "grad_norm": 0.30941970478869846, "learning_rate": 1.9694746206219477e-06, "loss": 0.0216, "step": 6632 }, { "epoch": 2.946035976015989, "grad_norm": 0.4229304742479391, "learning_rate": 1.9679329661848795e-06, "loss": 0.034, "step": 6633 }, { "epoch": 2.946480124361537, "grad_norm": 0.476591153079035, "learning_rate": 1.966391767518992e-06, "loss": 0.0516, "step": 6634 }, { "epoch": 2.946924272707084, "grad_norm": 0.6821582454050819, "learning_rate": 1.9648510248559546e-06, "loss": 0.034, "step": 6635 }, { "epoch": 2.9473684210526314, "grad_norm": 0.43772389004233875, "learning_rate": 1.9633107384273668e-06, "loss": 0.0296, "step": 6636 }, { "epoch": 2.947812569398179, "grad_norm": 0.49311057599048846, "learning_rate": 1.9617709084647584e-06, "loss": 0.0471, "step": 6637 }, { "epoch": 2.9482567177437264, "grad_norm": 0.48536816799978794, "learning_rate": 1.9602315351995928e-06, "loss": 0.0393, "step": 6638 }, { "epoch": 2.9487008660892737, "grad_norm": 0.5141630455505187, "learning_rate": 1.958692618863264e-06, "loss": 0.0342, "step": 6639 }, { "epoch": 2.949145014434821, "grad_norm": 0.4586328384368867, "learning_rate": 1.9571541596870974e-06, "loss": 0.0391, "step": 6640 }, { "epoch": 2.9495891627803688, "grad_norm": 0.4178954318106239, "learning_rate": 1.9556161579023493e-06, "loss": 0.0315, "step": 6641 }, { "epoch": 2.950033311125916, "grad_norm": 0.4366977616184424, "learning_rate": 1.9540786137402097e-06, "loss": 0.0325, "step": 6642 }, { "epoch": 2.9504774594714633, "grad_norm": 0.35044020018653177, "learning_rate": 1.952541527431794e-06, "loss": 0.0238, "step": 6643 }, { "epoch": 2.950921607817011, "grad_norm": 0.36474391941516165, "learning_rate": 1.951004899208154e-06, "loss": 0.0284, "step": 6644 }, { "epoch": 2.9513657561625584, "grad_norm": 0.3808970857069157, "learning_rate": 1.9494687293002724e-06, "loss": 0.0262, "step": 6645 }, { "epoch": 2.9518099045081057, "grad_norm": 0.49087581676544356, "learning_rate": 1.947933017939057e-06, "loss": 0.0334, "step": 6646 }, { "epoch": 2.952254052853653, "grad_norm": 0.4067523864649502, "learning_rate": 1.946397765355356e-06, "loss": 0.0224, "step": 6647 }, { "epoch": 2.9526982011992007, "grad_norm": 0.5385094196478863, "learning_rate": 1.9448629717799444e-06, "loss": 0.0274, "step": 6648 }, { "epoch": 2.953142349544748, "grad_norm": 0.4534794423284923, "learning_rate": 1.9433286374435243e-06, "loss": 0.0307, "step": 6649 }, { "epoch": 2.9535864978902953, "grad_norm": 0.42491711765749635, "learning_rate": 1.9417947625767338e-06, "loss": 0.0282, "step": 6650 }, { "epoch": 2.954030646235843, "grad_norm": 0.43191372654900584, "learning_rate": 1.9402613474101418e-06, "loss": 0.0284, "step": 6651 }, { "epoch": 2.9544747945813903, "grad_norm": 0.4659070261041793, "learning_rate": 1.9387283921742417e-06, "loss": 0.0348, "step": 6652 }, { "epoch": 2.9549189429269376, "grad_norm": 0.3521658502328422, "learning_rate": 1.9371958970994697e-06, "loss": 0.0264, "step": 6653 }, { "epoch": 2.955363091272485, "grad_norm": 0.3744257822368085, "learning_rate": 1.935663862416181e-06, "loss": 0.0233, "step": 6654 }, { "epoch": 2.955807239618032, "grad_norm": 0.3533483955176666, "learning_rate": 1.934132288354667e-06, "loss": 0.0223, "step": 6655 }, { "epoch": 2.95625138796358, "grad_norm": 0.48990756810689967, "learning_rate": 1.9326011751451523e-06, "loss": 0.0296, "step": 6656 }, { "epoch": 2.956695536309127, "grad_norm": 0.4155057113271156, "learning_rate": 1.9310705230177834e-06, "loss": 0.032, "step": 6657 }, { "epoch": 2.957139684654675, "grad_norm": 0.46467269039209746, "learning_rate": 1.9295403322026485e-06, "loss": 0.0302, "step": 6658 }, { "epoch": 2.957583833000222, "grad_norm": 0.436904391432258, "learning_rate": 1.928010602929762e-06, "loss": 0.0249, "step": 6659 }, { "epoch": 2.9580279813457695, "grad_norm": 0.4078298528030376, "learning_rate": 1.9264813354290635e-06, "loss": 0.0339, "step": 6660 }, { "epoch": 2.958472129691317, "grad_norm": 0.43921352541689956, "learning_rate": 1.92495252993043e-06, "loss": 0.0293, "step": 6661 }, { "epoch": 2.958916278036864, "grad_norm": 0.47221505355593735, "learning_rate": 1.9234241866636693e-06, "loss": 0.0267, "step": 6662 }, { "epoch": 2.959360426382412, "grad_norm": 0.3695229541487737, "learning_rate": 1.9218963058585117e-06, "loss": 0.0273, "step": 6663 }, { "epoch": 2.959804574727959, "grad_norm": 0.3129170332680405, "learning_rate": 1.9203688877446285e-06, "loss": 0.0177, "step": 6664 }, { "epoch": 2.9602487230735064, "grad_norm": 0.3892558088874507, "learning_rate": 1.9188419325516177e-06, "loss": 0.0381, "step": 6665 }, { "epoch": 2.960692871419054, "grad_norm": 0.3716168146474431, "learning_rate": 1.9173154405090024e-06, "loss": 0.0222, "step": 6666 }, { "epoch": 2.9611370197646014, "grad_norm": 0.4147701146563668, "learning_rate": 1.9157894118462416e-06, "loss": 0.0307, "step": 6667 }, { "epoch": 2.9615811681101487, "grad_norm": 0.39499526563472726, "learning_rate": 1.9142638467927254e-06, "loss": 0.0365, "step": 6668 }, { "epoch": 2.962025316455696, "grad_norm": 0.3874596806249992, "learning_rate": 1.9127387455777673e-06, "loss": 0.0321, "step": 6669 }, { "epoch": 2.9624694648012437, "grad_norm": 0.42027304319891273, "learning_rate": 1.911214108430623e-06, "loss": 0.033, "step": 6670 }, { "epoch": 2.962913613146791, "grad_norm": 0.44739921526779486, "learning_rate": 1.9096899355804655e-06, "loss": 0.0275, "step": 6671 }, { "epoch": 2.9633577614923383, "grad_norm": 0.37802801428083804, "learning_rate": 1.9081662272564055e-06, "loss": 0.0319, "step": 6672 }, { "epoch": 2.963801909837886, "grad_norm": 0.38442939573511425, "learning_rate": 1.9066429836874844e-06, "loss": 0.0295, "step": 6673 }, { "epoch": 2.9642460581834333, "grad_norm": 0.6866561926311151, "learning_rate": 1.9051202051026669e-06, "loss": 0.0381, "step": 6674 }, { "epoch": 2.9646902065289806, "grad_norm": 0.39471908589991933, "learning_rate": 1.9035978917308568e-06, "loss": 0.0282, "step": 6675 }, { "epoch": 2.965134354874528, "grad_norm": 0.3683316118105389, "learning_rate": 1.902076043800884e-06, "loss": 0.0237, "step": 6676 }, { "epoch": 2.9655785032200757, "grad_norm": 0.37626637800602947, "learning_rate": 1.9005546615415044e-06, "loss": 0.0241, "step": 6677 }, { "epoch": 2.966022651565623, "grad_norm": 0.3496930484588171, "learning_rate": 1.8990337451814095e-06, "loss": 0.0286, "step": 6678 }, { "epoch": 2.9664667999111702, "grad_norm": 0.41344810193656406, "learning_rate": 1.897513294949221e-06, "loss": 0.0273, "step": 6679 }, { "epoch": 2.966910948256718, "grad_norm": 0.378120591478157, "learning_rate": 1.895993311073483e-06, "loss": 0.0294, "step": 6680 }, { "epoch": 2.9673550966022653, "grad_norm": 0.33697535541119333, "learning_rate": 1.8944737937826813e-06, "loss": 0.0273, "step": 6681 }, { "epoch": 2.9677992449478126, "grad_norm": 0.3663439761387446, "learning_rate": 1.8929547433052202e-06, "loss": 0.032, "step": 6682 }, { "epoch": 2.96824339329336, "grad_norm": 0.5047952130541636, "learning_rate": 1.8914361598694408e-06, "loss": 0.0339, "step": 6683 }, { "epoch": 2.968687541638907, "grad_norm": 0.4039062494421022, "learning_rate": 1.8899180437036119e-06, "loss": 0.0375, "step": 6684 }, { "epoch": 2.969131689984455, "grad_norm": 0.3025200589569862, "learning_rate": 1.8884003950359337e-06, "loss": 0.0262, "step": 6685 }, { "epoch": 2.969575838330002, "grad_norm": 0.44229043636076315, "learning_rate": 1.8868832140945297e-06, "loss": 0.028, "step": 6686 }, { "epoch": 2.97001998667555, "grad_norm": 0.4708612100607534, "learning_rate": 1.8853665011074645e-06, "loss": 0.027, "step": 6687 }, { "epoch": 2.970464135021097, "grad_norm": 0.3722598107484293, "learning_rate": 1.8838502563027212e-06, "loss": 0.0253, "step": 6688 }, { "epoch": 2.9709082833666445, "grad_norm": 0.6036467003416066, "learning_rate": 1.8823344799082177e-06, "loss": 0.0284, "step": 6689 }, { "epoch": 2.9713524317121918, "grad_norm": 0.34918427015083503, "learning_rate": 1.8808191721518043e-06, "loss": 0.0294, "step": 6690 }, { "epoch": 2.971796580057739, "grad_norm": 0.42849383719138034, "learning_rate": 1.879304333261251e-06, "loss": 0.0278, "step": 6691 }, { "epoch": 2.972240728403287, "grad_norm": 0.48072735895909585, "learning_rate": 1.87778996346427e-06, "loss": 0.0417, "step": 6692 }, { "epoch": 2.972684876748834, "grad_norm": 0.29126197319559194, "learning_rate": 1.8762760629884958e-06, "loss": 0.0243, "step": 6693 }, { "epoch": 2.9731290250943814, "grad_norm": 0.3610203458255818, "learning_rate": 1.8747626320614904e-06, "loss": 0.0269, "step": 6694 }, { "epoch": 2.973573173439929, "grad_norm": 0.4145287000272817, "learning_rate": 1.87324967091075e-06, "loss": 0.0461, "step": 6695 }, { "epoch": 2.9740173217854764, "grad_norm": 0.3579965634087584, "learning_rate": 1.8717371797637002e-06, "loss": 0.0296, "step": 6696 }, { "epoch": 2.9744614701310237, "grad_norm": 0.41883841344207373, "learning_rate": 1.8702251588476889e-06, "loss": 0.0314, "step": 6697 }, { "epoch": 2.974905618476571, "grad_norm": 0.5313573343030277, "learning_rate": 1.868713608390005e-06, "loss": 0.0408, "step": 6698 }, { "epoch": 2.9753497668221187, "grad_norm": 0.39733118203820383, "learning_rate": 1.8672025286178546e-06, "loss": 0.0357, "step": 6699 }, { "epoch": 2.975793915167666, "grad_norm": 0.4305571327880009, "learning_rate": 1.8656919197583816e-06, "loss": 0.0287, "step": 6700 }, { "epoch": 2.9762380635132133, "grad_norm": 0.38007221498778115, "learning_rate": 1.8641817820386576e-06, "loss": 0.0256, "step": 6701 }, { "epoch": 2.976682211858761, "grad_norm": 0.35438367987841246, "learning_rate": 1.862672115685678e-06, "loss": 0.0318, "step": 6702 }, { "epoch": 2.9771263602043083, "grad_norm": 0.3521742278005089, "learning_rate": 1.861162920926372e-06, "loss": 0.0246, "step": 6703 }, { "epoch": 2.9775705085498556, "grad_norm": 0.4587000972835763, "learning_rate": 1.8596541979876016e-06, "loss": 0.0403, "step": 6704 }, { "epoch": 2.978014656895403, "grad_norm": 0.4391686978547565, "learning_rate": 1.8581459470961488e-06, "loss": 0.033, "step": 6705 }, { "epoch": 2.9784588052409506, "grad_norm": 0.8681641943800276, "learning_rate": 1.856638168478731e-06, "loss": 0.0271, "step": 6706 }, { "epoch": 2.978902953586498, "grad_norm": 0.3412875403024399, "learning_rate": 1.8551308623619945e-06, "loss": 0.0278, "step": 6707 }, { "epoch": 2.979347101932045, "grad_norm": 0.39261569640506333, "learning_rate": 1.8536240289725078e-06, "loss": 0.035, "step": 6708 }, { "epoch": 2.979791250277593, "grad_norm": 0.3999387719956585, "learning_rate": 1.8521176685367804e-06, "loss": 0.0287, "step": 6709 }, { "epoch": 2.9802353986231402, "grad_norm": 0.3873753971954603, "learning_rate": 1.850611781281239e-06, "loss": 0.0322, "step": 6710 }, { "epoch": 2.9806795469686875, "grad_norm": 0.4380866274093617, "learning_rate": 1.8491063674322457e-06, "loss": 0.033, "step": 6711 }, { "epoch": 2.981123695314235, "grad_norm": 0.4171386719486352, "learning_rate": 1.8476014272160896e-06, "loss": 0.0354, "step": 6712 }, { "epoch": 2.981567843659782, "grad_norm": 0.37606765896917305, "learning_rate": 1.8460969608589913e-06, "loss": 0.0309, "step": 6713 }, { "epoch": 2.98201199200533, "grad_norm": 0.33316278037626945, "learning_rate": 1.8445929685870912e-06, "loss": 0.0281, "step": 6714 }, { "epoch": 2.982456140350877, "grad_norm": 0.4295124088150545, "learning_rate": 1.8430894506264724e-06, "loss": 0.0281, "step": 6715 }, { "epoch": 2.982900288696425, "grad_norm": 0.6219155251106016, "learning_rate": 1.8415864072031335e-06, "loss": 0.0412, "step": 6716 }, { "epoch": 2.983344437041972, "grad_norm": 1.0649791746325505, "learning_rate": 1.8400838385430104e-06, "loss": 0.0296, "step": 6717 }, { "epoch": 2.9837885853875195, "grad_norm": 0.4321630566554828, "learning_rate": 1.838581744871965e-06, "loss": 0.0349, "step": 6718 }, { "epoch": 2.9842327337330667, "grad_norm": 0.3860040470778094, "learning_rate": 1.8370801264157857e-06, "loss": 0.029, "step": 6719 }, { "epoch": 2.984676882078614, "grad_norm": 0.38156983645011616, "learning_rate": 1.8355789834001898e-06, "loss": 0.0284, "step": 6720 }, { "epoch": 2.9851210304241618, "grad_norm": 0.5127281233435997, "learning_rate": 1.8340783160508297e-06, "loss": 0.0355, "step": 6721 }, { "epoch": 2.985565178769709, "grad_norm": 0.3767235771133534, "learning_rate": 1.8325781245932772e-06, "loss": 0.0297, "step": 6722 }, { "epoch": 2.9860093271152564, "grad_norm": 0.4559507813085918, "learning_rate": 1.8310784092530376e-06, "loss": 0.0362, "step": 6723 }, { "epoch": 2.986453475460804, "grad_norm": 0.36189720669347597, "learning_rate": 1.8295791702555455e-06, "loss": 0.0294, "step": 6724 }, { "epoch": 2.9868976238063514, "grad_norm": 0.3457736093542848, "learning_rate": 1.8280804078261577e-06, "loss": 0.0305, "step": 6725 }, { "epoch": 2.9873417721518987, "grad_norm": 0.5911976787943448, "learning_rate": 1.826582122190167e-06, "loss": 0.0368, "step": 6726 }, { "epoch": 2.987785920497446, "grad_norm": 0.4315279094342813, "learning_rate": 1.8250843135727898e-06, "loss": 0.0365, "step": 6727 }, { "epoch": 2.9882300688429937, "grad_norm": 0.3782115314248128, "learning_rate": 1.8235869821991726e-06, "loss": 0.0297, "step": 6728 }, { "epoch": 2.988674217188541, "grad_norm": 0.42412903793940915, "learning_rate": 1.8220901282943915e-06, "loss": 0.0348, "step": 6729 }, { "epoch": 2.9891183655340883, "grad_norm": 0.42286509801094985, "learning_rate": 1.820593752083446e-06, "loss": 0.0356, "step": 6730 }, { "epoch": 2.989562513879636, "grad_norm": 0.48807857838384455, "learning_rate": 1.8190978537912662e-06, "loss": 0.0441, "step": 6731 }, { "epoch": 2.9900066622251833, "grad_norm": 0.4660013011382873, "learning_rate": 1.8176024336427167e-06, "loss": 0.0356, "step": 6732 }, { "epoch": 2.9904508105707306, "grad_norm": 0.3894973608346122, "learning_rate": 1.8161074918625792e-06, "loss": 0.0234, "step": 6733 }, { "epoch": 2.990894958916278, "grad_norm": 0.43148032392899044, "learning_rate": 1.8146130286755704e-06, "loss": 0.0316, "step": 6734 }, { "epoch": 2.9913391072618256, "grad_norm": 0.4541609973903636, "learning_rate": 1.8131190443063357e-06, "loss": 0.0228, "step": 6735 }, { "epoch": 2.991783255607373, "grad_norm": 0.37931640734503375, "learning_rate": 1.8116255389794418e-06, "loss": 0.03, "step": 6736 }, { "epoch": 2.99222740395292, "grad_norm": 0.38246202047737776, "learning_rate": 1.8101325129193897e-06, "loss": 0.0432, "step": 6737 }, { "epoch": 2.992671552298468, "grad_norm": 0.42501044694023266, "learning_rate": 1.8086399663506099e-06, "loss": 0.0378, "step": 6738 }, { "epoch": 2.993115700644015, "grad_norm": 0.4349574825819462, "learning_rate": 1.8071478994974534e-06, "loss": 0.022, "step": 6739 }, { "epoch": 2.9935598489895625, "grad_norm": 0.36182919079127596, "learning_rate": 1.8056563125842046e-06, "loss": 0.0281, "step": 6740 }, { "epoch": 2.99400399733511, "grad_norm": 0.3297563936891292, "learning_rate": 1.8041652058350768e-06, "loss": 0.0258, "step": 6741 }, { "epoch": 2.994448145680657, "grad_norm": 0.7003299863726014, "learning_rate": 1.802674579474204e-06, "loss": 0.0351, "step": 6742 }, { "epoch": 2.994892294026205, "grad_norm": 0.6139326715543403, "learning_rate": 1.801184433725655e-06, "loss": 0.0379, "step": 6743 }, { "epoch": 2.995336442371752, "grad_norm": 0.5062057494487118, "learning_rate": 1.7996947688134241e-06, "loss": 0.0315, "step": 6744 }, { "epoch": 2.9957805907173, "grad_norm": 0.7604714501213958, "learning_rate": 1.7982055849614327e-06, "loss": 0.0435, "step": 6745 }, { "epoch": 2.996224739062847, "grad_norm": 0.4500952174598657, "learning_rate": 1.7967168823935333e-06, "loss": 0.0364, "step": 6746 }, { "epoch": 2.9966688874083944, "grad_norm": 0.6302920187503901, "learning_rate": 1.7952286613334986e-06, "loss": 0.0319, "step": 6747 }, { "epoch": 2.9971130357539417, "grad_norm": 0.36539112186296896, "learning_rate": 1.793740922005034e-06, "loss": 0.0308, "step": 6748 }, { "epoch": 2.997557184099489, "grad_norm": 0.43381962390876766, "learning_rate": 1.7922536646317767e-06, "loss": 0.0377, "step": 6749 }, { "epoch": 2.9980013324450367, "grad_norm": 0.33856021759003035, "learning_rate": 1.7907668894372826e-06, "loss": 0.0239, "step": 6750 }, { "epoch": 2.998445480790584, "grad_norm": 0.37584653604138824, "learning_rate": 1.78928059664504e-06, "loss": 0.024, "step": 6751 }, { "epoch": 2.9988896291361313, "grad_norm": 0.361912705997716, "learning_rate": 1.7877947864784662e-06, "loss": 0.0261, "step": 6752 }, { "epoch": 2.999333777481679, "grad_norm": 0.35751593403707377, "learning_rate": 1.7863094591609003e-06, "loss": 0.0254, "step": 6753 }, { "epoch": 2.9997779258272264, "grad_norm": 0.4724255587714232, "learning_rate": 1.7848246149156134e-06, "loss": 0.0421, "step": 6754 }, { "epoch": 2.9997779258272264, "eval_loss": 0.03672339767217636, "eval_runtime": 403.6019, "eval_samples_per_second": 37.577, "eval_steps_per_second": 1.174, "step": 6754 }, { "epoch": 3.0002220741727736, "grad_norm": 0.3269520644923097, "learning_rate": 1.783340253965803e-06, "loss": 0.0215, "step": 6755 }, { "epoch": 3.000666222518321, "grad_norm": 0.3360624322653449, "learning_rate": 1.7818563765345942e-06, "loss": 0.0284, "step": 6756 }, { "epoch": 3.0011103708638687, "grad_norm": 0.3740872446254905, "learning_rate": 1.7803729828450405e-06, "loss": 0.0207, "step": 6757 }, { "epoch": 3.001554519209416, "grad_norm": 0.29900746203751005, "learning_rate": 1.7788900731201174e-06, "loss": 0.0206, "step": 6758 }, { "epoch": 3.0019986675549633, "grad_norm": 0.4119590022541271, "learning_rate": 1.7774076475827335e-06, "loss": 0.0239, "step": 6759 }, { "epoch": 3.002442815900511, "grad_norm": 0.4073542383308494, "learning_rate": 1.7759257064557229e-06, "loss": 0.022, "step": 6760 }, { "epoch": 3.0028869642460583, "grad_norm": 0.6159273406264103, "learning_rate": 1.7744442499618453e-06, "loss": 0.032, "step": 6761 }, { "epoch": 3.0033311125916056, "grad_norm": 0.8514717794851285, "learning_rate": 1.77296327832379e-06, "loss": 0.0421, "step": 6762 }, { "epoch": 3.003775260937153, "grad_norm": 0.3400576483770952, "learning_rate": 1.7714827917641737e-06, "loss": 0.0209, "step": 6763 }, { "epoch": 3.0042194092827006, "grad_norm": 0.38696151832577724, "learning_rate": 1.7700027905055344e-06, "loss": 0.0232, "step": 6764 }, { "epoch": 3.004663557628248, "grad_norm": 0.40863063034907293, "learning_rate": 1.7685232747703424e-06, "loss": 0.0246, "step": 6765 }, { "epoch": 3.005107705973795, "grad_norm": 0.39214643564722806, "learning_rate": 1.7670442447809989e-06, "loss": 0.0283, "step": 6766 }, { "epoch": 3.0055518543193425, "grad_norm": 0.5937092447872463, "learning_rate": 1.7655657007598216e-06, "loss": 0.0269, "step": 6767 }, { "epoch": 3.00599600266489, "grad_norm": 0.43120240703534374, "learning_rate": 1.7640876429290633e-06, "loss": 0.0238, "step": 6768 }, { "epoch": 3.0064401510104375, "grad_norm": 0.405113495539343, "learning_rate": 1.7626100715109018e-06, "loss": 0.0206, "step": 6769 }, { "epoch": 3.006884299355985, "grad_norm": 0.35078016785761906, "learning_rate": 1.761132986727439e-06, "loss": 0.0207, "step": 6770 }, { "epoch": 3.0073284477015325, "grad_norm": 0.36832682307040443, "learning_rate": 1.7596563888007073e-06, "loss": 0.0246, "step": 6771 }, { "epoch": 3.00777259604708, "grad_norm": 0.38464153090067194, "learning_rate": 1.7581802779526642e-06, "loss": 0.0187, "step": 6772 }, { "epoch": 3.008216744392627, "grad_norm": 0.44328209442836686, "learning_rate": 1.7567046544051935e-06, "loss": 0.0254, "step": 6773 }, { "epoch": 3.0086608927381744, "grad_norm": 0.4311223508530591, "learning_rate": 1.7552295183801093e-06, "loss": 0.0282, "step": 6774 }, { "epoch": 3.009105041083722, "grad_norm": 0.4187784150098612, "learning_rate": 1.7537548700991463e-06, "loss": 0.026, "step": 6775 }, { "epoch": 3.0095491894292694, "grad_norm": 0.49752308362720904, "learning_rate": 1.75228070978397e-06, "loss": 0.0333, "step": 6776 }, { "epoch": 3.0099933377748167, "grad_norm": 0.3661955272882175, "learning_rate": 1.750807037656172e-06, "loss": 0.0263, "step": 6777 }, { "epoch": 3.010437486120364, "grad_norm": 0.3951032549450352, "learning_rate": 1.7493338539372701e-06, "loss": 0.0245, "step": 6778 }, { "epoch": 3.0108816344659117, "grad_norm": 0.4288937494263082, "learning_rate": 1.7478611588487098e-06, "loss": 0.0224, "step": 6779 }, { "epoch": 3.011325782811459, "grad_norm": 1.0043569602354414, "learning_rate": 1.7463889526118628e-06, "loss": 0.0321, "step": 6780 }, { "epoch": 3.0117699311570063, "grad_norm": 0.345117052755733, "learning_rate": 1.7449172354480236e-06, "loss": 0.0222, "step": 6781 }, { "epoch": 3.012214079502554, "grad_norm": 0.37263032859522277, "learning_rate": 1.7434460075784183e-06, "loss": 0.0371, "step": 6782 }, { "epoch": 3.0126582278481013, "grad_norm": 0.42797870753405914, "learning_rate": 1.741975269224197e-06, "loss": 0.0275, "step": 6783 }, { "epoch": 3.0131023761936486, "grad_norm": 0.4487558433311281, "learning_rate": 1.7405050206064372e-06, "loss": 0.0278, "step": 6784 }, { "epoch": 3.013546524539196, "grad_norm": 0.3104547304379723, "learning_rate": 1.739035261946142e-06, "loss": 0.0162, "step": 6785 }, { "epoch": 3.0139906728847436, "grad_norm": 0.4274226742898415, "learning_rate": 1.7375659934642425e-06, "loss": 0.0232, "step": 6786 }, { "epoch": 3.014434821230291, "grad_norm": 0.4378508355204737, "learning_rate": 1.7360972153815919e-06, "loss": 0.0263, "step": 6787 }, { "epoch": 3.0148789695758382, "grad_norm": 0.35914322608698723, "learning_rate": 1.7346289279189732e-06, "loss": 0.0236, "step": 6788 }, { "epoch": 3.015323117921386, "grad_norm": 0.3717761462102794, "learning_rate": 1.7331611312970965e-06, "loss": 0.0223, "step": 6789 }, { "epoch": 3.0157672662669333, "grad_norm": 0.49078555792657397, "learning_rate": 1.7316938257365945e-06, "loss": 0.0479, "step": 6790 }, { "epoch": 3.0162114146124805, "grad_norm": 0.5303389635543914, "learning_rate": 1.7302270114580316e-06, "loss": 0.0257, "step": 6791 }, { "epoch": 3.016655562958028, "grad_norm": 0.5963610689982617, "learning_rate": 1.7287606886818914e-06, "loss": 0.025, "step": 6792 }, { "epoch": 3.0170997113035756, "grad_norm": 0.3358156185785528, "learning_rate": 1.7272948576285874e-06, "loss": 0.0259, "step": 6793 }, { "epoch": 3.017543859649123, "grad_norm": 0.44451083436086486, "learning_rate": 1.7258295185184604e-06, "loss": 0.0371, "step": 6794 }, { "epoch": 3.01798800799467, "grad_norm": 0.8409978484969116, "learning_rate": 1.7243646715717754e-06, "loss": 0.0198, "step": 6795 }, { "epoch": 3.0184321563402174, "grad_norm": 0.40598942285053524, "learning_rate": 1.7229003170087232e-06, "loss": 0.0253, "step": 6796 }, { "epoch": 3.018876304685765, "grad_norm": 0.40045735020677964, "learning_rate": 1.7214364550494235e-06, "loss": 0.0305, "step": 6797 }, { "epoch": 3.0193204530313125, "grad_norm": 0.4205212377847487, "learning_rate": 1.7199730859139157e-06, "loss": 0.0233, "step": 6798 }, { "epoch": 3.0197646013768598, "grad_norm": 0.39200127688818526, "learning_rate": 1.7185102098221713e-06, "loss": 0.0225, "step": 6799 }, { "epoch": 3.0202087497224075, "grad_norm": 0.4741877419032876, "learning_rate": 1.717047826994085e-06, "loss": 0.0262, "step": 6800 }, { "epoch": 3.020652898067955, "grad_norm": 0.40147810647504373, "learning_rate": 1.7155859376494776e-06, "loss": 0.0267, "step": 6801 }, { "epoch": 3.021097046413502, "grad_norm": 0.4280335792767891, "learning_rate": 1.7141245420080982e-06, "loss": 0.0242, "step": 6802 }, { "epoch": 3.0215411947590494, "grad_norm": 0.34267631232424817, "learning_rate": 1.7126636402896158e-06, "loss": 0.0228, "step": 6803 }, { "epoch": 3.021985343104597, "grad_norm": 0.517542942077693, "learning_rate": 1.7112032327136296e-06, "loss": 0.0289, "step": 6804 }, { "epoch": 3.0224294914501444, "grad_norm": 0.48842613500647986, "learning_rate": 1.7097433194996654e-06, "loss": 0.0362, "step": 6805 }, { "epoch": 3.0228736397956917, "grad_norm": 0.5122441524272605, "learning_rate": 1.7082839008671714e-06, "loss": 0.0327, "step": 6806 }, { "epoch": 3.023317788141239, "grad_norm": 0.3656414000536619, "learning_rate": 1.706824977035524e-06, "loss": 0.0239, "step": 6807 }, { "epoch": 3.0237619364867867, "grad_norm": 0.35844316810361615, "learning_rate": 1.705366548224025e-06, "loss": 0.022, "step": 6808 }, { "epoch": 3.024206084832334, "grad_norm": 0.5208246273651607, "learning_rate": 1.7039086146518986e-06, "loss": 0.0396, "step": 6809 }, { "epoch": 3.0246502331778813, "grad_norm": 0.35888427376762055, "learning_rate": 1.7024511765382978e-06, "loss": 0.0218, "step": 6810 }, { "epoch": 3.025094381523429, "grad_norm": 0.4996038911163621, "learning_rate": 1.7009942341023012e-06, "loss": 0.031, "step": 6811 }, { "epoch": 3.0255385298689763, "grad_norm": 0.4715140989011031, "learning_rate": 1.699537787562911e-06, "loss": 0.027, "step": 6812 }, { "epoch": 3.0259826782145236, "grad_norm": 0.4700851815071929, "learning_rate": 1.6980818371390567e-06, "loss": 0.0352, "step": 6813 }, { "epoch": 3.026426826560071, "grad_norm": 0.42536546436391975, "learning_rate": 1.6966263830495939e-06, "loss": 0.0307, "step": 6814 }, { "epoch": 3.0268709749056186, "grad_norm": 0.3928810675523391, "learning_rate": 1.6951714255132985e-06, "loss": 0.0317, "step": 6815 }, { "epoch": 3.027315123251166, "grad_norm": 0.36328069775398497, "learning_rate": 1.6937169647488765e-06, "loss": 0.0225, "step": 6816 }, { "epoch": 3.027759271596713, "grad_norm": 0.4176283792585422, "learning_rate": 1.6922630009749592e-06, "loss": 0.0335, "step": 6817 }, { "epoch": 3.0282034199422605, "grad_norm": 0.45941453677809496, "learning_rate": 1.6908095344101016e-06, "loss": 0.0307, "step": 6818 }, { "epoch": 3.0286475682878082, "grad_norm": 0.4022633883354449, "learning_rate": 1.6893565652727857e-06, "loss": 0.0285, "step": 6819 }, { "epoch": 3.0290917166333555, "grad_norm": 0.46606194138773704, "learning_rate": 1.687904093781414e-06, "loss": 0.0346, "step": 6820 }, { "epoch": 3.029535864978903, "grad_norm": 0.3781526947765934, "learning_rate": 1.68645212015432e-06, "loss": 0.0217, "step": 6821 }, { "epoch": 3.0299800133244505, "grad_norm": 0.40815913991276614, "learning_rate": 1.68500064460976e-06, "loss": 0.0303, "step": 6822 }, { "epoch": 3.030424161669998, "grad_norm": 0.42153363035695735, "learning_rate": 1.6835496673659145e-06, "loss": 0.0329, "step": 6823 }, { "epoch": 3.030868310015545, "grad_norm": 0.5257269502253524, "learning_rate": 1.6820991886408911e-06, "loss": 0.0342, "step": 6824 }, { "epoch": 3.0313124583610924, "grad_norm": 0.36375569874164343, "learning_rate": 1.6806492086527226e-06, "loss": 0.0268, "step": 6825 }, { "epoch": 3.03175660670664, "grad_norm": 0.4517518524381332, "learning_rate": 1.6791997276193623e-06, "loss": 0.0291, "step": 6826 }, { "epoch": 3.0322007550521874, "grad_norm": 0.41358501413398874, "learning_rate": 1.6777507457586933e-06, "loss": 0.0263, "step": 6827 }, { "epoch": 3.0326449033977347, "grad_norm": 0.4371773593425646, "learning_rate": 1.6763022632885223e-06, "loss": 0.026, "step": 6828 }, { "epoch": 3.0330890517432825, "grad_norm": 0.41878585193753987, "learning_rate": 1.674854280426581e-06, "loss": 0.0289, "step": 6829 }, { "epoch": 3.0335332000888298, "grad_norm": 0.6072899498645479, "learning_rate": 1.6734067973905272e-06, "loss": 0.046, "step": 6830 }, { "epoch": 3.033977348434377, "grad_norm": 0.37768879943900463, "learning_rate": 1.6719598143979392e-06, "loss": 0.0175, "step": 6831 }, { "epoch": 3.0344214967799243, "grad_norm": 0.42243370237101113, "learning_rate": 1.6705133316663247e-06, "loss": 0.0318, "step": 6832 }, { "epoch": 3.034865645125472, "grad_norm": 0.4272111914281022, "learning_rate": 1.6690673494131143e-06, "loss": 0.0318, "step": 6833 }, { "epoch": 3.0353097934710194, "grad_norm": 0.555542458187917, "learning_rate": 1.6676218678556637e-06, "loss": 0.0333, "step": 6834 }, { "epoch": 3.0357539418165667, "grad_norm": 0.5174806483008364, "learning_rate": 1.6661768872112544e-06, "loss": 0.0257, "step": 6835 }, { "epoch": 3.036198090162114, "grad_norm": 0.40261446781293897, "learning_rate": 1.6647324076970917e-06, "loss": 0.0271, "step": 6836 }, { "epoch": 3.0366422385076617, "grad_norm": 0.28515858656757614, "learning_rate": 1.663288429530303e-06, "loss": 0.0139, "step": 6837 }, { "epoch": 3.037086386853209, "grad_norm": 0.3801745026945722, "learning_rate": 1.661844952927944e-06, "loss": 0.0278, "step": 6838 }, { "epoch": 3.0375305351987563, "grad_norm": 0.5653418682210891, "learning_rate": 1.660401978106994e-06, "loss": 0.0356, "step": 6839 }, { "epoch": 3.037974683544304, "grad_norm": 0.4351497414805404, "learning_rate": 1.6589595052843567e-06, "loss": 0.0255, "step": 6840 }, { "epoch": 3.0384188318898513, "grad_norm": 0.36924524660599933, "learning_rate": 1.6575175346768597e-06, "loss": 0.0213, "step": 6841 }, { "epoch": 3.0388629802353986, "grad_norm": 0.36560298465093416, "learning_rate": 1.6560760665012581e-06, "loss": 0.0291, "step": 6842 }, { "epoch": 3.039307128580946, "grad_norm": 0.5231650605635862, "learning_rate": 1.6546351009742252e-06, "loss": 0.0416, "step": 6843 }, { "epoch": 3.0397512769264936, "grad_norm": 0.40002607438188575, "learning_rate": 1.6531946383123647e-06, "loss": 0.0279, "step": 6844 }, { "epoch": 3.040195425272041, "grad_norm": 0.3820335905823042, "learning_rate": 1.6517546787322019e-06, "loss": 0.0332, "step": 6845 }, { "epoch": 3.040639573617588, "grad_norm": 0.4052075541407361, "learning_rate": 1.6503152224501883e-06, "loss": 0.0273, "step": 6846 }, { "epoch": 3.0410837219631355, "grad_norm": 0.40616006159309487, "learning_rate": 1.6488762696826992e-06, "loss": 0.029, "step": 6847 }, { "epoch": 3.041527870308683, "grad_norm": 0.41149108296545567, "learning_rate": 1.6474378206460306e-06, "loss": 0.0239, "step": 6848 }, { "epoch": 3.0419720186542305, "grad_norm": 0.35312483842889825, "learning_rate": 1.6459998755564078e-06, "loss": 0.0265, "step": 6849 }, { "epoch": 3.042416166999778, "grad_norm": 0.44423084298706683, "learning_rate": 1.64456243462998e-06, "loss": 0.0214, "step": 6850 }, { "epoch": 3.0428603153453255, "grad_norm": 0.3441262864593607, "learning_rate": 1.6431254980828137e-06, "loss": 0.0261, "step": 6851 }, { "epoch": 3.043304463690873, "grad_norm": 0.5333130750896956, "learning_rate": 1.6416890661309098e-06, "loss": 0.0345, "step": 6852 }, { "epoch": 3.04374861203642, "grad_norm": 0.32916273546434566, "learning_rate": 1.6402531389901894e-06, "loss": 0.0196, "step": 6853 }, { "epoch": 3.0441927603819674, "grad_norm": 0.4575672416898921, "learning_rate": 1.6388177168764919e-06, "loss": 0.0316, "step": 6854 }, { "epoch": 3.044636908727515, "grad_norm": 0.4231274981211455, "learning_rate": 1.6373828000055886e-06, "loss": 0.0267, "step": 6855 }, { "epoch": 3.0450810570730624, "grad_norm": 0.35267922234888455, "learning_rate": 1.6359483885931709e-06, "loss": 0.0224, "step": 6856 }, { "epoch": 3.0455252054186097, "grad_norm": 0.4270485321761482, "learning_rate": 1.634514482854856e-06, "loss": 0.0215, "step": 6857 }, { "epoch": 3.045969353764157, "grad_norm": 0.45838498705619496, "learning_rate": 1.6330810830061833e-06, "loss": 0.0259, "step": 6858 }, { "epoch": 3.0464135021097047, "grad_norm": 0.39622697822045655, "learning_rate": 1.6316481892626202e-06, "loss": 0.0227, "step": 6859 }, { "epoch": 3.046857650455252, "grad_norm": 0.4767421948555728, "learning_rate": 1.6302158018395504e-06, "loss": 0.021, "step": 6860 }, { "epoch": 3.0473017988007993, "grad_norm": 0.4108572908804955, "learning_rate": 1.6287839209522883e-06, "loss": 0.0243, "step": 6861 }, { "epoch": 3.047745947146347, "grad_norm": 0.3361055439360816, "learning_rate": 1.62735254681607e-06, "loss": 0.016, "step": 6862 }, { "epoch": 3.0481900954918943, "grad_norm": 0.4890820081997742, "learning_rate": 1.6259216796460553e-06, "loss": 0.0332, "step": 6863 }, { "epoch": 3.0486342438374416, "grad_norm": 0.3446651596488856, "learning_rate": 1.6244913196573291e-06, "loss": 0.0231, "step": 6864 }, { "epoch": 3.049078392182989, "grad_norm": 0.4912264351287514, "learning_rate": 1.623061467064896e-06, "loss": 0.0272, "step": 6865 }, { "epoch": 3.0495225405285367, "grad_norm": 0.4925885654573531, "learning_rate": 1.6216321220836885e-06, "loss": 0.0248, "step": 6866 }, { "epoch": 3.049966688874084, "grad_norm": 0.3824243589770029, "learning_rate": 1.6202032849285626e-06, "loss": 0.0225, "step": 6867 }, { "epoch": 3.0504108372196312, "grad_norm": 0.36019367540666763, "learning_rate": 1.618774955814293e-06, "loss": 0.0179, "step": 6868 }, { "epoch": 3.050854985565179, "grad_norm": 0.5143104427412662, "learning_rate": 1.6173471349555858e-06, "loss": 0.0376, "step": 6869 }, { "epoch": 3.0512991339107263, "grad_norm": 0.5934473311861377, "learning_rate": 1.6159198225670676e-06, "loss": 0.0448, "step": 6870 }, { "epoch": 3.0517432822562736, "grad_norm": 0.45164863726044957, "learning_rate": 1.6144930188632835e-06, "loss": 0.0278, "step": 6871 }, { "epoch": 3.052187430601821, "grad_norm": 0.5798490797388325, "learning_rate": 1.6130667240587083e-06, "loss": 0.0297, "step": 6872 }, { "epoch": 3.0526315789473686, "grad_norm": 0.38501590071391395, "learning_rate": 1.6116409383677383e-06, "loss": 0.0244, "step": 6873 }, { "epoch": 3.053075727292916, "grad_norm": 0.4626822552615348, "learning_rate": 1.6102156620046937e-06, "loss": 0.0234, "step": 6874 }, { "epoch": 3.053519875638463, "grad_norm": 0.4709097565849538, "learning_rate": 1.6087908951838193e-06, "loss": 0.0325, "step": 6875 }, { "epoch": 3.0539640239840105, "grad_norm": 0.38512614079510565, "learning_rate": 1.6073666381192777e-06, "loss": 0.0185, "step": 6876 }, { "epoch": 3.054408172329558, "grad_norm": 0.3981638223693639, "learning_rate": 1.6059428910251617e-06, "loss": 0.0297, "step": 6877 }, { "epoch": 3.0548523206751055, "grad_norm": 0.40972645998312435, "learning_rate": 1.604519654115484e-06, "loss": 0.0217, "step": 6878 }, { "epoch": 3.0552964690206528, "grad_norm": 0.44788376402765917, "learning_rate": 1.6030969276041813e-06, "loss": 0.0255, "step": 6879 }, { "epoch": 3.0557406173662005, "grad_norm": 0.45493435461264287, "learning_rate": 1.6016747117051135e-06, "loss": 0.0367, "step": 6880 }, { "epoch": 3.056184765711748, "grad_norm": 0.5367527315236188, "learning_rate": 1.6002530066320659e-06, "loss": 0.0386, "step": 6881 }, { "epoch": 3.056628914057295, "grad_norm": 0.36716772288492844, "learning_rate": 1.5988318125987412e-06, "loss": 0.0243, "step": 6882 }, { "epoch": 3.0570730624028424, "grad_norm": 0.39700109646152437, "learning_rate": 1.597411129818771e-06, "loss": 0.0331, "step": 6883 }, { "epoch": 3.05751721074839, "grad_norm": 0.4200578523175907, "learning_rate": 1.5959909585057099e-06, "loss": 0.0254, "step": 6884 }, { "epoch": 3.0579613590939374, "grad_norm": 0.5911121451638379, "learning_rate": 1.5945712988730278e-06, "loss": 0.0418, "step": 6885 }, { "epoch": 3.0584055074394847, "grad_norm": 0.3316421396306147, "learning_rate": 1.5931521511341292e-06, "loss": 0.0266, "step": 6886 }, { "epoch": 3.058849655785032, "grad_norm": 0.38444781154877616, "learning_rate": 1.5917335155023368e-06, "loss": 0.018, "step": 6887 }, { "epoch": 3.0592938041305797, "grad_norm": 0.38538195654260493, "learning_rate": 1.590315392190891e-06, "loss": 0.0223, "step": 6888 }, { "epoch": 3.059737952476127, "grad_norm": 0.3188424830635555, "learning_rate": 1.5888977814129625e-06, "loss": 0.0162, "step": 6889 }, { "epoch": 3.0601821008216743, "grad_norm": 0.5604048534911154, "learning_rate": 1.5874806833816436e-06, "loss": 0.0451, "step": 6890 }, { "epoch": 3.060626249167222, "grad_norm": 0.4396556865933855, "learning_rate": 1.5860640983099435e-06, "loss": 0.027, "step": 6891 }, { "epoch": 3.0610703975127693, "grad_norm": 0.5338475568401356, "learning_rate": 1.584648026410805e-06, "loss": 0.0428, "step": 6892 }, { "epoch": 3.0615145458583166, "grad_norm": 0.41364738868710765, "learning_rate": 1.583232467897083e-06, "loss": 0.0255, "step": 6893 }, { "epoch": 3.061958694203864, "grad_norm": 0.44128675402771145, "learning_rate": 1.581817422981562e-06, "loss": 0.0266, "step": 6894 }, { "epoch": 3.0624028425494116, "grad_norm": 0.40811877248427353, "learning_rate": 1.5804028918769488e-06, "loss": 0.0202, "step": 6895 }, { "epoch": 3.062846990894959, "grad_norm": 0.38420681148774133, "learning_rate": 1.5789888747958666e-06, "loss": 0.0299, "step": 6896 }, { "epoch": 3.0632911392405062, "grad_norm": 0.3556137019009215, "learning_rate": 1.5775753719508708e-06, "loss": 0.021, "step": 6897 }, { "epoch": 3.063735287586054, "grad_norm": 0.2882748081505788, "learning_rate": 1.5761623835544348e-06, "loss": 0.0258, "step": 6898 }, { "epoch": 3.0641794359316012, "grad_norm": 0.3787886197229445, "learning_rate": 1.5747499098189524e-06, "loss": 0.0281, "step": 6899 }, { "epoch": 3.0646235842771485, "grad_norm": 0.3354175757689207, "learning_rate": 1.5733379509567426e-06, "loss": 0.0234, "step": 6900 }, { "epoch": 3.065067732622696, "grad_norm": 0.3209071922045546, "learning_rate": 1.5719265071800498e-06, "loss": 0.0182, "step": 6901 }, { "epoch": 3.0655118809682436, "grad_norm": 0.43501967745626874, "learning_rate": 1.5705155787010324e-06, "loss": 0.0353, "step": 6902 }, { "epoch": 3.065956029313791, "grad_norm": 0.4929319158649113, "learning_rate": 1.5691051657317835e-06, "loss": 0.0271, "step": 6903 }, { "epoch": 3.066400177659338, "grad_norm": 0.3584810561885387, "learning_rate": 1.5676952684843072e-06, "loss": 0.0258, "step": 6904 }, { "epoch": 3.0668443260048854, "grad_norm": 0.4638657269405188, "learning_rate": 1.5662858871705366e-06, "loss": 0.0236, "step": 6905 }, { "epoch": 3.067288474350433, "grad_norm": 0.5065060680688717, "learning_rate": 1.5648770220023263e-06, "loss": 0.0287, "step": 6906 }, { "epoch": 3.0677326226959805, "grad_norm": 0.4859275697928332, "learning_rate": 1.5634686731914533e-06, "loss": 0.0434, "step": 6907 }, { "epoch": 3.0681767710415278, "grad_norm": 0.38551762262809786, "learning_rate": 1.562060840949612e-06, "loss": 0.0265, "step": 6908 }, { "epoch": 3.0686209193870755, "grad_norm": 0.6596634602131578, "learning_rate": 1.5606535254884297e-06, "loss": 0.0343, "step": 6909 }, { "epoch": 3.0690650677326228, "grad_norm": 0.40308217569658306, "learning_rate": 1.5592467270194456e-06, "loss": 0.0306, "step": 6910 }, { "epoch": 3.06950921607817, "grad_norm": 0.6298585513540671, "learning_rate": 1.5578404457541264e-06, "loss": 0.0276, "step": 6911 }, { "epoch": 3.0699533644237174, "grad_norm": 0.6519358046715888, "learning_rate": 1.5564346819038616e-06, "loss": 0.0335, "step": 6912 }, { "epoch": 3.070397512769265, "grad_norm": 0.4424893479760814, "learning_rate": 1.5550294356799573e-06, "loss": 0.0249, "step": 6913 }, { "epoch": 3.0708416611148124, "grad_norm": 0.44065297318246016, "learning_rate": 1.55362470729365e-06, "loss": 0.031, "step": 6914 }, { "epoch": 3.0712858094603597, "grad_norm": 0.3171014816611921, "learning_rate": 1.5522204969560945e-06, "loss": 0.0228, "step": 6915 }, { "epoch": 3.071729957805907, "grad_norm": 0.48982852285102907, "learning_rate": 1.5508168048783645e-06, "loss": 0.0243, "step": 6916 }, { "epoch": 3.0721741061514547, "grad_norm": 0.36638227513318433, "learning_rate": 1.5494136312714598e-06, "loss": 0.0209, "step": 6917 }, { "epoch": 3.072618254497002, "grad_norm": 0.40243990074588076, "learning_rate": 1.5480109763463031e-06, "loss": 0.0323, "step": 6918 }, { "epoch": 3.0730624028425493, "grad_norm": 0.6138342320939462, "learning_rate": 1.5466088403137326e-06, "loss": 0.0303, "step": 6919 }, { "epoch": 3.073506551188097, "grad_norm": 0.41638032443840284, "learning_rate": 1.5452072233845194e-06, "loss": 0.0302, "step": 6920 }, { "epoch": 3.0739506995336443, "grad_norm": 0.3958771196139635, "learning_rate": 1.5438061257693459e-06, "loss": 0.025, "step": 6921 }, { "epoch": 3.0743948478791916, "grad_norm": 0.409201101395247, "learning_rate": 1.5424055476788219e-06, "loss": 0.0281, "step": 6922 }, { "epoch": 3.074838996224739, "grad_norm": 0.36270140865155653, "learning_rate": 1.54100548932348e-06, "loss": 0.0211, "step": 6923 }, { "epoch": 3.0752831445702866, "grad_norm": 0.3939239641466573, "learning_rate": 1.5396059509137694e-06, "loss": 0.0221, "step": 6924 }, { "epoch": 3.075727292915834, "grad_norm": 0.3910124202096161, "learning_rate": 1.5382069326600645e-06, "loss": 0.0247, "step": 6925 }, { "epoch": 3.076171441261381, "grad_norm": 0.4045068362861821, "learning_rate": 1.536808434772667e-06, "loss": 0.0252, "step": 6926 }, { "epoch": 3.076615589606929, "grad_norm": 0.48842679698944363, "learning_rate": 1.5354104574617889e-06, "loss": 0.0272, "step": 6927 }, { "epoch": 3.0770597379524762, "grad_norm": 0.5568048405499652, "learning_rate": 1.5340130009375725e-06, "loss": 0.0271, "step": 6928 }, { "epoch": 3.0775038862980235, "grad_norm": 0.35344319652310796, "learning_rate": 1.5326160654100803e-06, "loss": 0.0203, "step": 6929 }, { "epoch": 3.077948034643571, "grad_norm": 0.41134278821618964, "learning_rate": 1.5312196510892907e-06, "loss": 0.0203, "step": 6930 }, { "epoch": 3.0783921829891185, "grad_norm": 0.43439941775477803, "learning_rate": 1.529823758185115e-06, "loss": 0.0307, "step": 6931 }, { "epoch": 3.078836331334666, "grad_norm": 0.36710103440734276, "learning_rate": 1.5284283869073753e-06, "loss": 0.0226, "step": 6932 }, { "epoch": 3.079280479680213, "grad_norm": 0.3795190028300577, "learning_rate": 1.5270335374658202e-06, "loss": 0.0193, "step": 6933 }, { "epoch": 3.0797246280257604, "grad_norm": 0.416086597648837, "learning_rate": 1.5256392100701201e-06, "loss": 0.0341, "step": 6934 }, { "epoch": 3.080168776371308, "grad_norm": 0.47903570547844193, "learning_rate": 1.5242454049298672e-06, "loss": 0.0326, "step": 6935 }, { "epoch": 3.0806129247168554, "grad_norm": 0.41924576417637155, "learning_rate": 1.5228521222545694e-06, "loss": 0.0232, "step": 6936 }, { "epoch": 3.0810570730624027, "grad_norm": 0.4086739351412735, "learning_rate": 1.5214593622536677e-06, "loss": 0.0191, "step": 6937 }, { "epoch": 3.0815012214079505, "grad_norm": 0.42090655404898863, "learning_rate": 1.5200671251365118e-06, "loss": 0.0234, "step": 6938 }, { "epoch": 3.0819453697534978, "grad_norm": 0.4491694516637582, "learning_rate": 1.5186754111123814e-06, "loss": 0.0309, "step": 6939 }, { "epoch": 3.082389518099045, "grad_norm": 0.3593852978556601, "learning_rate": 1.5172842203904752e-06, "loss": 0.0219, "step": 6940 }, { "epoch": 3.0828336664445923, "grad_norm": 0.4142188627731481, "learning_rate": 1.5158935531799102e-06, "loss": 0.0231, "step": 6941 }, { "epoch": 3.08327781479014, "grad_norm": 0.3433910778382263, "learning_rate": 1.5145034096897271e-06, "loss": 0.0232, "step": 6942 }, { "epoch": 3.0837219631356874, "grad_norm": 0.42040065985676595, "learning_rate": 1.5131137901288928e-06, "loss": 0.0274, "step": 6943 }, { "epoch": 3.0841661114812347, "grad_norm": 0.37811344114245793, "learning_rate": 1.5117246947062864e-06, "loss": 0.0278, "step": 6944 }, { "epoch": 3.084610259826782, "grad_norm": 0.6332645566079246, "learning_rate": 1.5103361236307135e-06, "loss": 0.0199, "step": 6945 }, { "epoch": 3.0850544081723297, "grad_norm": 0.3367889244804879, "learning_rate": 1.5089480771109021e-06, "loss": 0.0221, "step": 6946 }, { "epoch": 3.085498556517877, "grad_norm": 0.45979021485293203, "learning_rate": 1.507560555355494e-06, "loss": 0.0259, "step": 6947 }, { "epoch": 3.0859427048634243, "grad_norm": 0.3846307657242651, "learning_rate": 1.5061735585730636e-06, "loss": 0.0252, "step": 6948 }, { "epoch": 3.086386853208972, "grad_norm": 0.36534660664962226, "learning_rate": 1.504787086972096e-06, "loss": 0.0173, "step": 6949 }, { "epoch": 3.0868310015545193, "grad_norm": 0.6011247419581762, "learning_rate": 1.5034011407610021e-06, "loss": 0.0399, "step": 6950 }, { "epoch": 3.0872751499000666, "grad_norm": 0.4220887506749395, "learning_rate": 1.502015720148115e-06, "loss": 0.0265, "step": 6951 }, { "epoch": 3.087719298245614, "grad_norm": 0.383622304930661, "learning_rate": 1.5006308253416846e-06, "loss": 0.0235, "step": 6952 }, { "epoch": 3.0881634465911616, "grad_norm": 0.48234757617698376, "learning_rate": 1.4992464565498831e-06, "loss": 0.0354, "step": 6953 }, { "epoch": 3.088607594936709, "grad_norm": 0.4645377835459482, "learning_rate": 1.4978626139808094e-06, "loss": 0.0332, "step": 6954 }, { "epoch": 3.089051743282256, "grad_norm": 0.4061976190338302, "learning_rate": 1.4964792978424746e-06, "loss": 0.0231, "step": 6955 }, { "epoch": 3.089495891627804, "grad_norm": 0.5482133012861865, "learning_rate": 1.495096508342816e-06, "loss": 0.0369, "step": 6956 }, { "epoch": 3.089940039973351, "grad_norm": 0.3661650923331457, "learning_rate": 1.4937142456896907e-06, "loss": 0.0231, "step": 6957 }, { "epoch": 3.0903841883188985, "grad_norm": 0.4124097316559942, "learning_rate": 1.4923325100908749e-06, "loss": 0.026, "step": 6958 }, { "epoch": 3.090828336664446, "grad_norm": 0.5506003037235153, "learning_rate": 1.490951301754066e-06, "loss": 0.0272, "step": 6959 }, { "epoch": 3.0912724850099935, "grad_norm": 0.39939980904986866, "learning_rate": 1.4895706208868876e-06, "loss": 0.028, "step": 6960 }, { "epoch": 3.091716633355541, "grad_norm": 0.46904750344961466, "learning_rate": 1.4881904676968756e-06, "loss": 0.0258, "step": 6961 }, { "epoch": 3.092160781701088, "grad_norm": 0.46041949493275547, "learning_rate": 1.4868108423914913e-06, "loss": 0.0284, "step": 6962 }, { "epoch": 3.0926049300466354, "grad_norm": 0.3681186282877754, "learning_rate": 1.4854317451781175e-06, "loss": 0.0232, "step": 6963 }, { "epoch": 3.093049078392183, "grad_norm": 0.4723461677240559, "learning_rate": 1.4840531762640524e-06, "loss": 0.0277, "step": 6964 }, { "epoch": 3.0934932267377304, "grad_norm": 0.38175424189288976, "learning_rate": 1.4826751358565211e-06, "loss": 0.019, "step": 6965 }, { "epoch": 3.0939373750832777, "grad_norm": 0.41086502374434375, "learning_rate": 1.4812976241626659e-06, "loss": 0.0287, "step": 6966 }, { "epoch": 3.0943815234288254, "grad_norm": 0.39348852384191785, "learning_rate": 1.4799206413895494e-06, "loss": 0.026, "step": 6967 }, { "epoch": 3.0948256717743727, "grad_norm": 0.4897251060835583, "learning_rate": 1.4785441877441587e-06, "loss": 0.0283, "step": 6968 }, { "epoch": 3.09526982011992, "grad_norm": 0.396902884934688, "learning_rate": 1.4771682634333933e-06, "loss": 0.0241, "step": 6969 }, { "epoch": 3.0957139684654673, "grad_norm": 0.6024718135186303, "learning_rate": 1.4757928686640788e-06, "loss": 0.0324, "step": 6970 }, { "epoch": 3.096158116811015, "grad_norm": 0.3483981306238304, "learning_rate": 1.4744180036429656e-06, "loss": 0.0203, "step": 6971 }, { "epoch": 3.0966022651565623, "grad_norm": 0.45268776356014967, "learning_rate": 1.4730436685767135e-06, "loss": 0.0302, "step": 6972 }, { "epoch": 3.0970464135021096, "grad_norm": 0.5030795577403537, "learning_rate": 1.4716698636719107e-06, "loss": 0.0291, "step": 6973 }, { "epoch": 3.097490561847657, "grad_norm": 0.4396401120191148, "learning_rate": 1.470296589135065e-06, "loss": 0.0342, "step": 6974 }, { "epoch": 3.0979347101932047, "grad_norm": 0.3979132301675201, "learning_rate": 1.4689238451725995e-06, "loss": 0.0201, "step": 6975 }, { "epoch": 3.098378858538752, "grad_norm": 0.3787151125083963, "learning_rate": 1.4675516319908629e-06, "loss": 0.0257, "step": 6976 }, { "epoch": 3.0988230068842992, "grad_norm": 0.37709012703404926, "learning_rate": 1.466179949796121e-06, "loss": 0.031, "step": 6977 }, { "epoch": 3.099267155229847, "grad_norm": 0.46313083842641917, "learning_rate": 1.4648087987945625e-06, "loss": 0.0187, "step": 6978 }, { "epoch": 3.0997113035753943, "grad_norm": 0.41171533054686754, "learning_rate": 1.4634381791922936e-06, "loss": 0.0265, "step": 6979 }, { "epoch": 3.1001554519209416, "grad_norm": 0.35930958286366377, "learning_rate": 1.4620680911953433e-06, "loss": 0.0287, "step": 6980 }, { "epoch": 3.100599600266489, "grad_norm": 0.5934414847112055, "learning_rate": 1.460698535009657e-06, "loss": 0.021, "step": 6981 }, { "epoch": 3.1010437486120366, "grad_norm": 0.4183538172458836, "learning_rate": 1.4593295108411027e-06, "loss": 0.0239, "step": 6982 }, { "epoch": 3.101487896957584, "grad_norm": 0.3854427898688108, "learning_rate": 1.4579610188954685e-06, "loss": 0.0274, "step": 6983 }, { "epoch": 3.101932045303131, "grad_norm": 0.4711207189434762, "learning_rate": 1.4565930593784616e-06, "loss": 0.0282, "step": 6984 }, { "epoch": 3.1023761936486784, "grad_norm": 0.4616599969953312, "learning_rate": 1.455225632495712e-06, "loss": 0.029, "step": 6985 }, { "epoch": 3.102820341994226, "grad_norm": 0.4076291882386512, "learning_rate": 1.453858738452763e-06, "loss": 0.0326, "step": 6986 }, { "epoch": 3.1032644903397735, "grad_norm": 0.31683467511767754, "learning_rate": 1.4524923774550825e-06, "loss": 0.0176, "step": 6987 }, { "epoch": 3.1037086386853208, "grad_norm": 0.39424949712455315, "learning_rate": 1.4511265497080624e-06, "loss": 0.0302, "step": 6988 }, { "epoch": 3.1041527870308685, "grad_norm": 0.4103257196866521, "learning_rate": 1.4497612554170054e-06, "loss": 0.0273, "step": 6989 }, { "epoch": 3.104596935376416, "grad_norm": 0.39987357613853897, "learning_rate": 1.4483964947871392e-06, "loss": 0.0255, "step": 6990 }, { "epoch": 3.105041083721963, "grad_norm": 0.41100905630593115, "learning_rate": 1.4470322680236132e-06, "loss": 0.0302, "step": 6991 }, { "epoch": 3.1054852320675104, "grad_norm": 0.4415871306778436, "learning_rate": 1.4456685753314898e-06, "loss": 0.0248, "step": 6992 }, { "epoch": 3.105929380413058, "grad_norm": 0.3985527776293508, "learning_rate": 1.4443054169157566e-06, "loss": 0.024, "step": 6993 }, { "epoch": 3.1063735287586054, "grad_norm": 0.5025368665991552, "learning_rate": 1.4429427929813205e-06, "loss": 0.0327, "step": 6994 }, { "epoch": 3.1068176771041527, "grad_norm": 0.543210885126374, "learning_rate": 1.4415807037330065e-06, "loss": 0.0244, "step": 6995 }, { "epoch": 3.1072618254497, "grad_norm": 0.5795921769157866, "learning_rate": 1.4402191493755614e-06, "loss": 0.0319, "step": 6996 }, { "epoch": 3.1077059737952477, "grad_norm": 0.33181453971757086, "learning_rate": 1.4388581301136463e-06, "loss": 0.0239, "step": 6997 }, { "epoch": 3.108150122140795, "grad_norm": 0.35538456672100055, "learning_rate": 1.4374976461518475e-06, "loss": 0.0202, "step": 6998 }, { "epoch": 3.1085942704863423, "grad_norm": 0.30394370356689626, "learning_rate": 1.436137697694669e-06, "loss": 0.0148, "step": 6999 }, { "epoch": 3.10903841883189, "grad_norm": 0.5358782537926752, "learning_rate": 1.4347782849465335e-06, "loss": 0.0361, "step": 7000 }, { "epoch": 3.1094825671774373, "grad_norm": 0.45423657841045967, "learning_rate": 1.4334194081117853e-06, "loss": 0.0296, "step": 7001 }, { "epoch": 3.1099267155229846, "grad_norm": 0.391451201654507, "learning_rate": 1.4320610673946862e-06, "loss": 0.0219, "step": 7002 }, { "epoch": 3.110370863868532, "grad_norm": 0.3695407430957425, "learning_rate": 1.4307032629994162e-06, "loss": 0.0275, "step": 7003 }, { "epoch": 3.1108150122140796, "grad_norm": 0.3522320469665786, "learning_rate": 1.4293459951300775e-06, "loss": 0.0191, "step": 7004 }, { "epoch": 3.111259160559627, "grad_norm": 0.39441008031342695, "learning_rate": 1.4279892639906906e-06, "loss": 0.022, "step": 7005 }, { "epoch": 3.111703308905174, "grad_norm": 0.3604951087918798, "learning_rate": 1.4266330697851955e-06, "loss": 0.0291, "step": 7006 }, { "epoch": 3.112147457250722, "grad_norm": 0.3966852264525251, "learning_rate": 1.4252774127174502e-06, "loss": 0.0336, "step": 7007 }, { "epoch": 3.1125916055962692, "grad_norm": 0.4920706934740084, "learning_rate": 1.4239222929912354e-06, "loss": 0.0339, "step": 7008 }, { "epoch": 3.1130357539418165, "grad_norm": 0.452968414032015, "learning_rate": 1.422567710810246e-06, "loss": 0.0334, "step": 7009 }, { "epoch": 3.113479902287364, "grad_norm": 0.34946018464557227, "learning_rate": 1.421213666378099e-06, "loss": 0.0259, "step": 7010 }, { "epoch": 3.1139240506329116, "grad_norm": 0.3785739486603564, "learning_rate": 1.419860159898331e-06, "loss": 0.0248, "step": 7011 }, { "epoch": 3.114368198978459, "grad_norm": 0.48985400850238237, "learning_rate": 1.418507191574397e-06, "loss": 0.0245, "step": 7012 }, { "epoch": 3.114812347324006, "grad_norm": 0.37256560600518596, "learning_rate": 1.4171547616096726e-06, "loss": 0.0263, "step": 7013 }, { "epoch": 3.1152564956695534, "grad_norm": 0.4745149660869735, "learning_rate": 1.4158028702074478e-06, "loss": 0.0315, "step": 7014 }, { "epoch": 3.115700644015101, "grad_norm": 0.4272234241381395, "learning_rate": 1.4144515175709366e-06, "loss": 0.0359, "step": 7015 }, { "epoch": 3.1161447923606485, "grad_norm": 0.4717714680449159, "learning_rate": 1.4131007039032702e-06, "loss": 0.0247, "step": 7016 }, { "epoch": 3.1165889407061957, "grad_norm": 0.3661175444111234, "learning_rate": 1.4117504294074985e-06, "loss": 0.0199, "step": 7017 }, { "epoch": 3.1170330890517435, "grad_norm": 0.39156431702822175, "learning_rate": 1.4104006942865911e-06, "loss": 0.0225, "step": 7018 }, { "epoch": 3.1174772373972908, "grad_norm": 0.4841602294941045, "learning_rate": 1.4090514987434372e-06, "loss": 0.0224, "step": 7019 }, { "epoch": 3.117921385742838, "grad_norm": 0.3409883421268121, "learning_rate": 1.4077028429808415e-06, "loss": 0.0184, "step": 7020 }, { "epoch": 3.1183655340883853, "grad_norm": 0.4571971842074867, "learning_rate": 1.4063547272015305e-06, "loss": 0.0334, "step": 7021 }, { "epoch": 3.118809682433933, "grad_norm": 0.4859572155144659, "learning_rate": 1.4050071516081499e-06, "loss": 0.0236, "step": 7022 }, { "epoch": 3.1192538307794804, "grad_norm": 0.36054003070051327, "learning_rate": 1.4036601164032626e-06, "loss": 0.0244, "step": 7023 }, { "epoch": 3.1196979791250277, "grad_norm": 0.4164627314286096, "learning_rate": 1.4023136217893518e-06, "loss": 0.0276, "step": 7024 }, { "epoch": 3.120142127470575, "grad_norm": 0.5468105573468193, "learning_rate": 1.4009676679688167e-06, "loss": 0.0344, "step": 7025 }, { "epoch": 3.1205862758161227, "grad_norm": 0.5699602944440265, "learning_rate": 1.399622255143978e-06, "loss": 0.0227, "step": 7026 }, { "epoch": 3.12103042416167, "grad_norm": 0.3763296846537518, "learning_rate": 1.3982773835170738e-06, "loss": 0.0233, "step": 7027 }, { "epoch": 3.1214745725072173, "grad_norm": 0.3809901917507222, "learning_rate": 1.396933053290262e-06, "loss": 0.0261, "step": 7028 }, { "epoch": 3.121918720852765, "grad_norm": 0.3942636268702358, "learning_rate": 1.3955892646656172e-06, "loss": 0.0244, "step": 7029 }, { "epoch": 3.1223628691983123, "grad_norm": 0.4034606683568663, "learning_rate": 1.3942460178451357e-06, "loss": 0.0267, "step": 7030 }, { "epoch": 3.1228070175438596, "grad_norm": 0.4219843417449712, "learning_rate": 1.3929033130307273e-06, "loss": 0.0291, "step": 7031 }, { "epoch": 3.123251165889407, "grad_norm": 0.39268086756994935, "learning_rate": 1.3915611504242248e-06, "loss": 0.0242, "step": 7032 }, { "epoch": 3.1236953142349546, "grad_norm": 0.41199677628842846, "learning_rate": 1.390219530227378e-06, "loss": 0.0284, "step": 7033 }, { "epoch": 3.124139462580502, "grad_norm": 0.39148197640312005, "learning_rate": 1.3888784526418552e-06, "loss": 0.0268, "step": 7034 }, { "epoch": 3.124583610926049, "grad_norm": 0.6462152373026261, "learning_rate": 1.3875379178692433e-06, "loss": 0.042, "step": 7035 }, { "epoch": 3.125027759271597, "grad_norm": 0.5120068554721335, "learning_rate": 1.3861979261110493e-06, "loss": 0.0352, "step": 7036 }, { "epoch": 3.125471907617144, "grad_norm": 0.5096494527748306, "learning_rate": 1.3848584775686923e-06, "loss": 0.0329, "step": 7037 }, { "epoch": 3.1259160559626915, "grad_norm": 0.49019540876863293, "learning_rate": 1.3835195724435175e-06, "loss": 0.0274, "step": 7038 }, { "epoch": 3.126360204308239, "grad_norm": 0.4378287851338807, "learning_rate": 1.3821812109367838e-06, "loss": 0.0261, "step": 7039 }, { "epoch": 3.1268043526537865, "grad_norm": 0.4665401569139363, "learning_rate": 1.38084339324967e-06, "loss": 0.0244, "step": 7040 }, { "epoch": 3.127248500999334, "grad_norm": 0.4839568704489146, "learning_rate": 1.3795061195832749e-06, "loss": 0.0185, "step": 7041 }, { "epoch": 3.127692649344881, "grad_norm": 0.38633411841281007, "learning_rate": 1.3781693901386094e-06, "loss": 0.0206, "step": 7042 }, { "epoch": 3.1281367976904284, "grad_norm": 0.3852498984003089, "learning_rate": 1.3768332051166089e-06, "loss": 0.023, "step": 7043 }, { "epoch": 3.128580946035976, "grad_norm": 0.3527312929509418, "learning_rate": 1.3754975647181245e-06, "loss": 0.0272, "step": 7044 }, { "epoch": 3.1290250943815234, "grad_norm": 0.4078162247196801, "learning_rate": 1.374162469143926e-06, "loss": 0.0184, "step": 7045 }, { "epoch": 3.1294692427270707, "grad_norm": 0.6947202101134006, "learning_rate": 1.3728279185947002e-06, "loss": 0.0287, "step": 7046 }, { "epoch": 3.1299133910726185, "grad_norm": 0.3650330793591096, "learning_rate": 1.3714939132710547e-06, "loss": 0.0203, "step": 7047 }, { "epoch": 3.1303575394181657, "grad_norm": 0.3894198333673228, "learning_rate": 1.3701604533735102e-06, "loss": 0.0383, "step": 7048 }, { "epoch": 3.130801687763713, "grad_norm": 0.3626357108946466, "learning_rate": 1.3688275391025096e-06, "loss": 0.019, "step": 7049 }, { "epoch": 3.1312458361092603, "grad_norm": 0.421447173128161, "learning_rate": 1.3674951706584134e-06, "loss": 0.0182, "step": 7050 }, { "epoch": 3.131689984454808, "grad_norm": 0.4627577920703621, "learning_rate": 1.3661633482414977e-06, "loss": 0.0265, "step": 7051 }, { "epoch": 3.1321341328003554, "grad_norm": 0.43244182202980463, "learning_rate": 1.3648320720519592e-06, "loss": 0.0196, "step": 7052 }, { "epoch": 3.1325782811459026, "grad_norm": 0.4592125815303857, "learning_rate": 1.3635013422899124e-06, "loss": 0.0298, "step": 7053 }, { "epoch": 3.13302242949145, "grad_norm": 0.44770730468191844, "learning_rate": 1.3621711591553854e-06, "loss": 0.0347, "step": 7054 }, { "epoch": 3.1334665778369977, "grad_norm": 0.4417156773267622, "learning_rate": 1.3608415228483291e-06, "loss": 0.0286, "step": 7055 }, { "epoch": 3.133910726182545, "grad_norm": 0.33340946983600855, "learning_rate": 1.3595124335686104e-06, "loss": 0.0239, "step": 7056 }, { "epoch": 3.1343548745280922, "grad_norm": 0.45018385116758614, "learning_rate": 1.3581838915160145e-06, "loss": 0.0307, "step": 7057 }, { "epoch": 3.13479902287364, "grad_norm": 0.49692015647969867, "learning_rate": 1.3568558968902445e-06, "loss": 0.0291, "step": 7058 }, { "epoch": 3.1352431712191873, "grad_norm": 0.43270232873861747, "learning_rate": 1.3555284498909183e-06, "loss": 0.031, "step": 7059 }, { "epoch": 3.1356873195647346, "grad_norm": 0.40820462560013654, "learning_rate": 1.3542015507175743e-06, "loss": 0.0235, "step": 7060 }, { "epoch": 3.136131467910282, "grad_norm": 0.4972923121321135, "learning_rate": 1.3528751995696688e-06, "loss": 0.051, "step": 7061 }, { "epoch": 3.1365756162558296, "grad_norm": 0.41578377070138484, "learning_rate": 1.3515493966465743e-06, "loss": 0.028, "step": 7062 }, { "epoch": 3.137019764601377, "grad_norm": 0.4491202264177913, "learning_rate": 1.350224142147582e-06, "loss": 0.0172, "step": 7063 }, { "epoch": 3.137463912946924, "grad_norm": 0.4586826960329923, "learning_rate": 1.3488994362719016e-06, "loss": 0.0244, "step": 7064 }, { "epoch": 3.137908061292472, "grad_norm": 0.3366156664630001, "learning_rate": 1.3475752792186559e-06, "loss": 0.0177, "step": 7065 }, { "epoch": 3.138352209638019, "grad_norm": 0.4038403979608836, "learning_rate": 1.3462516711868894e-06, "loss": 0.0283, "step": 7066 }, { "epoch": 3.1387963579835665, "grad_norm": 0.36057350319064324, "learning_rate": 1.3449286123755628e-06, "loss": 0.0282, "step": 7067 }, { "epoch": 3.1392405063291138, "grad_norm": 0.4323404311148623, "learning_rate": 1.343606102983555e-06, "loss": 0.0285, "step": 7068 }, { "epoch": 3.1396846546746615, "grad_norm": 0.42881049252651987, "learning_rate": 1.3422841432096623e-06, "loss": 0.0299, "step": 7069 }, { "epoch": 3.140128803020209, "grad_norm": 0.2902742598750208, "learning_rate": 1.3409627332525954e-06, "loss": 0.0181, "step": 7070 }, { "epoch": 3.140572951365756, "grad_norm": 0.4433992896224678, "learning_rate": 1.3396418733109856e-06, "loss": 0.0257, "step": 7071 }, { "epoch": 3.1410170997113034, "grad_norm": 0.6894028385625418, "learning_rate": 1.3383215635833829e-06, "loss": 0.0304, "step": 7072 }, { "epoch": 3.141461248056851, "grad_norm": 0.5245286511325312, "learning_rate": 1.337001804268247e-06, "loss": 0.0269, "step": 7073 }, { "epoch": 3.1419053964023984, "grad_norm": 0.5000099376315333, "learning_rate": 1.3356825955639645e-06, "loss": 0.0308, "step": 7074 }, { "epoch": 3.1423495447479457, "grad_norm": 0.6132659503324829, "learning_rate": 1.3343639376688355e-06, "loss": 0.032, "step": 7075 }, { "epoch": 3.1427936930934934, "grad_norm": 0.4546873288387578, "learning_rate": 1.3330458307810734e-06, "loss": 0.0246, "step": 7076 }, { "epoch": 3.1432378414390407, "grad_norm": 0.385185952241093, "learning_rate": 1.3317282750988137e-06, "loss": 0.0184, "step": 7077 }, { "epoch": 3.143681989784588, "grad_norm": 0.39438378600816204, "learning_rate": 1.3304112708201073e-06, "loss": 0.0279, "step": 7078 }, { "epoch": 3.1441261381301353, "grad_norm": 0.4806029181761233, "learning_rate": 1.329094818142922e-06, "loss": 0.0241, "step": 7079 }, { "epoch": 3.144570286475683, "grad_norm": 0.5228838911668552, "learning_rate": 1.327778917265144e-06, "loss": 0.0274, "step": 7080 }, { "epoch": 3.1450144348212303, "grad_norm": 0.46531223651019554, "learning_rate": 1.3264635683845755e-06, "loss": 0.0241, "step": 7081 }, { "epoch": 3.1454585831667776, "grad_norm": 0.32902026350154623, "learning_rate": 1.3251487716989341e-06, "loss": 0.0169, "step": 7082 }, { "epoch": 3.145902731512325, "grad_norm": 0.3895325085859838, "learning_rate": 1.3238345274058572e-06, "loss": 0.0219, "step": 7083 }, { "epoch": 3.1463468798578726, "grad_norm": 0.4481405965406686, "learning_rate": 1.322520835702898e-06, "loss": 0.025, "step": 7084 }, { "epoch": 3.14679102820342, "grad_norm": 0.5128203909715668, "learning_rate": 1.3212076967875265e-06, "loss": 0.0348, "step": 7085 }, { "epoch": 3.1472351765489672, "grad_norm": 0.46447445611495575, "learning_rate": 1.3198951108571312e-06, "loss": 0.0313, "step": 7086 }, { "epoch": 3.147679324894515, "grad_norm": 0.29983798874260054, "learning_rate": 1.3185830781090136e-06, "loss": 0.0142, "step": 7087 }, { "epoch": 3.1481234732400623, "grad_norm": 0.39996284685169026, "learning_rate": 1.3172715987403955e-06, "loss": 0.0279, "step": 7088 }, { "epoch": 3.1485676215856095, "grad_norm": 0.44271681407903213, "learning_rate": 1.3159606729484165e-06, "loss": 0.0309, "step": 7089 }, { "epoch": 3.149011769931157, "grad_norm": 0.39836027232808446, "learning_rate": 1.3146503009301258e-06, "loss": 0.0258, "step": 7090 }, { "epoch": 3.1494559182767046, "grad_norm": 0.38922372733595656, "learning_rate": 1.3133404828824998e-06, "loss": 0.0249, "step": 7091 }, { "epoch": 3.149900066622252, "grad_norm": 0.4629939120495913, "learning_rate": 1.3120312190024265e-06, "loss": 0.0177, "step": 7092 }, { "epoch": 3.150344214967799, "grad_norm": 0.5175669897186087, "learning_rate": 1.3107225094867066e-06, "loss": 0.0352, "step": 7093 }, { "epoch": 3.150788363313347, "grad_norm": 0.4487683854548024, "learning_rate": 1.3094143545320636e-06, "loss": 0.0309, "step": 7094 }, { "epoch": 3.151232511658894, "grad_norm": 0.4853986633083335, "learning_rate": 1.3081067543351351e-06, "loss": 0.0351, "step": 7095 }, { "epoch": 3.1516766600044415, "grad_norm": 0.4346141380035006, "learning_rate": 1.3067997090924755e-06, "loss": 0.0275, "step": 7096 }, { "epoch": 3.1521208083499888, "grad_norm": 0.7565904341486527, "learning_rate": 1.305493219000558e-06, "loss": 0.0331, "step": 7097 }, { "epoch": 3.1525649566955365, "grad_norm": 0.4041064888863333, "learning_rate": 1.3041872842557669e-06, "loss": 0.0303, "step": 7098 }, { "epoch": 3.153009105041084, "grad_norm": 0.39758421079788286, "learning_rate": 1.3028819050544078e-06, "loss": 0.02, "step": 7099 }, { "epoch": 3.153453253386631, "grad_norm": 0.43003387120599507, "learning_rate": 1.3015770815927009e-06, "loss": 0.034, "step": 7100 }, { "epoch": 3.1538974017321784, "grad_norm": 0.4278201479573488, "learning_rate": 1.3002728140667847e-06, "loss": 0.0233, "step": 7101 }, { "epoch": 3.154341550077726, "grad_norm": 0.37161324406542584, "learning_rate": 1.2989691026727114e-06, "loss": 0.0197, "step": 7102 }, { "epoch": 3.1547856984232734, "grad_norm": 0.49261598275828133, "learning_rate": 1.2976659476064528e-06, "loss": 0.0332, "step": 7103 }, { "epoch": 3.1552298467688207, "grad_norm": 0.41968097121084996, "learning_rate": 1.2963633490638927e-06, "loss": 0.0266, "step": 7104 }, { "epoch": 3.155673995114368, "grad_norm": 0.37929111645310976, "learning_rate": 1.2950613072408352e-06, "loss": 0.0206, "step": 7105 }, { "epoch": 3.1561181434599157, "grad_norm": 0.4490037141777807, "learning_rate": 1.2937598223330006e-06, "loss": 0.0335, "step": 7106 }, { "epoch": 3.156562291805463, "grad_norm": 0.6757819685260033, "learning_rate": 1.2924588945360195e-06, "loss": 0.0387, "step": 7107 }, { "epoch": 3.1570064401510103, "grad_norm": 0.38030068064549954, "learning_rate": 1.2911585240454483e-06, "loss": 0.0225, "step": 7108 }, { "epoch": 3.157450588496558, "grad_norm": 0.3303111905779037, "learning_rate": 1.2898587110567546e-06, "loss": 0.0158, "step": 7109 }, { "epoch": 3.1578947368421053, "grad_norm": 0.36066815697274596, "learning_rate": 1.2885594557653197e-06, "loss": 0.0271, "step": 7110 }, { "epoch": 3.1583388851876526, "grad_norm": 0.5116023254049137, "learning_rate": 1.2872607583664443e-06, "loss": 0.0332, "step": 7111 }, { "epoch": 3.1587830335332, "grad_norm": 0.4098679802743982, "learning_rate": 1.2859626190553459e-06, "loss": 0.0206, "step": 7112 }, { "epoch": 3.1592271818787476, "grad_norm": 0.3664260571550095, "learning_rate": 1.2846650380271563e-06, "loss": 0.0215, "step": 7113 }, { "epoch": 3.159671330224295, "grad_norm": 0.4801826025425368, "learning_rate": 1.283368015476925e-06, "loss": 0.0207, "step": 7114 }, { "epoch": 3.160115478569842, "grad_norm": 0.43450502438961686, "learning_rate": 1.2820715515996146e-06, "loss": 0.02, "step": 7115 }, { "epoch": 3.16055962691539, "grad_norm": 0.4768531731139893, "learning_rate": 1.280775646590106e-06, "loss": 0.0321, "step": 7116 }, { "epoch": 3.1610037752609372, "grad_norm": 0.3425693376069199, "learning_rate": 1.2794803006431984e-06, "loss": 0.0188, "step": 7117 }, { "epoch": 3.1614479236064845, "grad_norm": 0.5214149879054594, "learning_rate": 1.2781855139535988e-06, "loss": 0.0262, "step": 7118 }, { "epoch": 3.161892071952032, "grad_norm": 0.40187878997324955, "learning_rate": 1.2768912867159406e-06, "loss": 0.0361, "step": 7119 }, { "epoch": 3.1623362202975795, "grad_norm": 0.5083763707824389, "learning_rate": 1.2755976191247682e-06, "loss": 0.021, "step": 7120 }, { "epoch": 3.162780368643127, "grad_norm": 0.4055192408506625, "learning_rate": 1.2743045113745385e-06, "loss": 0.0227, "step": 7121 }, { "epoch": 3.163224516988674, "grad_norm": 0.4094976103066204, "learning_rate": 1.2730119636596288e-06, "loss": 0.0281, "step": 7122 }, { "epoch": 3.163668665334222, "grad_norm": 0.45282087571650875, "learning_rate": 1.2717199761743336e-06, "loss": 0.0251, "step": 7123 }, { "epoch": 3.164112813679769, "grad_norm": 0.4299624187324527, "learning_rate": 1.2704285491128553e-06, "loss": 0.0283, "step": 7124 }, { "epoch": 3.1645569620253164, "grad_norm": 0.4009907594057322, "learning_rate": 1.2691376826693235e-06, "loss": 0.0281, "step": 7125 }, { "epoch": 3.1650011103708637, "grad_norm": 0.36139049623487757, "learning_rate": 1.2678473770377726e-06, "loss": 0.0227, "step": 7126 }, { "epoch": 3.1654452587164115, "grad_norm": 0.3911090099172185, "learning_rate": 1.2665576324121587e-06, "loss": 0.019, "step": 7127 }, { "epoch": 3.1658894070619588, "grad_norm": 0.4537050187193512, "learning_rate": 1.2652684489863532e-06, "loss": 0.0171, "step": 7128 }, { "epoch": 3.166333555407506, "grad_norm": 0.4737431638279446, "learning_rate": 1.2639798269541432e-06, "loss": 0.0262, "step": 7129 }, { "epoch": 3.1667777037530533, "grad_norm": 0.5356720104817232, "learning_rate": 1.2626917665092265e-06, "loss": 0.0367, "step": 7130 }, { "epoch": 3.167221852098601, "grad_norm": 0.41884008779805304, "learning_rate": 1.2614042678452254e-06, "loss": 0.0277, "step": 7131 }, { "epoch": 3.1676660004441484, "grad_norm": 0.3813088744442865, "learning_rate": 1.260117331155669e-06, "loss": 0.0241, "step": 7132 }, { "epoch": 3.1681101487896957, "grad_norm": 0.4824659904614872, "learning_rate": 1.258830956634008e-06, "loss": 0.0266, "step": 7133 }, { "epoch": 3.168554297135243, "grad_norm": 0.3933046865818541, "learning_rate": 1.2575451444736065e-06, "loss": 0.0241, "step": 7134 }, { "epoch": 3.1689984454807907, "grad_norm": 0.4675792699661695, "learning_rate": 1.25625989486774e-06, "loss": 0.0231, "step": 7135 }, { "epoch": 3.169442593826338, "grad_norm": 0.47235339239033924, "learning_rate": 1.2549752080096078e-06, "loss": 0.0228, "step": 7136 }, { "epoch": 3.1698867421718853, "grad_norm": 0.5375969536724705, "learning_rate": 1.2536910840923205e-06, "loss": 0.0221, "step": 7137 }, { "epoch": 3.170330890517433, "grad_norm": 0.4497621301881044, "learning_rate": 1.2524075233089e-06, "loss": 0.0343, "step": 7138 }, { "epoch": 3.1707750388629803, "grad_norm": 0.292034357924613, "learning_rate": 1.251124525852289e-06, "loss": 0.0126, "step": 7139 }, { "epoch": 3.1712191872085276, "grad_norm": 0.35964553866038385, "learning_rate": 1.2498420919153464e-06, "loss": 0.0275, "step": 7140 }, { "epoch": 3.171663335554075, "grad_norm": 0.35000228782660847, "learning_rate": 1.2485602216908378e-06, "loss": 0.0175, "step": 7141 }, { "epoch": 3.1721074838996226, "grad_norm": 0.3682428993293255, "learning_rate": 1.2472789153714572e-06, "loss": 0.0239, "step": 7142 }, { "epoch": 3.17255163224517, "grad_norm": 0.5552689170116238, "learning_rate": 1.245998173149801e-06, "loss": 0.0296, "step": 7143 }, { "epoch": 3.172995780590717, "grad_norm": 0.4582090480824963, "learning_rate": 1.244717995218389e-06, "loss": 0.0331, "step": 7144 }, { "epoch": 3.173439928936265, "grad_norm": 0.5212899725680629, "learning_rate": 1.2434383817696548e-06, "loss": 0.0258, "step": 7145 }, { "epoch": 3.173884077281812, "grad_norm": 0.381695700080771, "learning_rate": 1.2421593329959437e-06, "loss": 0.0243, "step": 7146 }, { "epoch": 3.1743282256273595, "grad_norm": 0.3974320066485413, "learning_rate": 1.2408808490895176e-06, "loss": 0.0296, "step": 7147 }, { "epoch": 3.174772373972907, "grad_norm": 0.37908843626651684, "learning_rate": 1.2396029302425589e-06, "loss": 0.0233, "step": 7148 }, { "epoch": 3.1752165223184545, "grad_norm": 0.3916584808786519, "learning_rate": 1.2383255766471564e-06, "loss": 0.0231, "step": 7149 }, { "epoch": 3.175660670664002, "grad_norm": 0.4921323330468778, "learning_rate": 1.2370487884953198e-06, "loss": 0.0282, "step": 7150 }, { "epoch": 3.176104819009549, "grad_norm": 0.4474378513318365, "learning_rate": 1.2357725659789727e-06, "loss": 0.0316, "step": 7151 }, { "epoch": 3.1765489673550964, "grad_norm": 0.507768145335404, "learning_rate": 1.234496909289949e-06, "loss": 0.0314, "step": 7152 }, { "epoch": 3.176993115700644, "grad_norm": 0.43929762431880826, "learning_rate": 1.2332218186200062e-06, "loss": 0.036, "step": 7153 }, { "epoch": 3.1774372640461914, "grad_norm": 0.45050762765015684, "learning_rate": 1.2319472941608118e-06, "loss": 0.0243, "step": 7154 }, { "epoch": 3.1778814123917387, "grad_norm": 0.35294587274588873, "learning_rate": 1.2306733361039457e-06, "loss": 0.0214, "step": 7155 }, { "epoch": 3.1783255607372864, "grad_norm": 0.41208302752606807, "learning_rate": 1.2293999446409067e-06, "loss": 0.0261, "step": 7156 }, { "epoch": 3.1787697090828337, "grad_norm": 0.5041315813475921, "learning_rate": 1.228127119963109e-06, "loss": 0.0285, "step": 7157 }, { "epoch": 3.179213857428381, "grad_norm": 0.44300292068122016, "learning_rate": 1.2268548622618753e-06, "loss": 0.0297, "step": 7158 }, { "epoch": 3.1796580057739283, "grad_norm": 0.45740073144316085, "learning_rate": 1.2255831717284528e-06, "loss": 0.0279, "step": 7159 }, { "epoch": 3.180102154119476, "grad_norm": 0.3604803514556297, "learning_rate": 1.2243120485539944e-06, "loss": 0.0223, "step": 7160 }, { "epoch": 3.1805463024650233, "grad_norm": 0.3321847278445319, "learning_rate": 1.223041492929573e-06, "loss": 0.0165, "step": 7161 }, { "epoch": 3.1809904508105706, "grad_norm": 0.35593201085972326, "learning_rate": 1.221771505046176e-06, "loss": 0.0214, "step": 7162 }, { "epoch": 3.181434599156118, "grad_norm": 0.48430071345607734, "learning_rate": 1.2205020850947009e-06, "loss": 0.0324, "step": 7163 }, { "epoch": 3.1818787475016657, "grad_norm": 0.3786369491408754, "learning_rate": 1.219233233265964e-06, "loss": 0.0196, "step": 7164 }, { "epoch": 3.182322895847213, "grad_norm": 0.422396003313245, "learning_rate": 1.2179649497506984e-06, "loss": 0.0235, "step": 7165 }, { "epoch": 3.1827670441927602, "grad_norm": 0.5088819210251279, "learning_rate": 1.216697234739545e-06, "loss": 0.0245, "step": 7166 }, { "epoch": 3.183211192538308, "grad_norm": 0.44593024209496107, "learning_rate": 1.2154300884230647e-06, "loss": 0.0248, "step": 7167 }, { "epoch": 3.1836553408838553, "grad_norm": 0.44151282819218973, "learning_rate": 1.2141635109917322e-06, "loss": 0.0243, "step": 7168 }, { "epoch": 3.1840994892294026, "grad_norm": 0.5179310919609202, "learning_rate": 1.2128975026359308e-06, "loss": 0.0303, "step": 7169 }, { "epoch": 3.18454363757495, "grad_norm": 0.47378558079444577, "learning_rate": 1.2116320635459694e-06, "loss": 0.0308, "step": 7170 }, { "epoch": 3.1849877859204976, "grad_norm": 0.3483756788167636, "learning_rate": 1.2103671939120603e-06, "loss": 0.0205, "step": 7171 }, { "epoch": 3.185431934266045, "grad_norm": 0.43627278086001237, "learning_rate": 1.2091028939243372e-06, "loss": 0.0348, "step": 7172 }, { "epoch": 3.185876082611592, "grad_norm": 0.49521087150532456, "learning_rate": 1.207839163772845e-06, "loss": 0.0312, "step": 7173 }, { "epoch": 3.18632023095714, "grad_norm": 0.4856268760629859, "learning_rate": 1.206576003647545e-06, "loss": 0.02, "step": 7174 }, { "epoch": 3.186764379302687, "grad_norm": 0.39637861480749204, "learning_rate": 1.2053134137383082e-06, "loss": 0.0255, "step": 7175 }, { "epoch": 3.1872085276482345, "grad_norm": 0.43819489132093886, "learning_rate": 1.2040513942349285e-06, "loss": 0.0197, "step": 7176 }, { "epoch": 3.1876526759937818, "grad_norm": 0.4216663259343465, "learning_rate": 1.2027899453271046e-06, "loss": 0.023, "step": 7177 }, { "epoch": 3.1880968243393295, "grad_norm": 0.36509628207338957, "learning_rate": 1.2015290672044555e-06, "loss": 0.0213, "step": 7178 }, { "epoch": 3.188540972684877, "grad_norm": 0.5214855632257979, "learning_rate": 1.2002687600565138e-06, "loss": 0.0326, "step": 7179 }, { "epoch": 3.188985121030424, "grad_norm": 0.4141851354865393, "learning_rate": 1.199009024072722e-06, "loss": 0.0224, "step": 7180 }, { "epoch": 3.1894292693759714, "grad_norm": 0.33059616966272004, "learning_rate": 1.1977498594424404e-06, "loss": 0.0197, "step": 7181 }, { "epoch": 3.189873417721519, "grad_norm": 0.5492371676948207, "learning_rate": 1.196491266354946e-06, "loss": 0.0251, "step": 7182 }, { "epoch": 3.1903175660670664, "grad_norm": 0.4321665285793292, "learning_rate": 1.1952332449994236e-06, "loss": 0.0254, "step": 7183 }, { "epoch": 3.1907617144126137, "grad_norm": 0.5421889351399802, "learning_rate": 1.1939757955649762e-06, "loss": 0.0205, "step": 7184 }, { "epoch": 3.1912058627581614, "grad_norm": 0.3637124855912809, "learning_rate": 1.1927189182406207e-06, "loss": 0.028, "step": 7185 }, { "epoch": 3.1916500111037087, "grad_norm": 0.4272124427935157, "learning_rate": 1.191462613215284e-06, "loss": 0.0245, "step": 7186 }, { "epoch": 3.192094159449256, "grad_norm": 0.5695918619211989, "learning_rate": 1.190206880677815e-06, "loss": 0.0275, "step": 7187 }, { "epoch": 3.1925383077948033, "grad_norm": 0.47921802084665843, "learning_rate": 1.188951720816967e-06, "loss": 0.0327, "step": 7188 }, { "epoch": 3.192982456140351, "grad_norm": 0.47676512936038173, "learning_rate": 1.1876971338214144e-06, "loss": 0.0237, "step": 7189 }, { "epoch": 3.1934266044858983, "grad_norm": 0.32414238199829704, "learning_rate": 1.1864431198797433e-06, "loss": 0.0223, "step": 7190 }, { "epoch": 3.1938707528314456, "grad_norm": 0.37243806607221175, "learning_rate": 1.1851896791804507e-06, "loss": 0.0226, "step": 7191 }, { "epoch": 3.194314901176993, "grad_norm": 0.45790234199301505, "learning_rate": 1.1839368119119504e-06, "loss": 0.0366, "step": 7192 }, { "epoch": 3.1947590495225406, "grad_norm": 0.3980186449229834, "learning_rate": 1.182684518262574e-06, "loss": 0.0285, "step": 7193 }, { "epoch": 3.195203197868088, "grad_norm": 0.37521542650727085, "learning_rate": 1.1814327984205576e-06, "loss": 0.0262, "step": 7194 }, { "epoch": 3.195647346213635, "grad_norm": 0.5161650488172221, "learning_rate": 1.1801816525740578e-06, "loss": 0.0314, "step": 7195 }, { "epoch": 3.196091494559183, "grad_norm": 0.4308857816197324, "learning_rate": 1.1789310809111444e-06, "loss": 0.0214, "step": 7196 }, { "epoch": 3.1965356429047302, "grad_norm": 0.34195463841205975, "learning_rate": 1.1776810836197965e-06, "loss": 0.0148, "step": 7197 }, { "epoch": 3.1969797912502775, "grad_norm": 0.40193187911181305, "learning_rate": 1.1764316608879122e-06, "loss": 0.0246, "step": 7198 }, { "epoch": 3.197423939595825, "grad_norm": 0.45090018760439604, "learning_rate": 1.1751828129033e-06, "loss": 0.0332, "step": 7199 }, { "epoch": 3.1978680879413726, "grad_norm": 0.45570245011519855, "learning_rate": 1.1739345398536834e-06, "loss": 0.0287, "step": 7200 }, { "epoch": 3.19831223628692, "grad_norm": 0.3951478023748843, "learning_rate": 1.1726868419266985e-06, "loss": 0.022, "step": 7201 }, { "epoch": 3.198756384632467, "grad_norm": 0.45157628259237137, "learning_rate": 1.1714397193098975e-06, "loss": 0.0254, "step": 7202 }, { "epoch": 3.199200532978015, "grad_norm": 0.577449890397031, "learning_rate": 1.1701931721907417e-06, "loss": 0.0475, "step": 7203 }, { "epoch": 3.199644681323562, "grad_norm": 0.3377142483011571, "learning_rate": 1.1689472007566082e-06, "loss": 0.0233, "step": 7204 }, { "epoch": 3.2000888296691095, "grad_norm": 0.5632120326062124, "learning_rate": 1.1677018051947898e-06, "loss": 0.0306, "step": 7205 }, { "epoch": 3.2005329780146567, "grad_norm": 0.3154600517639766, "learning_rate": 1.1664569856924885e-06, "loss": 0.0185, "step": 7206 }, { "epoch": 3.2009771263602045, "grad_norm": 0.3451810701202185, "learning_rate": 1.1652127424368248e-06, "loss": 0.0188, "step": 7207 }, { "epoch": 3.2014212747057518, "grad_norm": 0.4315345094850551, "learning_rate": 1.1639690756148258e-06, "loss": 0.0247, "step": 7208 }, { "epoch": 3.201865423051299, "grad_norm": 0.46106139760465975, "learning_rate": 1.162725985413436e-06, "loss": 0.0373, "step": 7209 }, { "epoch": 3.2023095713968464, "grad_norm": 0.35725860550074096, "learning_rate": 1.1614834720195173e-06, "loss": 0.0225, "step": 7210 }, { "epoch": 3.202753719742394, "grad_norm": 0.4457350062298839, "learning_rate": 1.1602415356198366e-06, "loss": 0.0257, "step": 7211 }, { "epoch": 3.2031978680879414, "grad_norm": 0.414436238208724, "learning_rate": 1.1590001764010795e-06, "loss": 0.0233, "step": 7212 }, { "epoch": 3.2036420164334887, "grad_norm": 0.5552444496185882, "learning_rate": 1.1577593945498439e-06, "loss": 0.0267, "step": 7213 }, { "epoch": 3.2040861647790364, "grad_norm": 0.9572529892460224, "learning_rate": 1.156519190252638e-06, "loss": 0.0208, "step": 7214 }, { "epoch": 3.2045303131245837, "grad_norm": 0.4000797957812149, "learning_rate": 1.1552795636958874e-06, "loss": 0.021, "step": 7215 }, { "epoch": 3.204974461470131, "grad_norm": 0.44923949008034364, "learning_rate": 1.154040515065929e-06, "loss": 0.0257, "step": 7216 }, { "epoch": 3.2054186098156783, "grad_norm": 0.3620392807393453, "learning_rate": 1.1528020445490122e-06, "loss": 0.0168, "step": 7217 }, { "epoch": 3.205862758161226, "grad_norm": 0.40007534369795833, "learning_rate": 1.1515641523313026e-06, "loss": 0.019, "step": 7218 }, { "epoch": 3.2063069065067733, "grad_norm": 0.48298838416688933, "learning_rate": 1.1503268385988726e-06, "loss": 0.0289, "step": 7219 }, { "epoch": 3.2067510548523206, "grad_norm": 0.42002715971416216, "learning_rate": 1.1490901035377127e-06, "loss": 0.0218, "step": 7220 }, { "epoch": 3.207195203197868, "grad_norm": 0.3286114150705751, "learning_rate": 1.147853947333727e-06, "loss": 0.0197, "step": 7221 }, { "epoch": 3.2076393515434156, "grad_norm": 0.4609222221897965, "learning_rate": 1.1466183701727285e-06, "loss": 0.0324, "step": 7222 }, { "epoch": 3.208083499888963, "grad_norm": 0.685447809015932, "learning_rate": 1.1453833722404467e-06, "loss": 0.0324, "step": 7223 }, { "epoch": 3.20852764823451, "grad_norm": 0.3436945045836064, "learning_rate": 1.1441489537225242e-06, "loss": 0.0157, "step": 7224 }, { "epoch": 3.208971796580058, "grad_norm": 0.5032138454546828, "learning_rate": 1.142915114804512e-06, "loss": 0.0305, "step": 7225 }, { "epoch": 3.2094159449256052, "grad_norm": 0.4731526252440794, "learning_rate": 1.1416818556718766e-06, "loss": 0.0213, "step": 7226 }, { "epoch": 3.2098600932711525, "grad_norm": 0.4103989387772799, "learning_rate": 1.1404491765100028e-06, "loss": 0.0233, "step": 7227 }, { "epoch": 3.2103042416167, "grad_norm": 0.4494816164510247, "learning_rate": 1.1392170775041788e-06, "loss": 0.0297, "step": 7228 }, { "epoch": 3.2107483899622475, "grad_norm": 0.49371762049111106, "learning_rate": 1.1379855588396111e-06, "loss": 0.0352, "step": 7229 }, { "epoch": 3.211192538307795, "grad_norm": 0.5061710158808476, "learning_rate": 1.1367546207014197e-06, "loss": 0.029, "step": 7230 }, { "epoch": 3.211636686653342, "grad_norm": 0.8145152873713564, "learning_rate": 1.1355242632746322e-06, "loss": 0.0262, "step": 7231 }, { "epoch": 3.21208083499889, "grad_norm": 0.3714708449077717, "learning_rate": 1.134294486744194e-06, "loss": 0.0195, "step": 7232 }, { "epoch": 3.212524983344437, "grad_norm": 0.3746783694058875, "learning_rate": 1.1330652912949614e-06, "loss": 0.0315, "step": 7233 }, { "epoch": 3.2129691316899844, "grad_norm": 0.3552761658812224, "learning_rate": 1.131836677111703e-06, "loss": 0.0195, "step": 7234 }, { "epoch": 3.2134132800355317, "grad_norm": 0.425007652814607, "learning_rate": 1.130608644379102e-06, "loss": 0.025, "step": 7235 }, { "epoch": 3.2138574283810795, "grad_norm": 0.43005151453900625, "learning_rate": 1.12938119328175e-06, "loss": 0.0209, "step": 7236 }, { "epoch": 3.2143015767266268, "grad_norm": 0.4595931551885754, "learning_rate": 1.1281543240041553e-06, "loss": 0.0299, "step": 7237 }, { "epoch": 3.214745725072174, "grad_norm": 0.46371024210057155, "learning_rate": 1.1269280367307366e-06, "loss": 0.0323, "step": 7238 }, { "epoch": 3.2151898734177213, "grad_norm": 0.6313936018053397, "learning_rate": 1.125702331645826e-06, "loss": 0.045, "step": 7239 }, { "epoch": 3.215634021763269, "grad_norm": 0.4072252776090633, "learning_rate": 1.1244772089336676e-06, "loss": 0.0201, "step": 7240 }, { "epoch": 3.2160781701088164, "grad_norm": 0.4916818117136901, "learning_rate": 1.1232526687784196e-06, "loss": 0.0321, "step": 7241 }, { "epoch": 3.2165223184543636, "grad_norm": 0.4020576743291894, "learning_rate": 1.1220287113641487e-06, "loss": 0.0235, "step": 7242 }, { "epoch": 3.216966466799911, "grad_norm": 0.3816616945761622, "learning_rate": 1.1208053368748379e-06, "loss": 0.0195, "step": 7243 }, { "epoch": 3.2174106151454587, "grad_norm": 0.7096021175300753, "learning_rate": 1.1195825454943805e-06, "loss": 0.0277, "step": 7244 }, { "epoch": 3.217854763491006, "grad_norm": 0.4537143505718799, "learning_rate": 1.1183603374065832e-06, "loss": 0.0317, "step": 7245 }, { "epoch": 3.2182989118365533, "grad_norm": 0.3124829853038914, "learning_rate": 1.1171387127951667e-06, "loss": 0.0134, "step": 7246 }, { "epoch": 3.218743060182101, "grad_norm": 0.3523041765470259, "learning_rate": 1.1159176718437581e-06, "loss": 0.0197, "step": 7247 }, { "epoch": 3.2191872085276483, "grad_norm": 0.46781406748141774, "learning_rate": 1.114697214735903e-06, "loss": 0.03, "step": 7248 }, { "epoch": 3.2196313568731956, "grad_norm": 0.45497498562705085, "learning_rate": 1.113477341655056e-06, "loss": 0.0326, "step": 7249 }, { "epoch": 3.220075505218743, "grad_norm": 0.34355725383797536, "learning_rate": 1.1122580527845844e-06, "loss": 0.0199, "step": 7250 }, { "epoch": 3.2205196535642906, "grad_norm": 0.4222581331896639, "learning_rate": 1.1110393483077697e-06, "loss": 0.0248, "step": 7251 }, { "epoch": 3.220963801909838, "grad_norm": 0.3466800241459592, "learning_rate": 1.1098212284078037e-06, "loss": 0.0262, "step": 7252 }, { "epoch": 3.221407950255385, "grad_norm": 0.4157855377746249, "learning_rate": 1.108603693267788e-06, "loss": 0.0256, "step": 7253 }, { "epoch": 3.221852098600933, "grad_norm": 0.383389548238552, "learning_rate": 1.1073867430707409e-06, "loss": 0.024, "step": 7254 }, { "epoch": 3.22229624694648, "grad_norm": 0.30999218839566933, "learning_rate": 1.1061703779995903e-06, "loss": 0.0132, "step": 7255 }, { "epoch": 3.2227403952920275, "grad_norm": 0.4028830849965787, "learning_rate": 1.1049545982371763e-06, "loss": 0.0237, "step": 7256 }, { "epoch": 3.223184543637575, "grad_norm": 0.399211499707528, "learning_rate": 1.1037394039662514e-06, "loss": 0.025, "step": 7257 }, { "epoch": 3.2236286919831225, "grad_norm": 0.4718629067386526, "learning_rate": 1.1025247953694812e-06, "loss": 0.022, "step": 7258 }, { "epoch": 3.22407284032867, "grad_norm": 0.37912161530644367, "learning_rate": 1.1013107726294398e-06, "loss": 0.0218, "step": 7259 }, { "epoch": 3.224516988674217, "grad_norm": 0.34533958399869225, "learning_rate": 1.100097335928616e-06, "loss": 0.0165, "step": 7260 }, { "epoch": 3.224961137019765, "grad_norm": 0.5551950864778356, "learning_rate": 1.0988844854494108e-06, "loss": 0.0351, "step": 7261 }, { "epoch": 3.225405285365312, "grad_norm": 0.5024454449508828, "learning_rate": 1.0976722213741353e-06, "loss": 0.0264, "step": 7262 }, { "epoch": 3.2258494337108594, "grad_norm": 0.44064209832820683, "learning_rate": 1.0964605438850157e-06, "loss": 0.0277, "step": 7263 }, { "epoch": 3.2262935820564067, "grad_norm": 0.3886834550548472, "learning_rate": 1.0952494531641845e-06, "loss": 0.0254, "step": 7264 }, { "epoch": 3.2267377304019544, "grad_norm": 0.44369145821666817, "learning_rate": 1.0940389493936903e-06, "loss": 0.0305, "step": 7265 }, { "epoch": 3.2271818787475017, "grad_norm": 0.5083410379311883, "learning_rate": 1.092829032755493e-06, "loss": 0.0234, "step": 7266 }, { "epoch": 3.227626027093049, "grad_norm": 0.5641232827648482, "learning_rate": 1.091619703431463e-06, "loss": 0.033, "step": 7267 }, { "epoch": 3.2280701754385963, "grad_norm": 0.49450571188792775, "learning_rate": 1.0904109616033837e-06, "loss": 0.0259, "step": 7268 }, { "epoch": 3.228514323784144, "grad_norm": 0.40475539158874035, "learning_rate": 1.0892028074529504e-06, "loss": 0.0304, "step": 7269 }, { "epoch": 3.2289584721296913, "grad_norm": 0.3822868070952512, "learning_rate": 1.0879952411617668e-06, "loss": 0.0258, "step": 7270 }, { "epoch": 3.2294026204752386, "grad_norm": 0.32534348009223135, "learning_rate": 1.0867882629113512e-06, "loss": 0.0165, "step": 7271 }, { "epoch": 3.229846768820786, "grad_norm": 0.44997009638741614, "learning_rate": 1.085581872883134e-06, "loss": 0.0288, "step": 7272 }, { "epoch": 3.2302909171663337, "grad_norm": 0.3775740023239124, "learning_rate": 1.0843760712584557e-06, "loss": 0.0191, "step": 7273 }, { "epoch": 3.230735065511881, "grad_norm": 0.36873567439006616, "learning_rate": 1.0831708582185684e-06, "loss": 0.0215, "step": 7274 }, { "epoch": 3.2311792138574282, "grad_norm": 0.41830159166514674, "learning_rate": 1.081966233944638e-06, "loss": 0.0349, "step": 7275 }, { "epoch": 3.231623362202976, "grad_norm": 0.3474261761721994, "learning_rate": 1.0807621986177369e-06, "loss": 0.0197, "step": 7276 }, { "epoch": 3.2320675105485233, "grad_norm": 0.4726466217876324, "learning_rate": 1.0795587524188532e-06, "loss": 0.0257, "step": 7277 }, { "epoch": 3.2325116588940705, "grad_norm": 0.5853092492971105, "learning_rate": 1.0783558955288864e-06, "loss": 0.033, "step": 7278 }, { "epoch": 3.232955807239618, "grad_norm": 0.4076799037255477, "learning_rate": 1.0771536281286454e-06, "loss": 0.0251, "step": 7279 }, { "epoch": 3.2333999555851656, "grad_norm": 0.3975030901670652, "learning_rate": 1.0759519503988525e-06, "loss": 0.0223, "step": 7280 }, { "epoch": 3.233844103930713, "grad_norm": 0.40051231781743457, "learning_rate": 1.0747508625201387e-06, "loss": 0.0247, "step": 7281 }, { "epoch": 3.23428825227626, "grad_norm": 0.44390849962202206, "learning_rate": 1.0735503646730483e-06, "loss": 0.0281, "step": 7282 }, { "epoch": 3.234732400621808, "grad_norm": 0.4052200206226338, "learning_rate": 1.0723504570380367e-06, "loss": 0.0286, "step": 7283 }, { "epoch": 3.235176548967355, "grad_norm": 0.37902885647278484, "learning_rate": 1.0711511397954706e-06, "loss": 0.0205, "step": 7284 }, { "epoch": 3.2356206973129025, "grad_norm": 0.3395460769637531, "learning_rate": 1.0699524131256273e-06, "loss": 0.0201, "step": 7285 }, { "epoch": 3.2360648456584498, "grad_norm": 0.3790836643740825, "learning_rate": 1.0687542772086978e-06, "loss": 0.0213, "step": 7286 }, { "epoch": 3.2365089940039975, "grad_norm": 0.3344270224539923, "learning_rate": 1.0675567322247794e-06, "loss": 0.0188, "step": 7287 }, { "epoch": 3.236953142349545, "grad_norm": 0.49193821672434174, "learning_rate": 1.0663597783538843e-06, "loss": 0.0258, "step": 7288 }, { "epoch": 3.237397290695092, "grad_norm": 0.40685832395170973, "learning_rate": 1.0651634157759361e-06, "loss": 0.0266, "step": 7289 }, { "epoch": 3.2378414390406394, "grad_norm": 0.4090407200808684, "learning_rate": 1.063967644670767e-06, "loss": 0.0264, "step": 7290 }, { "epoch": 3.238285587386187, "grad_norm": 0.3883951968082906, "learning_rate": 1.0627724652181237e-06, "loss": 0.033, "step": 7291 }, { "epoch": 3.2387297357317344, "grad_norm": 0.43888578494036506, "learning_rate": 1.06157787759766e-06, "loss": 0.0291, "step": 7292 }, { "epoch": 3.2391738840772817, "grad_norm": 0.5336092270346524, "learning_rate": 1.0603838819889429e-06, "loss": 0.031, "step": 7293 }, { "epoch": 3.2396180324228294, "grad_norm": 0.4556708038719475, "learning_rate": 1.0591904785714507e-06, "loss": 0.0189, "step": 7294 }, { "epoch": 3.2400621807683767, "grad_norm": 0.42245566909222265, "learning_rate": 1.0579976675245724e-06, "loss": 0.0241, "step": 7295 }, { "epoch": 3.240506329113924, "grad_norm": 0.4528542240509129, "learning_rate": 1.0568054490276075e-06, "loss": 0.0251, "step": 7296 }, { "epoch": 3.2409504774594713, "grad_norm": 0.4541061461339724, "learning_rate": 1.0556138232597684e-06, "loss": 0.0201, "step": 7297 }, { "epoch": 3.241394625805019, "grad_norm": 0.3790673698790023, "learning_rate": 1.054422790400173e-06, "loss": 0.019, "step": 7298 }, { "epoch": 3.2418387741505663, "grad_norm": 0.41427572592093276, "learning_rate": 1.0532323506278564e-06, "loss": 0.0302, "step": 7299 }, { "epoch": 3.2422829224961136, "grad_norm": 0.4164805864455333, "learning_rate": 1.0520425041217613e-06, "loss": 0.0374, "step": 7300 }, { "epoch": 3.242727070841661, "grad_norm": 0.5061586285788595, "learning_rate": 1.0508532510607421e-06, "loss": 0.0369, "step": 7301 }, { "epoch": 3.2431712191872086, "grad_norm": 0.51039712429684, "learning_rate": 1.049664591623563e-06, "loss": 0.0256, "step": 7302 }, { "epoch": 3.243615367532756, "grad_norm": 0.3147661792422783, "learning_rate": 1.0484765259889024e-06, "loss": 0.0178, "step": 7303 }, { "epoch": 3.244059515878303, "grad_norm": 0.34967275161446504, "learning_rate": 1.0472890543353425e-06, "loss": 0.022, "step": 7304 }, { "epoch": 3.244503664223851, "grad_norm": 0.425569161792133, "learning_rate": 1.0461021768413827e-06, "loss": 0.0327, "step": 7305 }, { "epoch": 3.2449478125693982, "grad_norm": 0.3260603432721134, "learning_rate": 1.0449158936854308e-06, "loss": 0.019, "step": 7306 }, { "epoch": 3.2453919609149455, "grad_norm": 0.40232295144807395, "learning_rate": 1.0437302050458053e-06, "loss": 0.0343, "step": 7307 }, { "epoch": 3.245836109260493, "grad_norm": 0.435190410566104, "learning_rate": 1.0425451111007368e-06, "loss": 0.0371, "step": 7308 }, { "epoch": 3.2462802576060406, "grad_norm": 0.36886684345005827, "learning_rate": 1.0413606120283616e-06, "loss": 0.0159, "step": 7309 }, { "epoch": 3.246724405951588, "grad_norm": 0.6185604106742186, "learning_rate": 1.040176708006732e-06, "loss": 0.0317, "step": 7310 }, { "epoch": 3.247168554297135, "grad_norm": 0.4287929879516384, "learning_rate": 1.0389933992138106e-06, "loss": 0.0285, "step": 7311 }, { "epoch": 3.247612702642683, "grad_norm": 0.5172218558312743, "learning_rate": 1.0378106858274639e-06, "loss": 0.0269, "step": 7312 }, { "epoch": 3.24805685098823, "grad_norm": 0.47500334875701583, "learning_rate": 1.036628568025479e-06, "loss": 0.0291, "step": 7313 }, { "epoch": 3.2485009993337775, "grad_norm": 0.36315045000352464, "learning_rate": 1.035447045985547e-06, "loss": 0.0205, "step": 7314 }, { "epoch": 3.2489451476793247, "grad_norm": 0.3702839335927883, "learning_rate": 1.0342661198852689e-06, "loss": 0.0272, "step": 7315 }, { "epoch": 3.2493892960248725, "grad_norm": 0.4172702979013527, "learning_rate": 1.0330857899021584e-06, "loss": 0.0288, "step": 7316 }, { "epoch": 3.2498334443704198, "grad_norm": 0.4003396546345466, "learning_rate": 1.03190605621364e-06, "loss": 0.0249, "step": 7317 }, { "epoch": 3.250277592715967, "grad_norm": 0.36419349273758006, "learning_rate": 1.0307269189970482e-06, "loss": 0.0227, "step": 7318 }, { "epoch": 3.250721741061515, "grad_norm": 0.46337961532186456, "learning_rate": 1.0295483784296274e-06, "loss": 0.0292, "step": 7319 }, { "epoch": 3.251165889407062, "grad_norm": 0.4844395758839405, "learning_rate": 1.0283704346885303e-06, "loss": 0.0315, "step": 7320 }, { "epoch": 3.2516100377526094, "grad_norm": 0.43388879742046443, "learning_rate": 1.027193087950823e-06, "loss": 0.0253, "step": 7321 }, { "epoch": 3.2520541860981567, "grad_norm": 0.43253571684648157, "learning_rate": 1.0260163383934807e-06, "loss": 0.0292, "step": 7322 }, { "epoch": 3.252498334443704, "grad_norm": 0.37106610809103213, "learning_rate": 1.0248401861933888e-06, "loss": 0.0208, "step": 7323 }, { "epoch": 3.2529424827892517, "grad_norm": 0.35259048739158694, "learning_rate": 1.0236646315273436e-06, "loss": 0.0223, "step": 7324 }, { "epoch": 3.253386631134799, "grad_norm": 0.5151899180549023, "learning_rate": 1.0224896745720513e-06, "loss": 0.0313, "step": 7325 }, { "epoch": 3.2538307794803463, "grad_norm": 0.4063218873557718, "learning_rate": 1.0213153155041255e-06, "loss": 0.0291, "step": 7326 }, { "epoch": 3.254274927825894, "grad_norm": 0.6251543341151611, "learning_rate": 1.0201415545000941e-06, "loss": 0.0267, "step": 7327 }, { "epoch": 3.2547190761714413, "grad_norm": 0.4171212598426182, "learning_rate": 1.0189683917363947e-06, "loss": 0.0227, "step": 7328 }, { "epoch": 3.2551632245169886, "grad_norm": 0.49598266444833977, "learning_rate": 1.0177958273893684e-06, "loss": 0.0239, "step": 7329 }, { "epoch": 3.255607372862536, "grad_norm": 0.3899965486282789, "learning_rate": 1.016623861635277e-06, "loss": 0.0209, "step": 7330 }, { "epoch": 3.2560515212080836, "grad_norm": 0.43687191924955027, "learning_rate": 1.0154524946502864e-06, "loss": 0.0272, "step": 7331 }, { "epoch": 3.256495669553631, "grad_norm": 0.4063747702825569, "learning_rate": 1.01428172661047e-06, "loss": 0.0255, "step": 7332 }, { "epoch": 3.256939817899178, "grad_norm": 0.48223387404929335, "learning_rate": 1.0131115576918154e-06, "loss": 0.031, "step": 7333 }, { "epoch": 3.257383966244726, "grad_norm": 0.39589428565081675, "learning_rate": 1.011941988070219e-06, "loss": 0.0222, "step": 7334 }, { "epoch": 3.257828114590273, "grad_norm": 0.4100082037653643, "learning_rate": 1.0107730179214875e-06, "loss": 0.0359, "step": 7335 }, { "epoch": 3.2582722629358205, "grad_norm": 0.7048819567321716, "learning_rate": 1.0096046474213378e-06, "loss": 0.0239, "step": 7336 }, { "epoch": 3.258716411281368, "grad_norm": 0.4205899532427439, "learning_rate": 1.008436876745393e-06, "loss": 0.0329, "step": 7337 }, { "epoch": 3.2591605596269155, "grad_norm": 0.3747110714323454, "learning_rate": 1.00726970606919e-06, "loss": 0.0217, "step": 7338 }, { "epoch": 3.259604707972463, "grad_norm": 0.4970634544493451, "learning_rate": 1.0061031355681766e-06, "loss": 0.029, "step": 7339 }, { "epoch": 3.26004885631801, "grad_norm": 0.35455430411205807, "learning_rate": 1.0049371654177036e-06, "loss": 0.0241, "step": 7340 }, { "epoch": 3.260493004663558, "grad_norm": 0.38458418543297923, "learning_rate": 1.0037717957930404e-06, "loss": 0.0264, "step": 7341 }, { "epoch": 3.260937153009105, "grad_norm": 0.44567315711458705, "learning_rate": 1.0026070268693616e-06, "loss": 0.033, "step": 7342 }, { "epoch": 3.2613813013546524, "grad_norm": 0.6191592999982571, "learning_rate": 1.0014428588217495e-06, "loss": 0.0433, "step": 7343 }, { "epoch": 3.2618254497001997, "grad_norm": 0.31811859198013076, "learning_rate": 1.0002792918251991e-06, "loss": 0.0178, "step": 7344 }, { "epoch": 3.2622695980457475, "grad_norm": 0.4500830212917085, "learning_rate": 9.991163260546154e-07, "loss": 0.0283, "step": 7345 }, { "epoch": 3.2627137463912947, "grad_norm": 0.45351163208898665, "learning_rate": 9.979539616848088e-07, "loss": 0.0347, "step": 7346 }, { "epoch": 3.263157894736842, "grad_norm": 0.35335672660967254, "learning_rate": 9.96792198890506e-07, "loss": 0.0188, "step": 7347 }, { "epoch": 3.2636020430823893, "grad_norm": 0.44507187430087025, "learning_rate": 9.956310378463397e-07, "loss": 0.028, "step": 7348 }, { "epoch": 3.264046191427937, "grad_norm": 0.5621390992373146, "learning_rate": 9.94470478726849e-07, "loss": 0.0318, "step": 7349 }, { "epoch": 3.2644903397734844, "grad_norm": 0.4752468135015428, "learning_rate": 9.933105217064876e-07, "loss": 0.0301, "step": 7350 }, { "epoch": 3.2649344881190316, "grad_norm": 0.4866383052323718, "learning_rate": 9.921511669596169e-07, "loss": 0.0258, "step": 7351 }, { "epoch": 3.265378636464579, "grad_norm": 0.4858680526762395, "learning_rate": 9.909924146605065e-07, "loss": 0.0337, "step": 7352 }, { "epoch": 3.2658227848101267, "grad_norm": 0.39515608234465965, "learning_rate": 9.898342649833392e-07, "loss": 0.0192, "step": 7353 }, { "epoch": 3.266266933155674, "grad_norm": 0.36506881604203717, "learning_rate": 9.88676718102201e-07, "loss": 0.021, "step": 7354 }, { "epoch": 3.2667110815012212, "grad_norm": 0.4187792232437751, "learning_rate": 9.87519774191093e-07, "loss": 0.0331, "step": 7355 }, { "epoch": 3.267155229846769, "grad_norm": 0.46579982125230246, "learning_rate": 9.863634334239241e-07, "loss": 0.0231, "step": 7356 }, { "epoch": 3.2675993781923163, "grad_norm": 0.38694241705258164, "learning_rate": 9.852076959745082e-07, "loss": 0.0259, "step": 7357 }, { "epoch": 3.2680435265378636, "grad_norm": 0.46212859105639015, "learning_rate": 9.840525620165763e-07, "loss": 0.0317, "step": 7358 }, { "epoch": 3.268487674883411, "grad_norm": 0.5473555001130638, "learning_rate": 9.828980317237652e-07, "loss": 0.0396, "step": 7359 }, { "epoch": 3.2689318232289586, "grad_norm": 0.36341029826160076, "learning_rate": 9.817441052696164e-07, "loss": 0.0185, "step": 7360 }, { "epoch": 3.269375971574506, "grad_norm": 0.44926471264353585, "learning_rate": 9.805907828275874e-07, "loss": 0.0315, "step": 7361 }, { "epoch": 3.269820119920053, "grad_norm": 0.3017207630293574, "learning_rate": 9.794380645710428e-07, "loss": 0.017, "step": 7362 }, { "epoch": 3.270264268265601, "grad_norm": 0.4066309090272054, "learning_rate": 9.782859506732517e-07, "loss": 0.0252, "step": 7363 }, { "epoch": 3.270708416611148, "grad_norm": 0.45822398666676095, "learning_rate": 9.771344413074018e-07, "loss": 0.0289, "step": 7364 }, { "epoch": 3.2711525649566955, "grad_norm": 0.45754692936117375, "learning_rate": 9.75983536646581e-07, "loss": 0.0204, "step": 7365 }, { "epoch": 3.2715967133022428, "grad_norm": 0.4003872658949929, "learning_rate": 9.748332368637903e-07, "loss": 0.0239, "step": 7366 }, { "epoch": 3.2720408616477905, "grad_norm": 0.40491759770498775, "learning_rate": 9.736835421319397e-07, "loss": 0.016, "step": 7367 }, { "epoch": 3.272485009993338, "grad_norm": 0.40672999266918314, "learning_rate": 9.725344526238495e-07, "loss": 0.0275, "step": 7368 }, { "epoch": 3.272929158338885, "grad_norm": 0.4720762654089572, "learning_rate": 9.713859685122428e-07, "loss": 0.0267, "step": 7369 }, { "epoch": 3.273373306684433, "grad_norm": 0.41020011208140905, "learning_rate": 9.702380899697621e-07, "loss": 0.0231, "step": 7370 }, { "epoch": 3.27381745502998, "grad_norm": 0.51306227756855, "learning_rate": 9.69090817168949e-07, "loss": 0.0213, "step": 7371 }, { "epoch": 3.2742616033755274, "grad_norm": 0.5514386921409069, "learning_rate": 9.67944150282259e-07, "loss": 0.0388, "step": 7372 }, { "epoch": 3.2747057517210747, "grad_norm": 0.47876201361463355, "learning_rate": 9.667980894820572e-07, "loss": 0.0402, "step": 7373 }, { "epoch": 3.2751499000666224, "grad_norm": 0.35596410172080417, "learning_rate": 9.65652634940612e-07, "loss": 0.0211, "step": 7374 }, { "epoch": 3.2755940484121697, "grad_norm": 0.5330920908251411, "learning_rate": 9.64507786830109e-07, "loss": 0.0312, "step": 7375 }, { "epoch": 3.276038196757717, "grad_norm": 0.44331071434194597, "learning_rate": 9.633635453226376e-07, "loss": 0.0241, "step": 7376 }, { "epoch": 3.2764823451032643, "grad_norm": 0.4178851244614469, "learning_rate": 9.622199105901947e-07, "loss": 0.0251, "step": 7377 }, { "epoch": 3.276926493448812, "grad_norm": 0.4925055411838903, "learning_rate": 9.610768828046891e-07, "loss": 0.0312, "step": 7378 }, { "epoch": 3.2773706417943593, "grad_norm": 0.3674042441682116, "learning_rate": 9.59934462137938e-07, "loss": 0.0176, "step": 7379 }, { "epoch": 3.2778147901399066, "grad_norm": 0.41837211074854475, "learning_rate": 9.58792648761664e-07, "loss": 0.0272, "step": 7380 }, { "epoch": 3.278258938485454, "grad_norm": 0.35983457178846384, "learning_rate": 9.576514428475058e-07, "loss": 0.0182, "step": 7381 }, { "epoch": 3.2787030868310016, "grad_norm": 0.5213483263717448, "learning_rate": 9.565108445670013e-07, "loss": 0.0285, "step": 7382 }, { "epoch": 3.279147235176549, "grad_norm": 0.3798413510246776, "learning_rate": 9.55370854091604e-07, "loss": 0.0199, "step": 7383 }, { "epoch": 3.2795913835220962, "grad_norm": 0.40316029556208016, "learning_rate": 9.542314715926753e-07, "loss": 0.023, "step": 7384 }, { "epoch": 3.280035531867644, "grad_norm": 0.3279127936611918, "learning_rate": 9.5309269724148e-07, "loss": 0.0197, "step": 7385 }, { "epoch": 3.2804796802131913, "grad_norm": 0.3077370020937381, "learning_rate": 9.519545312091966e-07, "loss": 0.0186, "step": 7386 }, { "epoch": 3.2809238285587385, "grad_norm": 0.29637415446133053, "learning_rate": 9.508169736669137e-07, "loss": 0.0149, "step": 7387 }, { "epoch": 3.281367976904286, "grad_norm": 0.36806133901057364, "learning_rate": 9.496800247856219e-07, "loss": 0.0255, "step": 7388 }, { "epoch": 3.2818121252498336, "grad_norm": 0.42318209672526275, "learning_rate": 9.485436847362257e-07, "loss": 0.0368, "step": 7389 }, { "epoch": 3.282256273595381, "grad_norm": 0.4481117630581104, "learning_rate": 9.474079536895365e-07, "loss": 0.0323, "step": 7390 }, { "epoch": 3.282700421940928, "grad_norm": 0.4094546536057865, "learning_rate": 9.462728318162712e-07, "loss": 0.0216, "step": 7391 }, { "epoch": 3.283144570286476, "grad_norm": 0.4833214117317346, "learning_rate": 9.451383192870623e-07, "loss": 0.0314, "step": 7392 }, { "epoch": 3.283588718632023, "grad_norm": 0.49172490663115703, "learning_rate": 9.440044162724432e-07, "loss": 0.0229, "step": 7393 }, { "epoch": 3.2840328669775705, "grad_norm": 0.3837206768061979, "learning_rate": 9.428711229428594e-07, "loss": 0.0281, "step": 7394 }, { "epoch": 3.2844770153231178, "grad_norm": 0.38763854772417794, "learning_rate": 9.417384394686646e-07, "loss": 0.0256, "step": 7395 }, { "epoch": 3.2849211636686655, "grad_norm": 0.4292993654748122, "learning_rate": 9.406063660201214e-07, "loss": 0.0178, "step": 7396 }, { "epoch": 3.285365312014213, "grad_norm": 0.4608324783734576, "learning_rate": 9.394749027673955e-07, "loss": 0.03, "step": 7397 }, { "epoch": 3.28580946035976, "grad_norm": 0.4116770755294882, "learning_rate": 9.383440498805712e-07, "loss": 0.0266, "step": 7398 }, { "epoch": 3.286253608705308, "grad_norm": 0.4151574100082846, "learning_rate": 9.3721380752963e-07, "loss": 0.0316, "step": 7399 }, { "epoch": 3.286697757050855, "grad_norm": 0.5201578859027275, "learning_rate": 9.36084175884468e-07, "loss": 0.0273, "step": 7400 }, { "epoch": 3.2871419053964024, "grad_norm": 0.517408046194756, "learning_rate": 9.3495515511489e-07, "loss": 0.039, "step": 7401 }, { "epoch": 3.2875860537419497, "grad_norm": 0.4065347285711147, "learning_rate": 9.338267453906036e-07, "loss": 0.0193, "step": 7402 }, { "epoch": 3.2880302020874974, "grad_norm": 0.35637207302672474, "learning_rate": 9.326989468812281e-07, "loss": 0.0252, "step": 7403 }, { "epoch": 3.2884743504330447, "grad_norm": 0.3942806613898594, "learning_rate": 9.315717597562951e-07, "loss": 0.0224, "step": 7404 }, { "epoch": 3.288918498778592, "grad_norm": 0.38601773480071766, "learning_rate": 9.304451841852358e-07, "loss": 0.0291, "step": 7405 }, { "epoch": 3.2893626471241393, "grad_norm": 0.4412874940460958, "learning_rate": 9.293192203373952e-07, "loss": 0.0223, "step": 7406 }, { "epoch": 3.289806795469687, "grad_norm": 0.36047506716707345, "learning_rate": 9.281938683820258e-07, "loss": 0.0196, "step": 7407 }, { "epoch": 3.2902509438152343, "grad_norm": 0.38105181354876544, "learning_rate": 9.270691284882826e-07, "loss": 0.0215, "step": 7408 }, { "epoch": 3.2906950921607816, "grad_norm": 0.5134105812079635, "learning_rate": 9.259450008252396e-07, "loss": 0.0407, "step": 7409 }, { "epoch": 3.291139240506329, "grad_norm": 0.3562398576839381, "learning_rate": 9.248214855618676e-07, "loss": 0.0267, "step": 7410 }, { "epoch": 3.2915833888518766, "grad_norm": 0.40663195402797286, "learning_rate": 9.236985828670519e-07, "loss": 0.0187, "step": 7411 }, { "epoch": 3.292027537197424, "grad_norm": 0.7855995128015365, "learning_rate": 9.225762929095844e-07, "loss": 0.0372, "step": 7412 }, { "epoch": 3.292471685542971, "grad_norm": 0.4097302898181189, "learning_rate": 9.214546158581622e-07, "loss": 0.0252, "step": 7413 }, { "epoch": 3.292915833888519, "grad_norm": 0.4179153487359494, "learning_rate": 9.203335518813922e-07, "loss": 0.0285, "step": 7414 }, { "epoch": 3.2933599822340662, "grad_norm": 0.5431971211223403, "learning_rate": 9.192131011477934e-07, "loss": 0.0303, "step": 7415 }, { "epoch": 3.2938041305796135, "grad_norm": 0.5065980372967815, "learning_rate": 9.180932638257845e-07, "loss": 0.028, "step": 7416 }, { "epoch": 3.294248278925161, "grad_norm": 0.38822134135830527, "learning_rate": 9.169740400836974e-07, "loss": 0.029, "step": 7417 }, { "epoch": 3.2946924272707085, "grad_norm": 0.3332433458833639, "learning_rate": 9.158554300897727e-07, "loss": 0.0174, "step": 7418 }, { "epoch": 3.295136575616256, "grad_norm": 0.3544713822711631, "learning_rate": 9.147374340121523e-07, "loss": 0.022, "step": 7419 }, { "epoch": 3.295580723961803, "grad_norm": 0.32602497933389224, "learning_rate": 9.13620052018892e-07, "loss": 0.0205, "step": 7420 }, { "epoch": 3.296024872307351, "grad_norm": 0.45839560888184433, "learning_rate": 9.125032842779535e-07, "loss": 0.0285, "step": 7421 }, { "epoch": 3.296469020652898, "grad_norm": 0.4451229775575657, "learning_rate": 9.113871309572059e-07, "loss": 0.0314, "step": 7422 }, { "epoch": 3.2969131689984454, "grad_norm": 0.4294843373554615, "learning_rate": 9.10271592224426e-07, "loss": 0.0279, "step": 7423 }, { "epoch": 3.2973573173439927, "grad_norm": 0.49025642020365773, "learning_rate": 9.091566682472991e-07, "loss": 0.0319, "step": 7424 }, { "epoch": 3.2978014656895405, "grad_norm": 0.39446663575000246, "learning_rate": 9.08042359193414e-07, "loss": 0.0264, "step": 7425 }, { "epoch": 3.2982456140350878, "grad_norm": 0.3073338148065069, "learning_rate": 9.06928665230275e-07, "loss": 0.022, "step": 7426 }, { "epoch": 3.298689762380635, "grad_norm": 0.5363424487989449, "learning_rate": 9.058155865252854e-07, "loss": 0.0369, "step": 7427 }, { "epoch": 3.299133910726183, "grad_norm": 0.4241449124561232, "learning_rate": 9.047031232457609e-07, "loss": 0.0284, "step": 7428 }, { "epoch": 3.29957805907173, "grad_norm": 0.5039477021882938, "learning_rate": 9.035912755589254e-07, "loss": 0.0296, "step": 7429 }, { "epoch": 3.3000222074172774, "grad_norm": 0.6325313716948877, "learning_rate": 9.024800436319059e-07, "loss": 0.0356, "step": 7430 }, { "epoch": 3.3004663557628247, "grad_norm": 0.35326550695737485, "learning_rate": 9.013694276317392e-07, "loss": 0.0293, "step": 7431 }, { "epoch": 3.3009105041083724, "grad_norm": 0.3943172052588995, "learning_rate": 9.002594277253735e-07, "loss": 0.0256, "step": 7432 }, { "epoch": 3.3013546524539197, "grad_norm": 0.3559437962740476, "learning_rate": 8.991500440796569e-07, "loss": 0.0261, "step": 7433 }, { "epoch": 3.301798800799467, "grad_norm": 0.3905214750787488, "learning_rate": 8.9804127686135e-07, "loss": 0.0166, "step": 7434 }, { "epoch": 3.3022429491450143, "grad_norm": 0.5100939910295724, "learning_rate": 8.969331262371206e-07, "loss": 0.0305, "step": 7435 }, { "epoch": 3.302687097490562, "grad_norm": 0.4081830294338292, "learning_rate": 8.958255923735404e-07, "loss": 0.0229, "step": 7436 }, { "epoch": 3.3031312458361093, "grad_norm": 0.41704083656049834, "learning_rate": 8.947186754370907e-07, "loss": 0.0281, "step": 7437 }, { "epoch": 3.3035753941816566, "grad_norm": 0.4789522896492798, "learning_rate": 8.936123755941611e-07, "loss": 0.0309, "step": 7438 }, { "epoch": 3.304019542527204, "grad_norm": 0.5252768558443547, "learning_rate": 8.925066930110465e-07, "loss": 0.0265, "step": 7439 }, { "epoch": 3.3044636908727516, "grad_norm": 0.34510316412181224, "learning_rate": 8.914016278539516e-07, "loss": 0.019, "step": 7440 }, { "epoch": 3.304907839218299, "grad_norm": 0.5857701871825775, "learning_rate": 8.902971802889832e-07, "loss": 0.0387, "step": 7441 }, { "epoch": 3.305351987563846, "grad_norm": 0.5033019496461394, "learning_rate": 8.891933504821604e-07, "loss": 0.0275, "step": 7442 }, { "epoch": 3.305796135909394, "grad_norm": 0.3999151889282477, "learning_rate": 8.880901385994079e-07, "loss": 0.0233, "step": 7443 }, { "epoch": 3.306240284254941, "grad_norm": 0.45097536087731294, "learning_rate": 8.869875448065563e-07, "loss": 0.0281, "step": 7444 }, { "epoch": 3.3066844326004885, "grad_norm": 0.4146176937666059, "learning_rate": 8.858855692693446e-07, "loss": 0.0294, "step": 7445 }, { "epoch": 3.307128580946036, "grad_norm": 0.32341587117342246, "learning_rate": 8.847842121534195e-07, "loss": 0.0254, "step": 7446 }, { "epoch": 3.3075727292915835, "grad_norm": 0.3794842748021583, "learning_rate": 8.836834736243316e-07, "loss": 0.0219, "step": 7447 }, { "epoch": 3.308016877637131, "grad_norm": 0.5092740919087244, "learning_rate": 8.825833538475403e-07, "loss": 0.0279, "step": 7448 }, { "epoch": 3.308461025982678, "grad_norm": 0.3833821462873475, "learning_rate": 8.814838529884162e-07, "loss": 0.0263, "step": 7449 }, { "epoch": 3.308905174328226, "grad_norm": 0.7887453397226605, "learning_rate": 8.803849712122292e-07, "loss": 0.0208, "step": 7450 }, { "epoch": 3.309349322673773, "grad_norm": 0.3907962831950114, "learning_rate": 8.792867086841605e-07, "loss": 0.0206, "step": 7451 }, { "epoch": 3.3097934710193204, "grad_norm": 0.3760419094116477, "learning_rate": 8.781890655692998e-07, "loss": 0.022, "step": 7452 }, { "epoch": 3.3102376193648677, "grad_norm": 0.32255052704787457, "learning_rate": 8.770920420326384e-07, "loss": 0.0181, "step": 7453 }, { "epoch": 3.3106817677104154, "grad_norm": 0.6578667511681319, "learning_rate": 8.759956382390794e-07, "loss": 0.0254, "step": 7454 }, { "epoch": 3.3111259160559627, "grad_norm": 0.42423201463173366, "learning_rate": 8.748998543534304e-07, "loss": 0.0331, "step": 7455 }, { "epoch": 3.31157006440151, "grad_norm": 0.4695388522211957, "learning_rate": 8.738046905404069e-07, "loss": 0.04, "step": 7456 }, { "epoch": 3.3120142127470578, "grad_norm": 0.488606567568898, "learning_rate": 8.72710146964631e-07, "loss": 0.0274, "step": 7457 }, { "epoch": 3.312458361092605, "grad_norm": 0.5406502377024667, "learning_rate": 8.716162237906289e-07, "loss": 0.028, "step": 7458 }, { "epoch": 3.3129025094381523, "grad_norm": 0.5699097094725243, "learning_rate": 8.705229211828376e-07, "loss": 0.03, "step": 7459 }, { "epoch": 3.3133466577836996, "grad_norm": 0.44717354800350295, "learning_rate": 8.694302393055992e-07, "loss": 0.0208, "step": 7460 }, { "epoch": 3.313790806129247, "grad_norm": 0.4253321381991377, "learning_rate": 8.683381783231615e-07, "loss": 0.0234, "step": 7461 }, { "epoch": 3.3142349544747947, "grad_norm": 0.3858599803822746, "learning_rate": 8.672467383996802e-07, "loss": 0.0281, "step": 7462 }, { "epoch": 3.314679102820342, "grad_norm": 1.0071452454502907, "learning_rate": 8.661559196992186e-07, "loss": 0.029, "step": 7463 }, { "epoch": 3.3151232511658892, "grad_norm": 0.5059569170522437, "learning_rate": 8.650657223857428e-07, "loss": 0.0304, "step": 7464 }, { "epoch": 3.315567399511437, "grad_norm": 0.4605836239110675, "learning_rate": 8.639761466231294e-07, "loss": 0.0374, "step": 7465 }, { "epoch": 3.3160115478569843, "grad_norm": 0.5663222302112452, "learning_rate": 8.628871925751598e-07, "loss": 0.029, "step": 7466 }, { "epoch": 3.3164556962025316, "grad_norm": 0.4770838107638845, "learning_rate": 8.617988604055222e-07, "loss": 0.029, "step": 7467 }, { "epoch": 3.316899844548079, "grad_norm": 0.5186866939183644, "learning_rate": 8.607111502778121e-07, "loss": 0.0367, "step": 7468 }, { "epoch": 3.3173439928936266, "grad_norm": 0.43011652502356673, "learning_rate": 8.596240623555313e-07, "loss": 0.0276, "step": 7469 }, { "epoch": 3.317788141239174, "grad_norm": 0.39270558043084164, "learning_rate": 8.585375968020854e-07, "loss": 0.0244, "step": 7470 }, { "epoch": 3.318232289584721, "grad_norm": 0.3922390336468883, "learning_rate": 8.574517537807897e-07, "loss": 0.0247, "step": 7471 }, { "epoch": 3.318676437930269, "grad_norm": 0.4905605263072924, "learning_rate": 8.563665334548654e-07, "loss": 0.0281, "step": 7472 }, { "epoch": 3.319120586275816, "grad_norm": 0.4695814471855208, "learning_rate": 8.552819359874387e-07, "loss": 0.0222, "step": 7473 }, { "epoch": 3.3195647346213635, "grad_norm": 0.49486185686147216, "learning_rate": 8.541979615415446e-07, "loss": 0.0304, "step": 7474 }, { "epoch": 3.3200088829669108, "grad_norm": 0.40290781496927697, "learning_rate": 8.531146102801208e-07, "loss": 0.0221, "step": 7475 }, { "epoch": 3.3204530313124585, "grad_norm": 0.45538047230870293, "learning_rate": 8.520318823660146e-07, "loss": 0.0257, "step": 7476 }, { "epoch": 3.320897179658006, "grad_norm": 0.4425830829496296, "learning_rate": 8.50949777961978e-07, "loss": 0.027, "step": 7477 }, { "epoch": 3.321341328003553, "grad_norm": 0.32164145024703766, "learning_rate": 8.498682972306693e-07, "loss": 0.0171, "step": 7478 }, { "epoch": 3.321785476349101, "grad_norm": 0.33243355641729805, "learning_rate": 8.487874403346547e-07, "loss": 0.0171, "step": 7479 }, { "epoch": 3.322229624694648, "grad_norm": 0.4774020513789674, "learning_rate": 8.477072074364051e-07, "loss": 0.0242, "step": 7480 }, { "epoch": 3.3226737730401954, "grad_norm": 0.7842160045286404, "learning_rate": 8.466275986982963e-07, "loss": 0.0482, "step": 7481 }, { "epoch": 3.3231179213857427, "grad_norm": 0.4462293368851325, "learning_rate": 8.455486142826135e-07, "loss": 0.0206, "step": 7482 }, { "epoch": 3.3235620697312904, "grad_norm": 0.3371098785501344, "learning_rate": 8.444702543515454e-07, "loss": 0.016, "step": 7483 }, { "epoch": 3.3240062180768377, "grad_norm": 0.4689818865427122, "learning_rate": 8.433925190671876e-07, "loss": 0.0294, "step": 7484 }, { "epoch": 3.324450366422385, "grad_norm": 0.46126294870782475, "learning_rate": 8.423154085915447e-07, "loss": 0.0405, "step": 7485 }, { "epoch": 3.3248945147679323, "grad_norm": 0.4269127171229397, "learning_rate": 8.412389230865209e-07, "loss": 0.0215, "step": 7486 }, { "epoch": 3.32533866311348, "grad_norm": 0.3601691103633931, "learning_rate": 8.401630627139317e-07, "loss": 0.0283, "step": 7487 }, { "epoch": 3.3257828114590273, "grad_norm": 0.3308167717075761, "learning_rate": 8.39087827635498e-07, "loss": 0.0203, "step": 7488 }, { "epoch": 3.3262269598045746, "grad_norm": 0.4504744849786426, "learning_rate": 8.380132180128453e-07, "loss": 0.0308, "step": 7489 }, { "epoch": 3.326671108150122, "grad_norm": 0.43274682572551426, "learning_rate": 8.369392340075056e-07, "loss": 0.0192, "step": 7490 }, { "epoch": 3.3271152564956696, "grad_norm": 0.38752988514350534, "learning_rate": 8.358658757809179e-07, "loss": 0.0326, "step": 7491 }, { "epoch": 3.327559404841217, "grad_norm": 0.3966193669552188, "learning_rate": 8.347931434944245e-07, "loss": 0.0218, "step": 7492 }, { "epoch": 3.328003553186764, "grad_norm": 0.37005902190632034, "learning_rate": 8.337210373092763e-07, "loss": 0.02, "step": 7493 }, { "epoch": 3.328447701532312, "grad_norm": 0.3982123833521082, "learning_rate": 8.326495573866284e-07, "loss": 0.0216, "step": 7494 }, { "epoch": 3.3288918498778592, "grad_norm": 0.41678676657338065, "learning_rate": 8.315787038875434e-07, "loss": 0.0284, "step": 7495 }, { "epoch": 3.3293359982234065, "grad_norm": 0.36841312927749376, "learning_rate": 8.305084769729882e-07, "loss": 0.0294, "step": 7496 }, { "epoch": 3.329780146568954, "grad_norm": 0.42365099250088206, "learning_rate": 8.294388768038375e-07, "loss": 0.0294, "step": 7497 }, { "epoch": 3.3302242949145016, "grad_norm": 0.40673867597662916, "learning_rate": 8.283699035408677e-07, "loss": 0.0235, "step": 7498 }, { "epoch": 3.330668443260049, "grad_norm": 0.40817718954362553, "learning_rate": 8.273015573447646e-07, "loss": 0.0229, "step": 7499 }, { "epoch": 3.331112591605596, "grad_norm": 0.47791402955813106, "learning_rate": 8.262338383761199e-07, "loss": 0.0306, "step": 7500 }, { "epoch": 3.331556739951144, "grad_norm": 0.3730312585666912, "learning_rate": 8.251667467954289e-07, "loss": 0.0241, "step": 7501 }, { "epoch": 3.332000888296691, "grad_norm": 0.5006016285563061, "learning_rate": 8.241002827630945e-07, "loss": 0.0284, "step": 7502 }, { "epoch": 3.3324450366422385, "grad_norm": 0.5039257261200133, "learning_rate": 8.230344464394236e-07, "loss": 0.0234, "step": 7503 }, { "epoch": 3.3328891849877857, "grad_norm": 0.38046904572274926, "learning_rate": 8.219692379846289e-07, "loss": 0.033, "step": 7504 }, { "epoch": 3.3333333333333335, "grad_norm": 0.3757296330009624, "learning_rate": 8.209046575588303e-07, "loss": 0.0291, "step": 7505 }, { "epoch": 3.3337774816788808, "grad_norm": 0.380415197644764, "learning_rate": 8.198407053220519e-07, "loss": 0.0219, "step": 7506 }, { "epoch": 3.334221630024428, "grad_norm": 0.39647003849848267, "learning_rate": 8.187773814342242e-07, "loss": 0.0307, "step": 7507 }, { "epoch": 3.334665778369976, "grad_norm": 0.48671927289549416, "learning_rate": 8.177146860551838e-07, "loss": 0.0289, "step": 7508 }, { "epoch": 3.335109926715523, "grad_norm": 0.4169191245545791, "learning_rate": 8.166526193446695e-07, "loss": 0.0286, "step": 7509 }, { "epoch": 3.3355540750610704, "grad_norm": 0.6209631358998482, "learning_rate": 8.155911814623291e-07, "loss": 0.0268, "step": 7510 }, { "epoch": 3.3359982234066177, "grad_norm": 0.4569957343588699, "learning_rate": 8.145303725677145e-07, "loss": 0.025, "step": 7511 }, { "epoch": 3.3364423717521654, "grad_norm": 0.45123275529280976, "learning_rate": 8.134701928202843e-07, "loss": 0.0207, "step": 7512 }, { "epoch": 3.3368865200977127, "grad_norm": 0.3644402406540846, "learning_rate": 8.124106423794015e-07, "loss": 0.0156, "step": 7513 }, { "epoch": 3.33733066844326, "grad_norm": 0.6935267382582291, "learning_rate": 8.113517214043326e-07, "loss": 0.042, "step": 7514 }, { "epoch": 3.3377748167888073, "grad_norm": 0.40069923978491, "learning_rate": 8.102934300542531e-07, "loss": 0.0282, "step": 7515 }, { "epoch": 3.338218965134355, "grad_norm": 0.4259846167113701, "learning_rate": 8.092357684882413e-07, "loss": 0.0289, "step": 7516 }, { "epoch": 3.3386631134799023, "grad_norm": 0.4192011063222261, "learning_rate": 8.081787368652822e-07, "loss": 0.023, "step": 7517 }, { "epoch": 3.3391072618254496, "grad_norm": 0.35804521930200645, "learning_rate": 8.071223353442658e-07, "loss": 0.0294, "step": 7518 }, { "epoch": 3.339551410170997, "grad_norm": 0.3873707899082061, "learning_rate": 8.060665640839882e-07, "loss": 0.0228, "step": 7519 }, { "epoch": 3.3399955585165446, "grad_norm": 0.33653268211511883, "learning_rate": 8.050114232431472e-07, "loss": 0.0229, "step": 7520 }, { "epoch": 3.340439706862092, "grad_norm": 0.4424670871666316, "learning_rate": 8.039569129803493e-07, "loss": 0.0273, "step": 7521 }, { "epoch": 3.340883855207639, "grad_norm": 0.36754810698365975, "learning_rate": 8.029030334541061e-07, "loss": 0.0212, "step": 7522 }, { "epoch": 3.341328003553187, "grad_norm": 0.40011975008051315, "learning_rate": 8.01849784822833e-07, "loss": 0.0213, "step": 7523 }, { "epoch": 3.3417721518987342, "grad_norm": 0.4091440045335287, "learning_rate": 8.007971672448511e-07, "loss": 0.0326, "step": 7524 }, { "epoch": 3.3422163002442815, "grad_norm": 0.3979067624482782, "learning_rate": 7.997451808783884e-07, "loss": 0.0292, "step": 7525 }, { "epoch": 3.342660448589829, "grad_norm": 0.4350141704240594, "learning_rate": 7.986938258815741e-07, "loss": 0.0294, "step": 7526 }, { "epoch": 3.3431045969353765, "grad_norm": 0.3288238940788284, "learning_rate": 7.976431024124448e-07, "loss": 0.02, "step": 7527 }, { "epoch": 3.343548745280924, "grad_norm": 0.40499639306856394, "learning_rate": 7.965930106289432e-07, "loss": 0.0249, "step": 7528 }, { "epoch": 3.343992893626471, "grad_norm": 0.4511936733531736, "learning_rate": 7.955435506889154e-07, "loss": 0.0225, "step": 7529 }, { "epoch": 3.344437041972019, "grad_norm": 0.32996474391647085, "learning_rate": 7.944947227501143e-07, "loss": 0.019, "step": 7530 }, { "epoch": 3.344881190317566, "grad_norm": 0.33989523301533825, "learning_rate": 7.934465269701941e-07, "loss": 0.0281, "step": 7531 }, { "epoch": 3.3453253386631134, "grad_norm": 0.5424696702981813, "learning_rate": 7.923989635067181e-07, "loss": 0.0341, "step": 7532 }, { "epoch": 3.3457694870086607, "grad_norm": 0.5354455170131127, "learning_rate": 7.913520325171537e-07, "loss": 0.0332, "step": 7533 }, { "epoch": 3.3462136353542085, "grad_norm": 0.5188210769342587, "learning_rate": 7.903057341588683e-07, "loss": 0.0346, "step": 7534 }, { "epoch": 3.3466577836997558, "grad_norm": 0.32324444187136564, "learning_rate": 7.892600685891433e-07, "loss": 0.0202, "step": 7535 }, { "epoch": 3.347101932045303, "grad_norm": 0.38834278141098566, "learning_rate": 7.882150359651586e-07, "loss": 0.0233, "step": 7536 }, { "epoch": 3.3475460803908508, "grad_norm": 0.44587329880583815, "learning_rate": 7.871706364439985e-07, "loss": 0.0285, "step": 7537 }, { "epoch": 3.347990228736398, "grad_norm": 0.38089139300391106, "learning_rate": 7.861268701826552e-07, "loss": 0.0264, "step": 7538 }, { "epoch": 3.3484343770819454, "grad_norm": 0.3825436500451749, "learning_rate": 7.850837373380244e-07, "loss": 0.0206, "step": 7539 }, { "epoch": 3.3488785254274926, "grad_norm": 0.3827903172923659, "learning_rate": 7.840412380669071e-07, "loss": 0.0221, "step": 7540 }, { "epoch": 3.3493226737730404, "grad_norm": 0.38625911378052546, "learning_rate": 7.829993725260082e-07, "loss": 0.0192, "step": 7541 }, { "epoch": 3.3497668221185877, "grad_norm": 0.37721835784709495, "learning_rate": 7.81958140871939e-07, "loss": 0.0291, "step": 7542 }, { "epoch": 3.350210970464135, "grad_norm": 0.39810907293172026, "learning_rate": 7.809175432612126e-07, "loss": 0.0239, "step": 7543 }, { "epoch": 3.3506551188096823, "grad_norm": 1.1010267945006187, "learning_rate": 7.798775798502484e-07, "loss": 0.0314, "step": 7544 }, { "epoch": 3.35109926715523, "grad_norm": 0.4353628491116733, "learning_rate": 7.788382507953718e-07, "loss": 0.022, "step": 7545 }, { "epoch": 3.3515434155007773, "grad_norm": 0.392235080799356, "learning_rate": 7.777995562528107e-07, "loss": 0.0187, "step": 7546 }, { "epoch": 3.3519875638463246, "grad_norm": 0.4931466449389555, "learning_rate": 7.767614963787007e-07, "loss": 0.0176, "step": 7547 }, { "epoch": 3.352431712191872, "grad_norm": 0.39853671183474193, "learning_rate": 7.757240713290764e-07, "loss": 0.0259, "step": 7548 }, { "epoch": 3.3528758605374196, "grad_norm": 0.4566981980981673, "learning_rate": 7.746872812598821e-07, "loss": 0.0255, "step": 7549 }, { "epoch": 3.353320008882967, "grad_norm": 0.4075307870306547, "learning_rate": 7.736511263269664e-07, "loss": 0.0259, "step": 7550 }, { "epoch": 3.353764157228514, "grad_norm": 0.44508580022829525, "learning_rate": 7.726156066860769e-07, "loss": 0.0221, "step": 7551 }, { "epoch": 3.354208305574062, "grad_norm": 0.5633393812829065, "learning_rate": 7.715807224928734e-07, "loss": 0.0295, "step": 7552 }, { "epoch": 3.354652453919609, "grad_norm": 0.4540973249501273, "learning_rate": 7.705464739029172e-07, "loss": 0.0262, "step": 7553 }, { "epoch": 3.3550966022651565, "grad_norm": 0.5423667462933897, "learning_rate": 7.695128610716707e-07, "loss": 0.0373, "step": 7554 }, { "epoch": 3.355540750610704, "grad_norm": 0.41533591822222976, "learning_rate": 7.684798841545043e-07, "loss": 0.02, "step": 7555 }, { "epoch": 3.3559848989562515, "grad_norm": 0.37206716629030406, "learning_rate": 7.674475433066925e-07, "loss": 0.027, "step": 7556 }, { "epoch": 3.356429047301799, "grad_norm": 0.489116896622887, "learning_rate": 7.664158386834131e-07, "loss": 0.0276, "step": 7557 }, { "epoch": 3.356873195647346, "grad_norm": 0.3415873437320046, "learning_rate": 7.653847704397504e-07, "loss": 0.0199, "step": 7558 }, { "epoch": 3.357317343992894, "grad_norm": 0.4918207209258021, "learning_rate": 7.643543387306896e-07, "loss": 0.0232, "step": 7559 }, { "epoch": 3.357761492338441, "grad_norm": 0.4284932846117368, "learning_rate": 7.63324543711122e-07, "loss": 0.0304, "step": 7560 }, { "epoch": 3.3582056406839884, "grad_norm": 0.3697104694086724, "learning_rate": 7.622953855358456e-07, "loss": 0.0206, "step": 7561 }, { "epoch": 3.3586497890295357, "grad_norm": 0.3548831840036858, "learning_rate": 7.612668643595561e-07, "loss": 0.0241, "step": 7562 }, { "epoch": 3.3590939373750834, "grad_norm": 0.3814303904037155, "learning_rate": 7.60238980336862e-07, "loss": 0.0285, "step": 7563 }, { "epoch": 3.3595380857206307, "grad_norm": 0.40972342736425, "learning_rate": 7.592117336222709e-07, "loss": 0.0231, "step": 7564 }, { "epoch": 3.359982234066178, "grad_norm": 0.4313904313100148, "learning_rate": 7.581851243701938e-07, "loss": 0.0272, "step": 7565 }, { "epoch": 3.3604263824117258, "grad_norm": 0.42249063117216307, "learning_rate": 7.571591527349481e-07, "loss": 0.0245, "step": 7566 }, { "epoch": 3.360870530757273, "grad_norm": 0.6077788918648797, "learning_rate": 7.561338188707562e-07, "loss": 0.0245, "step": 7567 }, { "epoch": 3.3613146791028203, "grad_norm": 0.3504403283983759, "learning_rate": 7.551091229317398e-07, "loss": 0.0233, "step": 7568 }, { "epoch": 3.3617588274483676, "grad_norm": 0.5216233787292206, "learning_rate": 7.540850650719317e-07, "loss": 0.0323, "step": 7569 }, { "epoch": 3.3622029757939154, "grad_norm": 0.350140151698931, "learning_rate": 7.530616454452644e-07, "loss": 0.0177, "step": 7570 }, { "epoch": 3.3626471241394627, "grad_norm": 0.4407132954191557, "learning_rate": 7.520388642055737e-07, "loss": 0.0315, "step": 7571 }, { "epoch": 3.36309127248501, "grad_norm": 0.39660867426600405, "learning_rate": 7.510167215066022e-07, "loss": 0.0219, "step": 7572 }, { "epoch": 3.3635354208305572, "grad_norm": 0.4580507155342704, "learning_rate": 7.499952175019947e-07, "loss": 0.0289, "step": 7573 }, { "epoch": 3.363979569176105, "grad_norm": 0.3682289326212527, "learning_rate": 7.489743523453013e-07, "loss": 0.0239, "step": 7574 }, { "epoch": 3.3644237175216523, "grad_norm": 0.488168671743432, "learning_rate": 7.479541261899758e-07, "loss": 0.0257, "step": 7575 }, { "epoch": 3.3648678658671995, "grad_norm": 0.5131286442085937, "learning_rate": 7.469345391893739e-07, "loss": 0.0251, "step": 7576 }, { "epoch": 3.365312014212747, "grad_norm": 0.5086234654496984, "learning_rate": 7.459155914967581e-07, "loss": 0.0312, "step": 7577 }, { "epoch": 3.3657561625582946, "grad_norm": 1.2159009232304825, "learning_rate": 7.448972832652939e-07, "loss": 0.034, "step": 7578 }, { "epoch": 3.366200310903842, "grad_norm": 0.42264429800124254, "learning_rate": 7.438796146480471e-07, "loss": 0.0366, "step": 7579 }, { "epoch": 3.366644459249389, "grad_norm": 0.474511735916579, "learning_rate": 7.428625857979943e-07, "loss": 0.0282, "step": 7580 }, { "epoch": 3.367088607594937, "grad_norm": 0.48689953790784196, "learning_rate": 7.418461968680124e-07, "loss": 0.0303, "step": 7581 }, { "epoch": 3.367532755940484, "grad_norm": 0.34637453409423774, "learning_rate": 7.408304480108791e-07, "loss": 0.0213, "step": 7582 }, { "epoch": 3.3679769042860315, "grad_norm": 0.4489470189672078, "learning_rate": 7.398153393792801e-07, "loss": 0.0314, "step": 7583 }, { "epoch": 3.3684210526315788, "grad_norm": 0.38625164102595216, "learning_rate": 7.388008711258049e-07, "loss": 0.0205, "step": 7584 }, { "epoch": 3.3688652009771265, "grad_norm": 0.38974030926942727, "learning_rate": 7.37787043402941e-07, "loss": 0.0263, "step": 7585 }, { "epoch": 3.369309349322674, "grad_norm": 0.43774082705869743, "learning_rate": 7.367738563630894e-07, "loss": 0.0218, "step": 7586 }, { "epoch": 3.369753497668221, "grad_norm": 0.6108985899563969, "learning_rate": 7.357613101585459e-07, "loss": 0.0311, "step": 7587 }, { "epoch": 3.370197646013769, "grad_norm": 0.46815029644071443, "learning_rate": 7.347494049415139e-07, "loss": 0.031, "step": 7588 }, { "epoch": 3.370641794359316, "grad_norm": 0.38759059737412377, "learning_rate": 7.337381408641004e-07, "loss": 0.0201, "step": 7589 }, { "epoch": 3.3710859427048634, "grad_norm": 0.44352496378452017, "learning_rate": 7.327275180783156e-07, "loss": 0.0321, "step": 7590 }, { "epoch": 3.3715300910504107, "grad_norm": 0.48865244874942404, "learning_rate": 7.317175367360729e-07, "loss": 0.024, "step": 7591 }, { "epoch": 3.3719742393959584, "grad_norm": 0.4173965077203295, "learning_rate": 7.30708196989191e-07, "loss": 0.0208, "step": 7592 }, { "epoch": 3.3724183877415057, "grad_norm": 0.4555470735195906, "learning_rate": 7.296994989893885e-07, "loss": 0.0217, "step": 7593 }, { "epoch": 3.372862536087053, "grad_norm": 0.3350515129151666, "learning_rate": 7.286914428882913e-07, "loss": 0.021, "step": 7594 }, { "epoch": 3.3733066844326007, "grad_norm": 0.310648697981219, "learning_rate": 7.276840288374281e-07, "loss": 0.0257, "step": 7595 }, { "epoch": 3.373750832778148, "grad_norm": 0.5863731801123462, "learning_rate": 7.266772569882269e-07, "loss": 0.0311, "step": 7596 }, { "epoch": 3.3741949811236953, "grad_norm": 0.31868586904538343, "learning_rate": 7.256711274920264e-07, "loss": 0.0194, "step": 7597 }, { "epoch": 3.3746391294692426, "grad_norm": 0.3360320574193569, "learning_rate": 7.246656405000646e-07, "loss": 0.0219, "step": 7598 }, { "epoch": 3.37508327781479, "grad_norm": 0.4187991374923001, "learning_rate": 7.236607961634812e-07, "loss": 0.0266, "step": 7599 }, { "epoch": 3.3755274261603376, "grad_norm": 0.3828524070852837, "learning_rate": 7.22656594633322e-07, "loss": 0.026, "step": 7600 }, { "epoch": 3.375971574505885, "grad_norm": 0.4895737837652768, "learning_rate": 7.216530360605379e-07, "loss": 0.0413, "step": 7601 }, { "epoch": 3.376415722851432, "grad_norm": 0.5857587373369051, "learning_rate": 7.206501205959759e-07, "loss": 0.0219, "step": 7602 }, { "epoch": 3.37685987119698, "grad_norm": 0.383441583877617, "learning_rate": 7.196478483903968e-07, "loss": 0.0196, "step": 7603 }, { "epoch": 3.3773040195425272, "grad_norm": 0.33141940449265894, "learning_rate": 7.186462195944555e-07, "loss": 0.0217, "step": 7604 }, { "epoch": 3.3777481678880745, "grad_norm": 0.37300245379755964, "learning_rate": 7.176452343587148e-07, "loss": 0.0216, "step": 7605 }, { "epoch": 3.378192316233622, "grad_norm": 0.33936395007321735, "learning_rate": 7.166448928336411e-07, "loss": 0.0222, "step": 7606 }, { "epoch": 3.3786364645791696, "grad_norm": 0.5246214544898155, "learning_rate": 7.156451951696003e-07, "loss": 0.0235, "step": 7607 }, { "epoch": 3.379080612924717, "grad_norm": 0.3948135131762046, "learning_rate": 7.146461415168637e-07, "loss": 0.0231, "step": 7608 }, { "epoch": 3.379524761270264, "grad_norm": 0.37493268321499074, "learning_rate": 7.136477320256102e-07, "loss": 0.0328, "step": 7609 }, { "epoch": 3.379968909615812, "grad_norm": 0.38860406022862326, "learning_rate": 7.126499668459135e-07, "loss": 0.0215, "step": 7610 }, { "epoch": 3.380413057961359, "grad_norm": 0.443708157737675, "learning_rate": 7.116528461277561e-07, "loss": 0.0272, "step": 7611 }, { "epoch": 3.3808572063069064, "grad_norm": 0.4409205615918148, "learning_rate": 7.106563700210234e-07, "loss": 0.0199, "step": 7612 }, { "epoch": 3.3813013546524537, "grad_norm": 0.37530972086273984, "learning_rate": 7.096605386754995e-07, "loss": 0.0291, "step": 7613 }, { "epoch": 3.3817455029980015, "grad_norm": 0.4774461806256775, "learning_rate": 7.086653522408788e-07, "loss": 0.0274, "step": 7614 }, { "epoch": 3.3821896513435488, "grad_norm": 0.4294747376572903, "learning_rate": 7.076708108667512e-07, "loss": 0.024, "step": 7615 }, { "epoch": 3.382633799689096, "grad_norm": 0.41100245317627226, "learning_rate": 7.066769147026154e-07, "loss": 0.0287, "step": 7616 }, { "epoch": 3.383077948034644, "grad_norm": 0.3045061371856552, "learning_rate": 7.056836638978698e-07, "loss": 0.0198, "step": 7617 }, { "epoch": 3.383522096380191, "grad_norm": 0.3136165513411521, "learning_rate": 7.046910586018186e-07, "loss": 0.0145, "step": 7618 }, { "epoch": 3.3839662447257384, "grad_norm": 0.443848462635569, "learning_rate": 7.036990989636628e-07, "loss": 0.0296, "step": 7619 }, { "epoch": 3.3844103930712857, "grad_norm": 0.4329635290692527, "learning_rate": 7.027077851325164e-07, "loss": 0.0286, "step": 7620 }, { "epoch": 3.3848545414168334, "grad_norm": 0.4425986943763542, "learning_rate": 7.017171172573872e-07, "loss": 0.0319, "step": 7621 }, { "epoch": 3.3852986897623807, "grad_norm": 0.3486098761277207, "learning_rate": 7.007270954871903e-07, "loss": 0.0269, "step": 7622 }, { "epoch": 3.385742838107928, "grad_norm": 0.35813998328060914, "learning_rate": 6.997377199707439e-07, "loss": 0.0212, "step": 7623 }, { "epoch": 3.3861869864534753, "grad_norm": 0.38415273327037347, "learning_rate": 6.987489908567663e-07, "loss": 0.0283, "step": 7624 }, { "epoch": 3.386631134799023, "grad_norm": 0.42667107713144753, "learning_rate": 6.977609082938791e-07, "loss": 0.0364, "step": 7625 }, { "epoch": 3.3870752831445703, "grad_norm": 0.4255004286761174, "learning_rate": 6.967734724306119e-07, "loss": 0.0374, "step": 7626 }, { "epoch": 3.3875194314901176, "grad_norm": 0.3765149927746781, "learning_rate": 6.957866834153898e-07, "loss": 0.028, "step": 7627 }, { "epoch": 3.387963579835665, "grad_norm": 0.5330001693156503, "learning_rate": 6.948005413965448e-07, "loss": 0.0379, "step": 7628 }, { "epoch": 3.3884077281812126, "grad_norm": 0.5059080740347456, "learning_rate": 6.938150465223126e-07, "loss": 0.0244, "step": 7629 }, { "epoch": 3.38885187652676, "grad_norm": 0.43454836222929405, "learning_rate": 6.928301989408253e-07, "loss": 0.0323, "step": 7630 }, { "epoch": 3.389296024872307, "grad_norm": 0.3908471485523718, "learning_rate": 6.918459988001281e-07, "loss": 0.0312, "step": 7631 }, { "epoch": 3.389740173217855, "grad_norm": 0.4420676727706239, "learning_rate": 6.908624462481584e-07, "loss": 0.026, "step": 7632 }, { "epoch": 3.390184321563402, "grad_norm": 0.35748857241333987, "learning_rate": 6.898795414327624e-07, "loss": 0.0212, "step": 7633 }, { "epoch": 3.3906284699089495, "grad_norm": 0.3032775147821817, "learning_rate": 6.888972845016889e-07, "loss": 0.0255, "step": 7634 }, { "epoch": 3.391072618254497, "grad_norm": 0.3982622931840567, "learning_rate": 6.879156756025851e-07, "loss": 0.0281, "step": 7635 }, { "epoch": 3.3915167666000445, "grad_norm": 0.5203243077622092, "learning_rate": 6.869347148830035e-07, "loss": 0.0249, "step": 7636 }, { "epoch": 3.391960914945592, "grad_norm": 0.5073777888896281, "learning_rate": 6.85954402490403e-07, "loss": 0.0408, "step": 7637 }, { "epoch": 3.392405063291139, "grad_norm": 0.6112073451304307, "learning_rate": 6.849747385721373e-07, "loss": 0.0262, "step": 7638 }, { "epoch": 3.392849211636687, "grad_norm": 0.4334382118989557, "learning_rate": 6.839957232754679e-07, "loss": 0.0247, "step": 7639 }, { "epoch": 3.393293359982234, "grad_norm": 0.37151198090225157, "learning_rate": 6.830173567475584e-07, "loss": 0.0251, "step": 7640 }, { "epoch": 3.3937375083277814, "grad_norm": 0.47432508657043243, "learning_rate": 6.820396391354722e-07, "loss": 0.0361, "step": 7641 }, { "epoch": 3.3941816566733287, "grad_norm": 0.39488273115178324, "learning_rate": 6.810625705861762e-07, "loss": 0.0226, "step": 7642 }, { "epoch": 3.3946258050188765, "grad_norm": 0.5607339834034196, "learning_rate": 6.80086151246544e-07, "loss": 0.0403, "step": 7643 }, { "epoch": 3.3950699533644237, "grad_norm": 0.4287176433981967, "learning_rate": 6.791103812633443e-07, "loss": 0.026, "step": 7644 }, { "epoch": 3.395514101709971, "grad_norm": 0.4160527081898972, "learning_rate": 6.781352607832536e-07, "loss": 0.0248, "step": 7645 }, { "epoch": 3.3959582500555188, "grad_norm": 0.31792810620229006, "learning_rate": 6.771607899528504e-07, "loss": 0.0179, "step": 7646 }, { "epoch": 3.396402398401066, "grad_norm": 0.5611086221445625, "learning_rate": 6.761869689186101e-07, "loss": 0.0266, "step": 7647 }, { "epoch": 3.3968465467466133, "grad_norm": 0.5050890369850448, "learning_rate": 6.752137978269191e-07, "loss": 0.0362, "step": 7648 }, { "epoch": 3.3972906950921606, "grad_norm": 0.3650794330064718, "learning_rate": 6.742412768240586e-07, "loss": 0.0259, "step": 7649 }, { "epoch": 3.3977348434377084, "grad_norm": 0.38912734986088593, "learning_rate": 6.732694060562162e-07, "loss": 0.0222, "step": 7650 }, { "epoch": 3.3981789917832557, "grad_norm": 0.3696530901709022, "learning_rate": 6.722981856694811e-07, "loss": 0.0242, "step": 7651 }, { "epoch": 3.398623140128803, "grad_norm": 0.35581748069826435, "learning_rate": 6.713276158098425e-07, "loss": 0.0203, "step": 7652 }, { "epoch": 3.3990672884743502, "grad_norm": 0.3267819000046619, "learning_rate": 6.703576966231939e-07, "loss": 0.0177, "step": 7653 }, { "epoch": 3.399511436819898, "grad_norm": 1.0254709261428898, "learning_rate": 6.693884282553332e-07, "loss": 0.0272, "step": 7654 }, { "epoch": 3.3999555851654453, "grad_norm": 0.5090905796653779, "learning_rate": 6.684198108519546e-07, "loss": 0.026, "step": 7655 }, { "epoch": 3.4003997335109926, "grad_norm": 0.47438772486404496, "learning_rate": 6.674518445586592e-07, "loss": 0.0293, "step": 7656 }, { "epoch": 3.40084388185654, "grad_norm": 0.508053594793761, "learning_rate": 6.664845295209499e-07, "loss": 0.0397, "step": 7657 }, { "epoch": 3.4012880302020876, "grad_norm": 0.361105004393522, "learning_rate": 6.655178658842282e-07, "loss": 0.0197, "step": 7658 }, { "epoch": 3.401732178547635, "grad_norm": 0.36194311552740505, "learning_rate": 6.645518537938012e-07, "loss": 0.0233, "step": 7659 }, { "epoch": 3.402176326893182, "grad_norm": 0.4835513612825343, "learning_rate": 6.635864933948771e-07, "loss": 0.0243, "step": 7660 }, { "epoch": 3.40262047523873, "grad_norm": 0.40734978570499325, "learning_rate": 6.626217848325656e-07, "loss": 0.0235, "step": 7661 }, { "epoch": 3.403064623584277, "grad_norm": 0.48301835002180094, "learning_rate": 6.616577282518794e-07, "loss": 0.0307, "step": 7662 }, { "epoch": 3.4035087719298245, "grad_norm": 0.4251997392123568, "learning_rate": 6.606943237977331e-07, "loss": 0.0193, "step": 7663 }, { "epoch": 3.4039529202753718, "grad_norm": 0.45591767603385597, "learning_rate": 6.597315716149394e-07, "loss": 0.0348, "step": 7664 }, { "epoch": 3.4043970686209195, "grad_norm": 0.38978320299224184, "learning_rate": 6.587694718482213e-07, "loss": 0.024, "step": 7665 }, { "epoch": 3.404841216966467, "grad_norm": 0.36098640085183586, "learning_rate": 6.578080246421947e-07, "loss": 0.0231, "step": 7666 }, { "epoch": 3.405285365312014, "grad_norm": 0.42381127870689095, "learning_rate": 6.568472301413836e-07, "loss": 0.0293, "step": 7667 }, { "epoch": 3.405729513657562, "grad_norm": 0.36978104460309597, "learning_rate": 6.558870884902119e-07, "loss": 0.0228, "step": 7668 }, { "epoch": 3.406173662003109, "grad_norm": 0.5467829172657439, "learning_rate": 6.549275998330029e-07, "loss": 0.027, "step": 7669 }, { "epoch": 3.4066178103486564, "grad_norm": 0.42439207241368543, "learning_rate": 6.539687643139847e-07, "loss": 0.0243, "step": 7670 }, { "epoch": 3.4070619586942037, "grad_norm": 0.34272251460913966, "learning_rate": 6.530105820772897e-07, "loss": 0.0151, "step": 7671 }, { "epoch": 3.4075061070397514, "grad_norm": 0.4182566186357329, "learning_rate": 6.52053053266945e-07, "loss": 0.0205, "step": 7672 }, { "epoch": 3.4079502553852987, "grad_norm": 0.350434375510393, "learning_rate": 6.51096178026886e-07, "loss": 0.0237, "step": 7673 }, { "epoch": 3.408394403730846, "grad_norm": 0.36842396127077415, "learning_rate": 6.50139956500947e-07, "loss": 0.0248, "step": 7674 }, { "epoch": 3.4088385520763937, "grad_norm": 0.3776577384417059, "learning_rate": 6.491843888328625e-07, "loss": 0.026, "step": 7675 }, { "epoch": 3.409282700421941, "grad_norm": 0.4739319012198071, "learning_rate": 6.482294751662721e-07, "loss": 0.0353, "step": 7676 }, { "epoch": 3.4097268487674883, "grad_norm": 0.35847335034757755, "learning_rate": 6.472752156447148e-07, "loss": 0.0198, "step": 7677 }, { "epoch": 3.4101709971130356, "grad_norm": 0.41470003345548434, "learning_rate": 6.463216104116327e-07, "loss": 0.0206, "step": 7678 }, { "epoch": 3.4106151454585834, "grad_norm": 0.41774456358583356, "learning_rate": 6.453686596103697e-07, "loss": 0.0296, "step": 7679 }, { "epoch": 3.4110592938041306, "grad_norm": 0.4014386567309541, "learning_rate": 6.444163633841688e-07, "loss": 0.0283, "step": 7680 }, { "epoch": 3.411503442149678, "grad_norm": 0.5677232388670659, "learning_rate": 6.434647218761764e-07, "loss": 0.0399, "step": 7681 }, { "epoch": 3.4119475904952252, "grad_norm": 0.38057897363042437, "learning_rate": 6.425137352294408e-07, "loss": 0.0241, "step": 7682 }, { "epoch": 3.412391738840773, "grad_norm": 0.4520213263061807, "learning_rate": 6.415634035869117e-07, "loss": 0.0242, "step": 7683 }, { "epoch": 3.4128358871863202, "grad_norm": 0.4445568386773207, "learning_rate": 6.406137270914404e-07, "loss": 0.0327, "step": 7684 }, { "epoch": 3.4132800355318675, "grad_norm": 0.48242749215653374, "learning_rate": 6.396647058857792e-07, "loss": 0.031, "step": 7685 }, { "epoch": 3.413724183877415, "grad_norm": 0.3708344900625483, "learning_rate": 6.387163401125812e-07, "loss": 0.0233, "step": 7686 }, { "epoch": 3.4141683322229626, "grad_norm": 0.3782593306434555, "learning_rate": 6.377686299144025e-07, "loss": 0.0269, "step": 7687 }, { "epoch": 3.41461248056851, "grad_norm": 0.43908345913471114, "learning_rate": 6.368215754337004e-07, "loss": 0.0307, "step": 7688 }, { "epoch": 3.415056628914057, "grad_norm": 0.5407716737755903, "learning_rate": 6.358751768128324e-07, "loss": 0.0259, "step": 7689 }, { "epoch": 3.415500777259605, "grad_norm": 0.3714076929525467, "learning_rate": 6.349294341940593e-07, "loss": 0.0182, "step": 7690 }, { "epoch": 3.415944925605152, "grad_norm": 0.5168319772596983, "learning_rate": 6.339843477195423e-07, "loss": 0.0405, "step": 7691 }, { "epoch": 3.4163890739506995, "grad_norm": 0.470888647389716, "learning_rate": 6.330399175313429e-07, "loss": 0.0326, "step": 7692 }, { "epoch": 3.4168332222962468, "grad_norm": 0.3446195227420253, "learning_rate": 6.320961437714257e-07, "loss": 0.0136, "step": 7693 }, { "epoch": 3.4172773706417945, "grad_norm": 0.37511059576115724, "learning_rate": 6.311530265816551e-07, "loss": 0.0336, "step": 7694 }, { "epoch": 3.4177215189873418, "grad_norm": 0.362187597907496, "learning_rate": 6.302105661037988e-07, "loss": 0.0212, "step": 7695 }, { "epoch": 3.418165667332889, "grad_norm": 0.4435954280766343, "learning_rate": 6.292687624795257e-07, "loss": 0.0315, "step": 7696 }, { "epoch": 3.418609815678437, "grad_norm": 0.36818994844478536, "learning_rate": 6.283276158504015e-07, "loss": 0.0226, "step": 7697 }, { "epoch": 3.419053964023984, "grad_norm": 0.35373090512917094, "learning_rate": 6.27387126357899e-07, "loss": 0.0165, "step": 7698 }, { "epoch": 3.4194981123695314, "grad_norm": 0.364744423870996, "learning_rate": 6.264472941433886e-07, "loss": 0.0159, "step": 7699 }, { "epoch": 3.4199422607150787, "grad_norm": 0.43254863668472154, "learning_rate": 6.255081193481438e-07, "loss": 0.0269, "step": 7700 }, { "epoch": 3.4203864090606264, "grad_norm": 0.3606692968436507, "learning_rate": 6.24569602113338e-07, "loss": 0.0304, "step": 7701 }, { "epoch": 3.4208305574061737, "grad_norm": 0.5007533786139542, "learning_rate": 6.236317425800481e-07, "loss": 0.0287, "step": 7702 }, { "epoch": 3.421274705751721, "grad_norm": 0.41340230947723855, "learning_rate": 6.226945408892477e-07, "loss": 0.0199, "step": 7703 }, { "epoch": 3.4217188540972687, "grad_norm": 0.365085912872857, "learning_rate": 6.21757997181815e-07, "loss": 0.021, "step": 7704 }, { "epoch": 3.422163002442816, "grad_norm": 0.5548854003945028, "learning_rate": 6.208221115985285e-07, "loss": 0.0336, "step": 7705 }, { "epoch": 3.4226071507883633, "grad_norm": 0.5898794387281542, "learning_rate": 6.198868842800681e-07, "loss": 0.0329, "step": 7706 }, { "epoch": 3.4230512991339106, "grad_norm": 0.670696139265725, "learning_rate": 6.189523153670152e-07, "loss": 0.0411, "step": 7707 }, { "epoch": 3.4234954474794583, "grad_norm": 0.44881661557818314, "learning_rate": 6.180184049998489e-07, "loss": 0.0294, "step": 7708 }, { "epoch": 3.4239395958250056, "grad_norm": 0.41059208128468105, "learning_rate": 6.170851533189537e-07, "loss": 0.0221, "step": 7709 }, { "epoch": 3.424383744170553, "grad_norm": 0.4264078976239932, "learning_rate": 6.161525604646124e-07, "loss": 0.0247, "step": 7710 }, { "epoch": 3.4248278925161, "grad_norm": 0.4696315553933537, "learning_rate": 6.152206265770095e-07, "loss": 0.026, "step": 7711 }, { "epoch": 3.425272040861648, "grad_norm": 0.45378823926056083, "learning_rate": 6.142893517962312e-07, "loss": 0.0235, "step": 7712 }, { "epoch": 3.4257161892071952, "grad_norm": 0.4121833782140256, "learning_rate": 6.133587362622645e-07, "loss": 0.0272, "step": 7713 }, { "epoch": 3.4261603375527425, "grad_norm": 0.3934369295423382, "learning_rate": 6.124287801149942e-07, "loss": 0.0254, "step": 7714 }, { "epoch": 3.42660448589829, "grad_norm": 0.5182408476406597, "learning_rate": 6.114994834942106e-07, "loss": 0.0311, "step": 7715 }, { "epoch": 3.4270486342438375, "grad_norm": 0.443783825192744, "learning_rate": 6.105708465396021e-07, "loss": 0.0245, "step": 7716 }, { "epoch": 3.427492782589385, "grad_norm": 0.47129625335261466, "learning_rate": 6.096428693907591e-07, "loss": 0.0254, "step": 7717 }, { "epoch": 3.427936930934932, "grad_norm": 0.38995292895347056, "learning_rate": 6.087155521871713e-07, "loss": 0.028, "step": 7718 }, { "epoch": 3.42838107928048, "grad_norm": 0.38715826339897036, "learning_rate": 6.077888950682326e-07, "loss": 0.0228, "step": 7719 }, { "epoch": 3.428825227626027, "grad_norm": 0.4108315391155564, "learning_rate": 6.068628981732322e-07, "loss": 0.0235, "step": 7720 }, { "epoch": 3.4292693759715744, "grad_norm": 0.3702934691579192, "learning_rate": 6.059375616413643e-07, "loss": 0.0211, "step": 7721 }, { "epoch": 3.4297135243171217, "grad_norm": 0.3963620092994827, "learning_rate": 6.050128856117232e-07, "loss": 0.019, "step": 7722 }, { "epoch": 3.4301576726626695, "grad_norm": 0.37150524249459593, "learning_rate": 6.040888702233033e-07, "loss": 0.0224, "step": 7723 }, { "epoch": 3.4306018210082168, "grad_norm": 0.44821126190431915, "learning_rate": 6.031655156150007e-07, "loss": 0.0305, "step": 7724 }, { "epoch": 3.431045969353764, "grad_norm": 0.35283254586034607, "learning_rate": 6.022428219256087e-07, "loss": 0.024, "step": 7725 }, { "epoch": 3.431490117699312, "grad_norm": 0.4935930259013348, "learning_rate": 6.013207892938261e-07, "loss": 0.0457, "step": 7726 }, { "epoch": 3.431934266044859, "grad_norm": 0.45369029101077796, "learning_rate": 6.003994178582489e-07, "loss": 0.0243, "step": 7727 }, { "epoch": 3.4323784143904064, "grad_norm": 0.48244218877941225, "learning_rate": 5.994787077573754e-07, "loss": 0.0231, "step": 7728 }, { "epoch": 3.4328225627359537, "grad_norm": 0.3599007846172956, "learning_rate": 5.985586591296044e-07, "loss": 0.0182, "step": 7729 }, { "epoch": 3.4332667110815014, "grad_norm": 0.40382161309843, "learning_rate": 5.976392721132351e-07, "loss": 0.0152, "step": 7730 }, { "epoch": 3.4337108594270487, "grad_norm": 0.44806877408277146, "learning_rate": 5.967205468464648e-07, "loss": 0.026, "step": 7731 }, { "epoch": 3.434155007772596, "grad_norm": 0.3566806457323869, "learning_rate": 5.958024834673953e-07, "loss": 0.0192, "step": 7732 }, { "epoch": 3.4345991561181437, "grad_norm": 0.4251189473202698, "learning_rate": 5.948850821140267e-07, "loss": 0.0218, "step": 7733 }, { "epoch": 3.435043304463691, "grad_norm": 0.38225003080929293, "learning_rate": 5.939683429242604e-07, "loss": 0.0217, "step": 7734 }, { "epoch": 3.4354874528092383, "grad_norm": 0.34970498765009345, "learning_rate": 5.930522660358973e-07, "loss": 0.0167, "step": 7735 }, { "epoch": 3.4359316011547856, "grad_norm": 0.41748998353130423, "learning_rate": 5.921368515866405e-07, "loss": 0.0314, "step": 7736 }, { "epoch": 3.436375749500333, "grad_norm": 0.5166214998053038, "learning_rate": 5.912220997140905e-07, "loss": 0.0327, "step": 7737 }, { "epoch": 3.4368198978458806, "grad_norm": 0.4503364511144795, "learning_rate": 5.903080105557507e-07, "loss": 0.0305, "step": 7738 }, { "epoch": 3.437264046191428, "grad_norm": 0.38794068759388, "learning_rate": 5.893945842490245e-07, "loss": 0.0234, "step": 7739 }, { "epoch": 3.437708194536975, "grad_norm": 0.4805056088393158, "learning_rate": 5.884818209312159e-07, "loss": 0.0298, "step": 7740 }, { "epoch": 3.438152342882523, "grad_norm": 0.5076559758304271, "learning_rate": 5.875697207395286e-07, "loss": 0.035, "step": 7741 }, { "epoch": 3.43859649122807, "grad_norm": 0.42338511678046004, "learning_rate": 5.866582838110657e-07, "loss": 0.0363, "step": 7742 }, { "epoch": 3.4390406395736175, "grad_norm": 0.36808395417139134, "learning_rate": 5.857475102828325e-07, "loss": 0.0188, "step": 7743 }, { "epoch": 3.439484787919165, "grad_norm": 0.3177380221983695, "learning_rate": 5.848374002917329e-07, "loss": 0.0193, "step": 7744 }, { "epoch": 3.4399289362647125, "grad_norm": 0.3914364144553906, "learning_rate": 5.839279539745729e-07, "loss": 0.0223, "step": 7745 }, { "epoch": 3.44037308461026, "grad_norm": 0.402651938874195, "learning_rate": 5.830191714680578e-07, "loss": 0.0199, "step": 7746 }, { "epoch": 3.440817232955807, "grad_norm": 0.3522412455115687, "learning_rate": 5.821110529087932e-07, "loss": 0.0247, "step": 7747 }, { "epoch": 3.441261381301355, "grad_norm": 0.4021635163074727, "learning_rate": 5.812035984332832e-07, "loss": 0.0229, "step": 7748 }, { "epoch": 3.441705529646902, "grad_norm": 0.4251094753016676, "learning_rate": 5.802968081779342e-07, "loss": 0.024, "step": 7749 }, { "epoch": 3.4421496779924494, "grad_norm": 0.3662362137440131, "learning_rate": 5.79390682279053e-07, "loss": 0.0247, "step": 7750 }, { "epoch": 3.4425938263379967, "grad_norm": 0.4585215842279442, "learning_rate": 5.784852208728453e-07, "loss": 0.0295, "step": 7751 }, { "epoch": 3.4430379746835444, "grad_norm": 0.5141537675737569, "learning_rate": 5.775804240954181e-07, "loss": 0.0318, "step": 7752 }, { "epoch": 3.4434821230290917, "grad_norm": 0.5180769012624266, "learning_rate": 5.766762920827762e-07, "loss": 0.0255, "step": 7753 }, { "epoch": 3.443926271374639, "grad_norm": 0.4003620998147694, "learning_rate": 5.757728249708261e-07, "loss": 0.0271, "step": 7754 }, { "epoch": 3.4443704197201868, "grad_norm": 0.3418440737923465, "learning_rate": 5.748700228953758e-07, "loss": 0.0184, "step": 7755 }, { "epoch": 3.444814568065734, "grad_norm": 0.687451428270991, "learning_rate": 5.739678859921299e-07, "loss": 0.0362, "step": 7756 }, { "epoch": 3.4452587164112813, "grad_norm": 0.5024182127797775, "learning_rate": 5.730664143966969e-07, "loss": 0.0278, "step": 7757 }, { "epoch": 3.4457028647568286, "grad_norm": 0.38303277714696055, "learning_rate": 5.721656082445825e-07, "loss": 0.0193, "step": 7758 }, { "epoch": 3.4461470131023764, "grad_norm": 0.47510936774303797, "learning_rate": 5.712654676711921e-07, "loss": 0.0246, "step": 7759 }, { "epoch": 3.4465911614479237, "grad_norm": 0.3741311721785513, "learning_rate": 5.703659928118333e-07, "loss": 0.0181, "step": 7760 }, { "epoch": 3.447035309793471, "grad_norm": 0.4576763768736328, "learning_rate": 5.694671838017119e-07, "loss": 0.0274, "step": 7761 }, { "epoch": 3.4474794581390182, "grad_norm": 0.36322150440607, "learning_rate": 5.685690407759342e-07, "loss": 0.0198, "step": 7762 }, { "epoch": 3.447923606484566, "grad_norm": 0.5039426250866698, "learning_rate": 5.676715638695063e-07, "loss": 0.0299, "step": 7763 }, { "epoch": 3.4483677548301133, "grad_norm": 0.499760373842697, "learning_rate": 5.667747532173362e-07, "loss": 0.0244, "step": 7764 }, { "epoch": 3.4488119031756606, "grad_norm": 0.42349126436586376, "learning_rate": 5.658786089542262e-07, "loss": 0.0265, "step": 7765 }, { "epoch": 3.449256051521208, "grad_norm": 0.5133192715069692, "learning_rate": 5.649831312148845e-07, "loss": 0.0381, "step": 7766 }, { "epoch": 3.4497001998667556, "grad_norm": 0.4265674001652618, "learning_rate": 5.640883201339154e-07, "loss": 0.0254, "step": 7767 }, { "epoch": 3.450144348212303, "grad_norm": 0.5615805590965591, "learning_rate": 5.631941758458254e-07, "loss": 0.028, "step": 7768 }, { "epoch": 3.45058849655785, "grad_norm": 0.6650175323233352, "learning_rate": 5.623006984850193e-07, "loss": 0.0313, "step": 7769 }, { "epoch": 3.451032644903398, "grad_norm": 0.4291032020769662, "learning_rate": 5.61407888185801e-07, "loss": 0.031, "step": 7770 }, { "epoch": 3.451476793248945, "grad_norm": 0.438818710394981, "learning_rate": 5.60515745082375e-07, "loss": 0.0271, "step": 7771 }, { "epoch": 3.4519209415944925, "grad_norm": 0.4109123087663869, "learning_rate": 5.596242693088478e-07, "loss": 0.022, "step": 7772 }, { "epoch": 3.4523650899400398, "grad_norm": 0.5366106862877935, "learning_rate": 5.587334609992195e-07, "loss": 0.04, "step": 7773 }, { "epoch": 3.4528092382855875, "grad_norm": 0.37769149540290886, "learning_rate": 5.578433202873967e-07, "loss": 0.0292, "step": 7774 }, { "epoch": 3.453253386631135, "grad_norm": 0.4649378802579783, "learning_rate": 5.569538473071834e-07, "loss": 0.0227, "step": 7775 }, { "epoch": 3.453697534976682, "grad_norm": 0.48743583603374796, "learning_rate": 5.560650421922798e-07, "loss": 0.0315, "step": 7776 }, { "epoch": 3.45414168332223, "grad_norm": 0.5101610442676535, "learning_rate": 5.551769050762895e-07, "loss": 0.03, "step": 7777 }, { "epoch": 3.454585831667777, "grad_norm": 0.3279417409470981, "learning_rate": 5.542894360927148e-07, "loss": 0.0171, "step": 7778 }, { "epoch": 3.4550299800133244, "grad_norm": 0.36215562405473767, "learning_rate": 5.534026353749572e-07, "loss": 0.0249, "step": 7779 }, { "epoch": 3.4554741283588717, "grad_norm": 0.33716482541449544, "learning_rate": 5.52516503056319e-07, "loss": 0.0205, "step": 7780 }, { "epoch": 3.4559182767044194, "grad_norm": 0.3903797663782476, "learning_rate": 5.516310392699991e-07, "loss": 0.0192, "step": 7781 }, { "epoch": 3.4563624250499667, "grad_norm": 0.4369763600207604, "learning_rate": 5.507462441490985e-07, "loss": 0.0254, "step": 7782 }, { "epoch": 3.456806573395514, "grad_norm": 0.4196928785980343, "learning_rate": 5.498621178266167e-07, "loss": 0.0242, "step": 7783 }, { "epoch": 3.4572507217410617, "grad_norm": 0.36290486329076327, "learning_rate": 5.489786604354535e-07, "loss": 0.0284, "step": 7784 }, { "epoch": 3.457694870086609, "grad_norm": 0.4926206504376682, "learning_rate": 5.480958721084074e-07, "loss": 0.0281, "step": 7785 }, { "epoch": 3.4581390184321563, "grad_norm": 0.4384128683223243, "learning_rate": 5.472137529781768e-07, "loss": 0.0281, "step": 7786 }, { "epoch": 3.4585831667777036, "grad_norm": 0.42664316505284405, "learning_rate": 5.463323031773581e-07, "loss": 0.0286, "step": 7787 }, { "epoch": 3.4590273151232513, "grad_norm": 0.47490376622622354, "learning_rate": 5.454515228384493e-07, "loss": 0.0208, "step": 7788 }, { "epoch": 3.4594714634687986, "grad_norm": 0.40714987915665685, "learning_rate": 5.445714120938467e-07, "loss": 0.0321, "step": 7789 }, { "epoch": 3.459915611814346, "grad_norm": 0.3507834802550716, "learning_rate": 5.436919710758432e-07, "loss": 0.0225, "step": 7790 }, { "epoch": 3.460359760159893, "grad_norm": 0.3642385320392474, "learning_rate": 5.42813199916637e-07, "loss": 0.025, "step": 7791 }, { "epoch": 3.460803908505441, "grad_norm": 0.3676585444584738, "learning_rate": 5.419350987483224e-07, "loss": 0.026, "step": 7792 }, { "epoch": 3.4612480568509882, "grad_norm": 0.42929474369532705, "learning_rate": 5.410576677028906e-07, "loss": 0.0316, "step": 7793 }, { "epoch": 3.4616922051965355, "grad_norm": 0.45518167323880976, "learning_rate": 5.401809069122354e-07, "loss": 0.0284, "step": 7794 }, { "epoch": 3.462136353542083, "grad_norm": 0.4983281791130265, "learning_rate": 5.393048165081493e-07, "loss": 0.0411, "step": 7795 }, { "epoch": 3.4625805018876306, "grad_norm": 0.419339473620287, "learning_rate": 5.384293966223231e-07, "loss": 0.0215, "step": 7796 }, { "epoch": 3.463024650233178, "grad_norm": 0.3599252115295799, "learning_rate": 5.37554647386348e-07, "loss": 0.0254, "step": 7797 }, { "epoch": 3.463468798578725, "grad_norm": 0.35977012849103657, "learning_rate": 5.366805689317129e-07, "loss": 0.0293, "step": 7798 }, { "epoch": 3.463912946924273, "grad_norm": 0.5206144817249008, "learning_rate": 5.358071613898064e-07, "loss": 0.0272, "step": 7799 }, { "epoch": 3.46435709526982, "grad_norm": 0.2890681249275682, "learning_rate": 5.349344248919175e-07, "loss": 0.0199, "step": 7800 }, { "epoch": 3.4648012436153675, "grad_norm": 0.4535682999458182, "learning_rate": 5.340623595692313e-07, "loss": 0.0312, "step": 7801 }, { "epoch": 3.4652453919609147, "grad_norm": 0.4712735624226538, "learning_rate": 5.331909655528361e-07, "loss": 0.0294, "step": 7802 }, { "epoch": 3.4656895403064625, "grad_norm": 0.39969281186884054, "learning_rate": 5.323202429737179e-07, "loss": 0.0214, "step": 7803 }, { "epoch": 3.4661336886520098, "grad_norm": 0.40892930316739623, "learning_rate": 5.31450191962759e-07, "loss": 0.0273, "step": 7804 }, { "epoch": 3.466577836997557, "grad_norm": 0.40420963258096243, "learning_rate": 5.305808126507433e-07, "loss": 0.0257, "step": 7805 }, { "epoch": 3.467021985343105, "grad_norm": 0.4298719054004784, "learning_rate": 5.297121051683546e-07, "loss": 0.0202, "step": 7806 }, { "epoch": 3.467466133688652, "grad_norm": 0.4156335033725289, "learning_rate": 5.288440696461716e-07, "loss": 0.0206, "step": 7807 }, { "epoch": 3.4679102820341994, "grad_norm": 0.6442580410348813, "learning_rate": 5.279767062146784e-07, "loss": 0.0335, "step": 7808 }, { "epoch": 3.4683544303797467, "grad_norm": 0.42219725211126974, "learning_rate": 5.271100150042518e-07, "loss": 0.0263, "step": 7809 }, { "epoch": 3.4687985787252944, "grad_norm": 0.39548539897151663, "learning_rate": 5.262439961451709e-07, "loss": 0.0249, "step": 7810 }, { "epoch": 3.4692427270708417, "grad_norm": 0.3080627076539832, "learning_rate": 5.253786497676134e-07, "loss": 0.0189, "step": 7811 }, { "epoch": 3.469686875416389, "grad_norm": 0.435674321335036, "learning_rate": 5.245139760016549e-07, "loss": 0.026, "step": 7812 }, { "epoch": 3.4701310237619367, "grad_norm": 0.3308167388953746, "learning_rate": 5.236499749772716e-07, "loss": 0.0192, "step": 7813 }, { "epoch": 3.470575172107484, "grad_norm": 0.411730156267874, "learning_rate": 5.227866468243376e-07, "loss": 0.0278, "step": 7814 }, { "epoch": 3.4710193204530313, "grad_norm": 0.4770932092793088, "learning_rate": 5.219239916726243e-07, "loss": 0.0356, "step": 7815 }, { "epoch": 3.4714634687985786, "grad_norm": 0.5082942621753359, "learning_rate": 5.210620096518044e-07, "loss": 0.0335, "step": 7816 }, { "epoch": 3.4719076171441263, "grad_norm": 0.5849666623331924, "learning_rate": 5.202007008914489e-07, "loss": 0.0298, "step": 7817 }, { "epoch": 3.4723517654896736, "grad_norm": 0.5043080758971297, "learning_rate": 5.193400655210251e-07, "loss": 0.0257, "step": 7818 }, { "epoch": 3.472795913835221, "grad_norm": 0.35362037908035154, "learning_rate": 5.184801036699033e-07, "loss": 0.0195, "step": 7819 }, { "epoch": 3.473240062180768, "grad_norm": 0.3793466003982311, "learning_rate": 5.176208154673502e-07, "loss": 0.0198, "step": 7820 }, { "epoch": 3.473684210526316, "grad_norm": 0.3602963289642836, "learning_rate": 5.167622010425305e-07, "loss": 0.0241, "step": 7821 }, { "epoch": 3.474128358871863, "grad_norm": 0.4045415464931093, "learning_rate": 5.159042605245085e-07, "loss": 0.0308, "step": 7822 }, { "epoch": 3.4745725072174105, "grad_norm": 0.3631349056889571, "learning_rate": 5.150469940422487e-07, "loss": 0.0199, "step": 7823 }, { "epoch": 3.475016655562958, "grad_norm": 0.47233718849299045, "learning_rate": 5.141904017246097e-07, "loss": 0.0192, "step": 7824 }, { "epoch": 3.4754608039085055, "grad_norm": 0.4948999022011918, "learning_rate": 5.133344837003557e-07, "loss": 0.0354, "step": 7825 }, { "epoch": 3.475904952254053, "grad_norm": 0.37036897500750726, "learning_rate": 5.124792400981432e-07, "loss": 0.0236, "step": 7826 }, { "epoch": 3.4763491005996, "grad_norm": 0.33159665048430614, "learning_rate": 5.116246710465306e-07, "loss": 0.0243, "step": 7827 }, { "epoch": 3.476793248945148, "grad_norm": 0.4934189727153532, "learning_rate": 5.10770776673975e-07, "loss": 0.0221, "step": 7828 }, { "epoch": 3.477237397290695, "grad_norm": 0.40096875281363703, "learning_rate": 5.099175571088283e-07, "loss": 0.0224, "step": 7829 }, { "epoch": 3.4776815456362424, "grad_norm": 0.5694968604223205, "learning_rate": 5.090650124793472e-07, "loss": 0.0344, "step": 7830 }, { "epoch": 3.4781256939817897, "grad_norm": 0.347016380477264, "learning_rate": 5.082131429136833e-07, "loss": 0.0246, "step": 7831 }, { "epoch": 3.4785698423273375, "grad_norm": 0.3939110962708003, "learning_rate": 5.073619485398845e-07, "loss": 0.0216, "step": 7832 }, { "epoch": 3.4790139906728847, "grad_norm": 0.5312038258908143, "learning_rate": 5.065114294859019e-07, "loss": 0.026, "step": 7833 }, { "epoch": 3.479458139018432, "grad_norm": 0.3820612120014884, "learning_rate": 5.056615858795838e-07, "loss": 0.0231, "step": 7834 }, { "epoch": 3.4799022873639798, "grad_norm": 0.5500948375085558, "learning_rate": 5.048124178486724e-07, "loss": 0.0371, "step": 7835 }, { "epoch": 3.480346435709527, "grad_norm": 0.4385367139000315, "learning_rate": 5.039639255208156e-07, "loss": 0.032, "step": 7836 }, { "epoch": 3.4807905840550744, "grad_norm": 0.3825085947680331, "learning_rate": 5.031161090235559e-07, "loss": 0.0214, "step": 7837 }, { "epoch": 3.4812347324006216, "grad_norm": 0.42377290578412, "learning_rate": 5.022689684843329e-07, "loss": 0.0253, "step": 7838 }, { "epoch": 3.4816788807461694, "grad_norm": 0.3200977768252115, "learning_rate": 5.014225040304871e-07, "loss": 0.0172, "step": 7839 }, { "epoch": 3.4821230290917167, "grad_norm": 0.4075927728120571, "learning_rate": 5.005767157892572e-07, "loss": 0.0234, "step": 7840 }, { "epoch": 3.482567177437264, "grad_norm": 0.494712633880696, "learning_rate": 4.99731603887777e-07, "loss": 0.0308, "step": 7841 }, { "epoch": 3.4830113257828117, "grad_norm": 0.3951757300545286, "learning_rate": 4.98887168453085e-07, "loss": 0.0249, "step": 7842 }, { "epoch": 3.483455474128359, "grad_norm": 0.3887321324351885, "learning_rate": 4.980434096121106e-07, "loss": 0.0299, "step": 7843 }, { "epoch": 3.4838996224739063, "grad_norm": 0.403812873928378, "learning_rate": 4.97200327491687e-07, "loss": 0.0219, "step": 7844 }, { "epoch": 3.4843437708194536, "grad_norm": 0.4454713027127471, "learning_rate": 4.963579222185444e-07, "loss": 0.0247, "step": 7845 }, { "epoch": 3.4847879191650013, "grad_norm": 0.41374791040066133, "learning_rate": 4.955161939193087e-07, "loss": 0.0213, "step": 7846 }, { "epoch": 3.4852320675105486, "grad_norm": 0.460381233413127, "learning_rate": 4.946751427205054e-07, "loss": 0.0359, "step": 7847 }, { "epoch": 3.485676215856096, "grad_norm": 0.5274863598727876, "learning_rate": 4.938347687485629e-07, "loss": 0.0249, "step": 7848 }, { "epoch": 3.486120364201643, "grad_norm": 0.3376353042657825, "learning_rate": 4.929950721297993e-07, "loss": 0.0197, "step": 7849 }, { "epoch": 3.486564512547191, "grad_norm": 0.4896159029658331, "learning_rate": 4.921560529904374e-07, "loss": 0.0291, "step": 7850 }, { "epoch": 3.487008660892738, "grad_norm": 0.36649328899036365, "learning_rate": 4.913177114565964e-07, "loss": 0.0198, "step": 7851 }, { "epoch": 3.4874528092382855, "grad_norm": 0.46757175677374957, "learning_rate": 4.90480047654291e-07, "loss": 0.0359, "step": 7852 }, { "epoch": 3.487896957583833, "grad_norm": 0.4027846982490894, "learning_rate": 4.896430617094389e-07, "loss": 0.0284, "step": 7853 }, { "epoch": 3.4883411059293805, "grad_norm": 0.41506130883345627, "learning_rate": 4.888067537478519e-07, "loss": 0.0253, "step": 7854 }, { "epoch": 3.488785254274928, "grad_norm": 0.4526519663181836, "learning_rate": 4.879711238952412e-07, "loss": 0.0327, "step": 7855 }, { "epoch": 3.489229402620475, "grad_norm": 0.3779652676361818, "learning_rate": 4.871361722772166e-07, "loss": 0.0336, "step": 7856 }, { "epoch": 3.489673550966023, "grad_norm": 0.6066583572691198, "learning_rate": 4.86301899019287e-07, "loss": 0.0277, "step": 7857 }, { "epoch": 3.49011769931157, "grad_norm": 0.516592567980747, "learning_rate": 4.854683042468538e-07, "loss": 0.0345, "step": 7858 }, { "epoch": 3.4905618476571174, "grad_norm": 0.3300726184397244, "learning_rate": 4.84635388085225e-07, "loss": 0.0208, "step": 7859 }, { "epoch": 3.4910059960026647, "grad_norm": 0.4075507938842562, "learning_rate": 4.838031506595992e-07, "loss": 0.0253, "step": 7860 }, { "epoch": 3.4914501443482124, "grad_norm": 0.41601787060330486, "learning_rate": 4.829715920950761e-07, "loss": 0.0237, "step": 7861 }, { "epoch": 3.4918942926937597, "grad_norm": 0.5863384888723406, "learning_rate": 4.821407125166549e-07, "loss": 0.0264, "step": 7862 }, { "epoch": 3.492338441039307, "grad_norm": 0.47433808650477144, "learning_rate": 4.81310512049229e-07, "loss": 0.0285, "step": 7863 }, { "epoch": 3.4927825893848548, "grad_norm": 0.4906763276486267, "learning_rate": 4.804809908175911e-07, "loss": 0.0251, "step": 7864 }, { "epoch": 3.493226737730402, "grad_norm": 0.46826469445803337, "learning_rate": 4.796521489464351e-07, "loss": 0.0269, "step": 7865 }, { "epoch": 3.4936708860759493, "grad_norm": 0.3869063106968133, "learning_rate": 4.788239865603478e-07, "loss": 0.0304, "step": 7866 }, { "epoch": 3.4941150344214966, "grad_norm": 0.4155671863362175, "learning_rate": 4.779965037838164e-07, "loss": 0.0222, "step": 7867 }, { "epoch": 3.4945591827670444, "grad_norm": 0.40399166433294564, "learning_rate": 4.771697007412268e-07, "loss": 0.0296, "step": 7868 }, { "epoch": 3.4950033311125916, "grad_norm": 0.4132725467300106, "learning_rate": 4.763435775568592e-07, "loss": 0.0263, "step": 7869 }, { "epoch": 3.495447479458139, "grad_norm": 0.40246185290766967, "learning_rate": 4.7551813435489703e-07, "loss": 0.0256, "step": 7870 }, { "epoch": 3.4958916278036867, "grad_norm": 0.4363439376254432, "learning_rate": 4.746933712594154e-07, "loss": 0.0303, "step": 7871 }, { "epoch": 3.496335776149234, "grad_norm": 0.346578595904209, "learning_rate": 4.7386928839439183e-07, "loss": 0.0216, "step": 7872 }, { "epoch": 3.4967799244947813, "grad_norm": 0.40218292971261804, "learning_rate": 4.7304588588370113e-07, "loss": 0.022, "step": 7873 }, { "epoch": 3.4972240728403285, "grad_norm": 0.3613752874610863, "learning_rate": 4.722231638511121e-07, "loss": 0.0188, "step": 7874 }, { "epoch": 3.497668221185876, "grad_norm": 0.4305489342659995, "learning_rate": 4.7140112242029356e-07, "loss": 0.0244, "step": 7875 }, { "epoch": 3.4981123695314236, "grad_norm": 0.3319846218251371, "learning_rate": 4.7057976171481614e-07, "loss": 0.0159, "step": 7876 }, { "epoch": 3.498556517876971, "grad_norm": 0.3685184737755979, "learning_rate": 4.69759081858141e-07, "loss": 0.0248, "step": 7877 }, { "epoch": 3.499000666222518, "grad_norm": 0.5404043718315673, "learning_rate": 4.689390829736312e-07, "loss": 0.031, "step": 7878 }, { "epoch": 3.499444814568066, "grad_norm": 0.3833802780855093, "learning_rate": 4.681197651845476e-07, "loss": 0.0156, "step": 7879 }, { "epoch": 3.499888962913613, "grad_norm": 0.387890807698906, "learning_rate": 4.6730112861404497e-07, "loss": 0.028, "step": 7880 }, { "epoch": 3.5003331112591605, "grad_norm": 0.5278897289398781, "learning_rate": 4.6648317338518045e-07, "loss": 0.0305, "step": 7881 }, { "epoch": 3.5007772596047078, "grad_norm": 0.429242176272356, "learning_rate": 4.656658996209057e-07, "loss": 0.0299, "step": 7882 }, { "epoch": 3.5012214079502555, "grad_norm": 0.4369670295816076, "learning_rate": 4.6484930744407074e-07, "loss": 0.0327, "step": 7883 }, { "epoch": 3.501665556295803, "grad_norm": 0.4901187441061219, "learning_rate": 4.6403339697742413e-07, "loss": 0.0228, "step": 7884 }, { "epoch": 3.50210970464135, "grad_norm": 0.3420486455934448, "learning_rate": 4.63218168343611e-07, "loss": 0.0216, "step": 7885 }, { "epoch": 3.502553852986898, "grad_norm": 0.40939983156980164, "learning_rate": 4.624036216651723e-07, "loss": 0.0263, "step": 7886 }, { "epoch": 3.502998001332445, "grad_norm": 0.3910656718501315, "learning_rate": 4.615897570645511e-07, "loss": 0.0183, "step": 7887 }, { "epoch": 3.5034421496779924, "grad_norm": 0.4314655959506559, "learning_rate": 4.6077657466408245e-07, "loss": 0.0312, "step": 7888 }, { "epoch": 3.5038862980235397, "grad_norm": 0.3568409941841216, "learning_rate": 4.599640745860029e-07, "loss": 0.0271, "step": 7889 }, { "epoch": 3.5043304463690874, "grad_norm": 0.4093953752586413, "learning_rate": 4.5915225695244536e-07, "loss": 0.0241, "step": 7890 }, { "epoch": 3.5047745947146347, "grad_norm": 0.3977398657560647, "learning_rate": 4.583411218854383e-07, "loss": 0.0237, "step": 7891 }, { "epoch": 3.505218743060182, "grad_norm": 0.4275747909847334, "learning_rate": 4.575306695069087e-07, "loss": 0.0301, "step": 7892 }, { "epoch": 3.5056628914057297, "grad_norm": 0.5388719609229746, "learning_rate": 4.567208999386852e-07, "loss": 0.035, "step": 7893 }, { "epoch": 3.506107039751277, "grad_norm": 0.36096697143744105, "learning_rate": 4.5591181330248534e-07, "loss": 0.0207, "step": 7894 }, { "epoch": 3.5065511880968243, "grad_norm": 0.4142429440699439, "learning_rate": 4.5510340971993086e-07, "loss": 0.0186, "step": 7895 }, { "epoch": 3.5069953364423716, "grad_norm": 0.4393049642057595, "learning_rate": 4.542956893125394e-07, "loss": 0.025, "step": 7896 }, { "epoch": 3.507439484787919, "grad_norm": 0.6384180216184727, "learning_rate": 4.534886522017229e-07, "loss": 0.0266, "step": 7897 }, { "epoch": 3.5078836331334666, "grad_norm": 0.43433435288410693, "learning_rate": 4.526822985087931e-07, "loss": 0.031, "step": 7898 }, { "epoch": 3.508327781479014, "grad_norm": 0.4028630240642223, "learning_rate": 4.5187662835495974e-07, "loss": 0.0225, "step": 7899 }, { "epoch": 3.5087719298245617, "grad_norm": 0.3939689206368112, "learning_rate": 4.510716418613281e-07, "loss": 0.0224, "step": 7900 }, { "epoch": 3.509216078170109, "grad_norm": 0.515126016444997, "learning_rate": 4.502673391489026e-07, "loss": 0.029, "step": 7901 }, { "epoch": 3.5096602265156562, "grad_norm": 0.4945699098085951, "learning_rate": 4.4946372033858157e-07, "loss": 0.028, "step": 7902 }, { "epoch": 3.5101043748612035, "grad_norm": 0.43373798757249604, "learning_rate": 4.486607855511627e-07, "loss": 0.0272, "step": 7903 }, { "epoch": 3.510548523206751, "grad_norm": 0.40314045697996204, "learning_rate": 4.4785853490734277e-07, "loss": 0.027, "step": 7904 }, { "epoch": 3.5109926715522985, "grad_norm": 0.4288828381204982, "learning_rate": 4.470569685277115e-07, "loss": 0.021, "step": 7905 }, { "epoch": 3.511436819897846, "grad_norm": 0.47380514557902603, "learning_rate": 4.462560865327592e-07, "loss": 0.0352, "step": 7906 }, { "epoch": 3.511880968243393, "grad_norm": 0.6718664840187395, "learning_rate": 4.454558890428728e-07, "loss": 0.037, "step": 7907 }, { "epoch": 3.512325116588941, "grad_norm": 0.42187384792878063, "learning_rate": 4.446563761783329e-07, "loss": 0.0265, "step": 7908 }, { "epoch": 3.512769264934488, "grad_norm": 0.4115565734686187, "learning_rate": 4.43857548059321e-07, "loss": 0.0292, "step": 7909 }, { "epoch": 3.5132134132800354, "grad_norm": 0.5047831437385433, "learning_rate": 4.430594048059167e-07, "loss": 0.0283, "step": 7910 }, { "epoch": 3.5136575616255827, "grad_norm": 0.4497401686326858, "learning_rate": 4.422619465380917e-07, "loss": 0.0257, "step": 7911 }, { "epoch": 3.5141017099711305, "grad_norm": 0.41364265915281817, "learning_rate": 4.4146517337571857e-07, "loss": 0.0219, "step": 7912 }, { "epoch": 3.5145458583166778, "grad_norm": 0.5685124202462428, "learning_rate": 4.4066908543856704e-07, "loss": 0.0421, "step": 7913 }, { "epoch": 3.514990006662225, "grad_norm": 0.31669420534645987, "learning_rate": 4.3987368284630015e-07, "loss": 0.0161, "step": 7914 }, { "epoch": 3.515434155007773, "grad_norm": 0.3911190961806719, "learning_rate": 4.3907896571848187e-07, "loss": 0.0295, "step": 7915 }, { "epoch": 3.51587830335332, "grad_norm": 0.49151230231883697, "learning_rate": 4.382849341745715e-07, "loss": 0.0323, "step": 7916 }, { "epoch": 3.5163224516988674, "grad_norm": 0.3734463542020498, "learning_rate": 4.3749158833392535e-07, "loss": 0.0233, "step": 7917 }, { "epoch": 3.5167666000444147, "grad_norm": 0.4234162479441925, "learning_rate": 4.366989283157985e-07, "loss": 0.0221, "step": 7918 }, { "epoch": 3.5172107483899624, "grad_norm": 0.36581058726451665, "learning_rate": 4.3590695423933795e-07, "loss": 0.0232, "step": 7919 }, { "epoch": 3.5176548967355097, "grad_norm": 0.41877616966330095, "learning_rate": 4.3511566622359224e-07, "loss": 0.032, "step": 7920 }, { "epoch": 3.518099045081057, "grad_norm": 0.381478457217342, "learning_rate": 4.3432506438750745e-07, "loss": 0.0198, "step": 7921 }, { "epoch": 3.5185431934266047, "grad_norm": 0.5312067893718723, "learning_rate": 4.335351488499218e-07, "loss": 0.0341, "step": 7922 }, { "epoch": 3.518987341772152, "grad_norm": 0.32613641832900486, "learning_rate": 4.327459197295736e-07, "loss": 0.0153, "step": 7923 }, { "epoch": 3.5194314901176993, "grad_norm": 0.4186436124064849, "learning_rate": 4.319573771450991e-07, "loss": 0.0239, "step": 7924 }, { "epoch": 3.5198756384632466, "grad_norm": 0.5213492953580083, "learning_rate": 4.3116952121502686e-07, "loss": 0.0245, "step": 7925 }, { "epoch": 3.520319786808794, "grad_norm": 0.4470067109711501, "learning_rate": 4.303823520577871e-07, "loss": 0.029, "step": 7926 }, { "epoch": 3.5207639351543416, "grad_norm": 0.44926277605141984, "learning_rate": 4.295958697917035e-07, "loss": 0.0251, "step": 7927 }, { "epoch": 3.521208083499889, "grad_norm": 0.4428984780991236, "learning_rate": 4.288100745349988e-07, "loss": 0.0283, "step": 7928 }, { "epoch": 3.5216522318454366, "grad_norm": 0.4649455292363394, "learning_rate": 4.2802496640579115e-07, "loss": 0.0261, "step": 7929 }, { "epoch": 3.522096380190984, "grad_norm": 0.4546600325025468, "learning_rate": 4.2724054552209515e-07, "loss": 0.0272, "step": 7930 }, { "epoch": 3.522540528536531, "grad_norm": 0.44460414877352883, "learning_rate": 4.2645681200182197e-07, "loss": 0.02, "step": 7931 }, { "epoch": 3.5229846768820785, "grad_norm": 0.3712603269325949, "learning_rate": 4.256737659627813e-07, "loss": 0.0252, "step": 7932 }, { "epoch": 3.523428825227626, "grad_norm": 0.4419446907695471, "learning_rate": 4.248914075226779e-07, "loss": 0.0296, "step": 7933 }, { "epoch": 3.5238729735731735, "grad_norm": 0.4371622849898781, "learning_rate": 4.2410973679911317e-07, "loss": 0.0193, "step": 7934 }, { "epoch": 3.524317121918721, "grad_norm": 0.42263243970502656, "learning_rate": 4.2332875390958707e-07, "loss": 0.0284, "step": 7935 }, { "epoch": 3.524761270264268, "grad_norm": 0.4360454559382049, "learning_rate": 4.225484589714918e-07, "loss": 0.0267, "step": 7936 }, { "epoch": 3.525205418609816, "grad_norm": 0.4221730269522906, "learning_rate": 4.2176885210212127e-07, "loss": 0.0283, "step": 7937 }, { "epoch": 3.525649566955363, "grad_norm": 0.34554558425959747, "learning_rate": 4.209899334186623e-07, "loss": 0.015, "step": 7938 }, { "epoch": 3.5260937153009104, "grad_norm": 0.3944057611136553, "learning_rate": 4.2021170303820025e-07, "loss": 0.0247, "step": 7939 }, { "epoch": 3.5265378636464577, "grad_norm": 0.3950267732155638, "learning_rate": 4.1943416107771585e-07, "loss": 0.0237, "step": 7940 }, { "epoch": 3.5269820119920055, "grad_norm": 0.5276030208132706, "learning_rate": 4.186573076540884e-07, "loss": 0.0293, "step": 7941 }, { "epoch": 3.5274261603375527, "grad_norm": 0.42130456420073215, "learning_rate": 4.178811428840901e-07, "loss": 0.0221, "step": 7942 }, { "epoch": 3.5278703086831, "grad_norm": 0.4967582028112826, "learning_rate": 4.1710566688439314e-07, "loss": 0.0264, "step": 7943 }, { "epoch": 3.5283144570286478, "grad_norm": 0.38558337467937864, "learning_rate": 4.163308797715637e-07, "loss": 0.0276, "step": 7944 }, { "epoch": 3.528758605374195, "grad_norm": 0.5044853945425457, "learning_rate": 4.155567816620659e-07, "loss": 0.0204, "step": 7945 }, { "epoch": 3.5292027537197423, "grad_norm": 0.8869805733234495, "learning_rate": 4.147833726722611e-07, "loss": 0.0291, "step": 7946 }, { "epoch": 3.5296469020652896, "grad_norm": 0.3536249761317266, "learning_rate": 4.140106529184035e-07, "loss": 0.0235, "step": 7947 }, { "epoch": 3.5300910504108374, "grad_norm": 0.4154592460581955, "learning_rate": 4.1323862251664684e-07, "loss": 0.0233, "step": 7948 }, { "epoch": 3.5305351987563847, "grad_norm": 0.3504099993535625, "learning_rate": 4.1246728158304107e-07, "loss": 0.0187, "step": 7949 }, { "epoch": 3.530979347101932, "grad_norm": 0.4437614260756121, "learning_rate": 4.1169663023353124e-07, "loss": 0.0262, "step": 7950 }, { "epoch": 3.5314234954474797, "grad_norm": 0.3600199231026137, "learning_rate": 4.109266685839597e-07, "loss": 0.02, "step": 7951 }, { "epoch": 3.531867643793027, "grad_norm": 0.3877433229131516, "learning_rate": 4.101573967500655e-07, "loss": 0.0236, "step": 7952 }, { "epoch": 3.5323117921385743, "grad_norm": 0.39474586031469927, "learning_rate": 4.0938881484748116e-07, "loss": 0.0261, "step": 7953 }, { "epoch": 3.5327559404841216, "grad_norm": 0.4148159595077121, "learning_rate": 4.086209229917387e-07, "loss": 0.0255, "step": 7954 }, { "epoch": 3.533200088829669, "grad_norm": 0.47070012710839254, "learning_rate": 4.0785372129826586e-07, "loss": 0.0318, "step": 7955 }, { "epoch": 3.5336442371752166, "grad_norm": 0.3680844624737353, "learning_rate": 4.0708720988238584e-07, "loss": 0.0166, "step": 7956 }, { "epoch": 3.534088385520764, "grad_norm": 0.3875786535924789, "learning_rate": 4.063213888593176e-07, "loss": 0.0256, "step": 7957 }, { "epoch": 3.5345325338663116, "grad_norm": 0.39292909474931037, "learning_rate": 4.0555625834417857e-07, "loss": 0.0244, "step": 7958 }, { "epoch": 3.534976682211859, "grad_norm": 0.4238019487678821, "learning_rate": 4.047918184519789e-07, "loss": 0.0265, "step": 7959 }, { "epoch": 3.535420830557406, "grad_norm": 0.3868503914395472, "learning_rate": 4.040280692976278e-07, "loss": 0.025, "step": 7960 }, { "epoch": 3.5358649789029535, "grad_norm": 0.5061016125162293, "learning_rate": 4.032650109959302e-07, "loss": 0.0305, "step": 7961 }, { "epoch": 3.5363091272485008, "grad_norm": 0.44915195593836177, "learning_rate": 4.0250264366158643e-07, "loss": 0.0227, "step": 7962 }, { "epoch": 3.5367532755940485, "grad_norm": 0.3701090299363028, "learning_rate": 4.017409674091932e-07, "loss": 0.0275, "step": 7963 }, { "epoch": 3.537197423939596, "grad_norm": 0.33071866294982566, "learning_rate": 4.009799823532434e-07, "loss": 0.0166, "step": 7964 }, { "epoch": 3.537641572285143, "grad_norm": 0.3869700046276894, "learning_rate": 4.0021968860812556e-07, "loss": 0.025, "step": 7965 }, { "epoch": 3.538085720630691, "grad_norm": 0.34946266928394254, "learning_rate": 3.994600862881248e-07, "loss": 0.0205, "step": 7966 }, { "epoch": 3.538529868976238, "grad_norm": 0.391173218273872, "learning_rate": 3.9870117550742273e-07, "loss": 0.0263, "step": 7967 }, { "epoch": 3.5389740173217854, "grad_norm": 0.3665430028174577, "learning_rate": 3.9794295638009683e-07, "loss": 0.022, "step": 7968 }, { "epoch": 3.5394181656673327, "grad_norm": 0.3931083199354571, "learning_rate": 3.971854290201205e-07, "loss": 0.0233, "step": 7969 }, { "epoch": 3.5398623140128804, "grad_norm": 0.4886210610765913, "learning_rate": 3.964285935413609e-07, "loss": 0.0213, "step": 7970 }, { "epoch": 3.5403064623584277, "grad_norm": 0.3576029073891739, "learning_rate": 3.9567245005758537e-07, "loss": 0.023, "step": 7971 }, { "epoch": 3.540750610703975, "grad_norm": 0.4994177352645637, "learning_rate": 3.9491699868245414e-07, "loss": 0.0419, "step": 7972 }, { "epoch": 3.5411947590495227, "grad_norm": 0.43622296138249816, "learning_rate": 3.941622395295247e-07, "loss": 0.0318, "step": 7973 }, { "epoch": 3.54163890739507, "grad_norm": 0.4309663417901952, "learning_rate": 3.934081727122513e-07, "loss": 0.0342, "step": 7974 }, { "epoch": 3.5420830557406173, "grad_norm": 0.4349122435505889, "learning_rate": 3.9265479834398103e-07, "loss": 0.0286, "step": 7975 }, { "epoch": 3.5425272040861646, "grad_norm": 0.4420907520326438, "learning_rate": 3.919021165379594e-07, "loss": 0.0192, "step": 7976 }, { "epoch": 3.5429713524317124, "grad_norm": 0.40134602793452784, "learning_rate": 3.911501274073276e-07, "loss": 0.0222, "step": 7977 }, { "epoch": 3.5434155007772596, "grad_norm": 0.4162128270891882, "learning_rate": 3.9039883106512243e-07, "loss": 0.0263, "step": 7978 }, { "epoch": 3.543859649122807, "grad_norm": 0.43122424865043063, "learning_rate": 3.8964822762427633e-07, "loss": 0.0251, "step": 7979 }, { "epoch": 3.5443037974683547, "grad_norm": 0.351905898436228, "learning_rate": 3.888983171976185e-07, "loss": 0.0225, "step": 7980 }, { "epoch": 3.544747945813902, "grad_norm": 0.37615773316072754, "learning_rate": 3.8814909989787155e-07, "loss": 0.02, "step": 7981 }, { "epoch": 3.5451920941594492, "grad_norm": 0.42395585895532933, "learning_rate": 3.87400575837657e-07, "loss": 0.0202, "step": 7982 }, { "epoch": 3.5456362425049965, "grad_norm": 0.3990970301083849, "learning_rate": 3.8665274512948994e-07, "loss": 0.0201, "step": 7983 }, { "epoch": 3.546080390850544, "grad_norm": 0.5764101771289384, "learning_rate": 3.859056078857826e-07, "loss": 0.0381, "step": 7984 }, { "epoch": 3.5465245391960916, "grad_norm": 0.3820488932414435, "learning_rate": 3.851591642188418e-07, "loss": 0.0242, "step": 7985 }, { "epoch": 3.546968687541639, "grad_norm": 0.3459706819898099, "learning_rate": 3.8441341424087233e-07, "loss": 0.0238, "step": 7986 }, { "epoch": 3.5474128358871866, "grad_norm": 0.4227398045664523, "learning_rate": 3.836683580639705e-07, "loss": 0.0244, "step": 7987 }, { "epoch": 3.547856984232734, "grad_norm": 0.4309060643690936, "learning_rate": 3.829239958001324e-07, "loss": 0.0194, "step": 7988 }, { "epoch": 3.548301132578281, "grad_norm": 0.42116836328880336, "learning_rate": 3.8218032756124844e-07, "loss": 0.0204, "step": 7989 }, { "epoch": 3.5487452809238285, "grad_norm": 0.37821657931686165, "learning_rate": 3.814373534591037e-07, "loss": 0.0194, "step": 7990 }, { "epoch": 3.5491894292693758, "grad_norm": 0.42284081107970184, "learning_rate": 3.8069507360538163e-07, "loss": 0.0221, "step": 7991 }, { "epoch": 3.5496335776149235, "grad_norm": 0.4662610237314549, "learning_rate": 3.799534881116573e-07, "loss": 0.0276, "step": 7992 }, { "epoch": 3.5500777259604708, "grad_norm": 0.45003555474688933, "learning_rate": 3.7921259708940503e-07, "loss": 0.0274, "step": 7993 }, { "epoch": 3.550521874306018, "grad_norm": 0.5571173920439147, "learning_rate": 3.7847240064999233e-07, "loss": 0.0294, "step": 7994 }, { "epoch": 3.550966022651566, "grad_norm": 0.4728510369440864, "learning_rate": 3.7773289890468414e-07, "loss": 0.026, "step": 7995 }, { "epoch": 3.551410170997113, "grad_norm": 0.3553733406102519, "learning_rate": 3.7699409196463977e-07, "loss": 0.0175, "step": 7996 }, { "epoch": 3.5518543193426604, "grad_norm": 0.4021957600650995, "learning_rate": 3.762559799409149e-07, "loss": 0.0268, "step": 7997 }, { "epoch": 3.5522984676882077, "grad_norm": 0.5246882857506893, "learning_rate": 3.7551856294445967e-07, "loss": 0.0243, "step": 7998 }, { "epoch": 3.5527426160337554, "grad_norm": 0.4805020243101749, "learning_rate": 3.7478184108612036e-07, "loss": 0.0185, "step": 7999 }, { "epoch": 3.5531867643793027, "grad_norm": 0.49276293248029474, "learning_rate": 3.74045814476639e-07, "loss": 0.0322, "step": 8000 }, { "epoch": 3.55363091272485, "grad_norm": 0.5382419991792012, "learning_rate": 3.733104832266532e-07, "loss": 0.0273, "step": 8001 }, { "epoch": 3.5540750610703977, "grad_norm": 0.4236743982252115, "learning_rate": 3.7257584744669615e-07, "loss": 0.024, "step": 8002 }, { "epoch": 3.554519209415945, "grad_norm": 0.3569656738216824, "learning_rate": 3.718419072471946e-07, "loss": 0.0206, "step": 8003 }, { "epoch": 3.5549633577614923, "grad_norm": 0.6957355605647527, "learning_rate": 3.7110866273847356e-07, "loss": 0.0286, "step": 8004 }, { "epoch": 3.5554075061070396, "grad_norm": 0.39146983502282784, "learning_rate": 3.70376114030751e-07, "loss": 0.0211, "step": 8005 }, { "epoch": 3.5558516544525873, "grad_norm": 0.4183758252296795, "learning_rate": 3.696442612341422e-07, "loss": 0.023, "step": 8006 }, { "epoch": 3.5562958027981346, "grad_norm": 0.4440464207564376, "learning_rate": 3.6891310445865693e-07, "loss": 0.0284, "step": 8007 }, { "epoch": 3.556739951143682, "grad_norm": 0.3721283097773306, "learning_rate": 3.681826438142011e-07, "loss": 0.0228, "step": 8008 }, { "epoch": 3.5571840994892296, "grad_norm": 0.4139366078303485, "learning_rate": 3.6745287941057417e-07, "loss": 0.0229, "step": 8009 }, { "epoch": 3.557628247834777, "grad_norm": 0.5055064614278328, "learning_rate": 3.6672381135747284e-07, "loss": 0.0278, "step": 8010 }, { "epoch": 3.5580723961803242, "grad_norm": 0.44529281390449127, "learning_rate": 3.6599543976448884e-07, "loss": 0.0261, "step": 8011 }, { "epoch": 3.5585165445258715, "grad_norm": 0.4907105184886079, "learning_rate": 3.6526776474110627e-07, "loss": 0.0244, "step": 8012 }, { "epoch": 3.558960692871419, "grad_norm": 0.4951645075676849, "learning_rate": 3.645407863967104e-07, "loss": 0.037, "step": 8013 }, { "epoch": 3.5594048412169665, "grad_norm": 0.37863500443452736, "learning_rate": 3.6381450484057777e-07, "loss": 0.0252, "step": 8014 }, { "epoch": 3.559848989562514, "grad_norm": 0.39735984529923846, "learning_rate": 3.630889201818788e-07, "loss": 0.0206, "step": 8015 }, { "epoch": 3.5602931379080616, "grad_norm": 0.4346987177337589, "learning_rate": 3.623640325296829e-07, "loss": 0.0257, "step": 8016 }, { "epoch": 3.560737286253609, "grad_norm": 0.33730436829577654, "learning_rate": 3.616398419929523e-07, "loss": 0.0189, "step": 8017 }, { "epoch": 3.561181434599156, "grad_norm": 0.3665779162327361, "learning_rate": 3.6091634868054557e-07, "loss": 0.0189, "step": 8018 }, { "epoch": 3.5616255829447034, "grad_norm": 0.3886589674388881, "learning_rate": 3.601935527012168e-07, "loss": 0.0225, "step": 8019 }, { "epoch": 3.5620697312902507, "grad_norm": 0.46005381668223383, "learning_rate": 3.594714541636124e-07, "loss": 0.0251, "step": 8020 }, { "epoch": 3.5625138796357985, "grad_norm": 0.437136127998239, "learning_rate": 3.5875005317627776e-07, "loss": 0.0336, "step": 8021 }, { "epoch": 3.5629580279813458, "grad_norm": 0.41144565522854637, "learning_rate": 3.580293498476517e-07, "loss": 0.0321, "step": 8022 }, { "epoch": 3.563402176326893, "grad_norm": 0.4402714548710384, "learning_rate": 3.573093442860659e-07, "loss": 0.0218, "step": 8023 }, { "epoch": 3.563846324672441, "grad_norm": 0.46313910275690856, "learning_rate": 3.565900365997521e-07, "loss": 0.038, "step": 8024 }, { "epoch": 3.564290473017988, "grad_norm": 0.4585923557925007, "learning_rate": 3.558714268968344e-07, "loss": 0.0242, "step": 8025 }, { "epoch": 3.5647346213635354, "grad_norm": 0.40511430211397, "learning_rate": 3.5515351528533024e-07, "loss": 0.0217, "step": 8026 }, { "epoch": 3.5651787697090827, "grad_norm": 0.4897536707816401, "learning_rate": 3.5443630187315504e-07, "loss": 0.0293, "step": 8027 }, { "epoch": 3.5656229180546304, "grad_norm": 0.3947409838204125, "learning_rate": 3.537197867681191e-07, "loss": 0.0231, "step": 8028 }, { "epoch": 3.5660670664001777, "grad_norm": 0.42936961733669526, "learning_rate": 3.5300397007792364e-07, "loss": 0.0266, "step": 8029 }, { "epoch": 3.566511214745725, "grad_norm": 0.3453771987941547, "learning_rate": 3.5228885191017084e-07, "loss": 0.0181, "step": 8030 }, { "epoch": 3.5669553630912727, "grad_norm": 0.38903220671574995, "learning_rate": 3.515744323723558e-07, "loss": 0.0254, "step": 8031 }, { "epoch": 3.56739951143682, "grad_norm": 0.39781787732509344, "learning_rate": 3.508607115718654e-07, "loss": 0.0181, "step": 8032 }, { "epoch": 3.5678436597823673, "grad_norm": 0.42441099532371085, "learning_rate": 3.50147689615985e-07, "loss": 0.0222, "step": 8033 }, { "epoch": 3.5682878081279146, "grad_norm": 0.38842537901922425, "learning_rate": 3.494353666118938e-07, "loss": 0.0378, "step": 8034 }, { "epoch": 3.568731956473462, "grad_norm": 0.35803044170961684, "learning_rate": 3.4872374266666674e-07, "loss": 0.0241, "step": 8035 }, { "epoch": 3.5691761048190096, "grad_norm": 0.41680398082335124, "learning_rate": 3.4801281788727326e-07, "loss": 0.0226, "step": 8036 }, { "epoch": 3.569620253164557, "grad_norm": 0.4978684696151685, "learning_rate": 3.4730259238057563e-07, "loss": 0.0265, "step": 8037 }, { "epoch": 3.5700644015101046, "grad_norm": 0.526050521323082, "learning_rate": 3.46593066253334e-07, "loss": 0.0241, "step": 8038 }, { "epoch": 3.570508549855652, "grad_norm": 0.4100317457053481, "learning_rate": 3.4588423961220306e-07, "loss": 0.0244, "step": 8039 }, { "epoch": 3.570952698201199, "grad_norm": 0.4007810456784438, "learning_rate": 3.4517611256372875e-07, "loss": 0.0207, "step": 8040 }, { "epoch": 3.5713968465467465, "grad_norm": 0.44507883967406275, "learning_rate": 3.444686852143575e-07, "loss": 0.0203, "step": 8041 }, { "epoch": 3.571840994892294, "grad_norm": 0.4110302975726491, "learning_rate": 3.4376195767042706e-07, "loss": 0.0196, "step": 8042 }, { "epoch": 3.5722851432378415, "grad_norm": 0.42428375307760136, "learning_rate": 3.4305593003816917e-07, "loss": 0.0239, "step": 8043 }, { "epoch": 3.572729291583389, "grad_norm": 0.4068665526914318, "learning_rate": 3.423506024237122e-07, "loss": 0.0251, "step": 8044 }, { "epoch": 3.573173439928936, "grad_norm": 0.5621905349071596, "learning_rate": 3.416459749330808e-07, "loss": 0.0436, "step": 8045 }, { "epoch": 3.573617588274484, "grad_norm": 0.4013053666322752, "learning_rate": 3.409420476721892e-07, "loss": 0.0233, "step": 8046 }, { "epoch": 3.574061736620031, "grad_norm": 0.40409509056363074, "learning_rate": 3.4023882074685266e-07, "loss": 0.0196, "step": 8047 }, { "epoch": 3.5745058849655784, "grad_norm": 0.401583653285129, "learning_rate": 3.3953629426277666e-07, "loss": 0.0254, "step": 8048 }, { "epoch": 3.5749500333111257, "grad_norm": 0.39247780188844755, "learning_rate": 3.3883446832556286e-07, "loss": 0.0226, "step": 8049 }, { "epoch": 3.5753941816566734, "grad_norm": 0.4595707850818669, "learning_rate": 3.381333430407074e-07, "loss": 0.0331, "step": 8050 }, { "epoch": 3.5758383300022207, "grad_norm": 0.38262273059202667, "learning_rate": 3.3743291851360215e-07, "loss": 0.032, "step": 8051 }, { "epoch": 3.576282478347768, "grad_norm": 0.5184355648640612, "learning_rate": 3.3673319484953224e-07, "loss": 0.0535, "step": 8052 }, { "epoch": 3.5767266266933158, "grad_norm": 0.4837920320850102, "learning_rate": 3.3603417215367916e-07, "loss": 0.028, "step": 8053 }, { "epoch": 3.577170775038863, "grad_norm": 0.4493839583313447, "learning_rate": 3.3533585053111604e-07, "loss": 0.0239, "step": 8054 }, { "epoch": 3.5776149233844103, "grad_norm": 0.45676379041549287, "learning_rate": 3.346382300868134e-07, "loss": 0.0183, "step": 8055 }, { "epoch": 3.5780590717299576, "grad_norm": 0.42408641826733806, "learning_rate": 3.339413109256362e-07, "loss": 0.019, "step": 8056 }, { "epoch": 3.5785032200755054, "grad_norm": 0.41032911192121174, "learning_rate": 3.3324509315234066e-07, "loss": 0.0243, "step": 8057 }, { "epoch": 3.5789473684210527, "grad_norm": 0.3781126839349984, "learning_rate": 3.325495768715831e-07, "loss": 0.023, "step": 8058 }, { "epoch": 3.5793915167666, "grad_norm": 0.48422608200816336, "learning_rate": 3.318547621879109e-07, "loss": 0.0209, "step": 8059 }, { "epoch": 3.5798356651121477, "grad_norm": 0.5803863155007879, "learning_rate": 3.311606492057651e-07, "loss": 0.03, "step": 8060 }, { "epoch": 3.580279813457695, "grad_norm": 0.4295397381772821, "learning_rate": 3.304672380294832e-07, "loss": 0.0276, "step": 8061 }, { "epoch": 3.5807239618032423, "grad_norm": 0.4359738260640305, "learning_rate": 3.2977452876329806e-07, "loss": 0.0226, "step": 8062 }, { "epoch": 3.5811681101487896, "grad_norm": 0.6578716400312555, "learning_rate": 3.290825215113325e-07, "loss": 0.0317, "step": 8063 }, { "epoch": 3.581612258494337, "grad_norm": 0.31354942464141744, "learning_rate": 3.2839121637761095e-07, "loss": 0.021, "step": 8064 }, { "epoch": 3.5820564068398846, "grad_norm": 0.3962803838449033, "learning_rate": 3.277006134660454e-07, "loss": 0.0279, "step": 8065 }, { "epoch": 3.582500555185432, "grad_norm": 0.4214624266956511, "learning_rate": 3.270107128804462e-07, "loss": 0.0344, "step": 8066 }, { "epoch": 3.5829447035309796, "grad_norm": 0.3602571631127595, "learning_rate": 3.26321514724518e-07, "loss": 0.0228, "step": 8067 }, { "epoch": 3.583388851876527, "grad_norm": 0.37827106337184596, "learning_rate": 3.2563301910185585e-07, "loss": 0.0229, "step": 8068 }, { "epoch": 3.583833000222074, "grad_norm": 0.3969145699088532, "learning_rate": 3.249452261159558e-07, "loss": 0.0216, "step": 8069 }, { "epoch": 3.5842771485676215, "grad_norm": 0.3539428309291324, "learning_rate": 3.242581358702046e-07, "loss": 0.0219, "step": 8070 }, { "epoch": 3.5847212969131688, "grad_norm": 0.46162564562225794, "learning_rate": 3.235717484678813e-07, "loss": 0.0219, "step": 8071 }, { "epoch": 3.5851654452587165, "grad_norm": 0.395257543151881, "learning_rate": 3.2288606401216283e-07, "loss": 0.0283, "step": 8072 }, { "epoch": 3.585609593604264, "grad_norm": 0.3543800684377955, "learning_rate": 3.2220108260612e-07, "loss": 0.022, "step": 8073 }, { "epoch": 3.586053741949811, "grad_norm": 0.3956501467663245, "learning_rate": 3.2151680435271504e-07, "loss": 0.0227, "step": 8074 }, { "epoch": 3.586497890295359, "grad_norm": 0.6908876097601867, "learning_rate": 3.208332293548094e-07, "loss": 0.0402, "step": 8075 }, { "epoch": 3.586942038640906, "grad_norm": 0.41210563747436485, "learning_rate": 3.2015035771515377e-07, "loss": 0.0207, "step": 8076 }, { "epoch": 3.5873861869864534, "grad_norm": 0.4210761069284605, "learning_rate": 3.1946818953639604e-07, "loss": 0.0199, "step": 8077 }, { "epoch": 3.5878303353320007, "grad_norm": 0.562680504099952, "learning_rate": 3.1878672492107796e-07, "loss": 0.0333, "step": 8078 }, { "epoch": 3.5882744836775484, "grad_norm": 0.45181012070367105, "learning_rate": 3.181059639716355e-07, "loss": 0.0292, "step": 8079 }, { "epoch": 3.5887186320230957, "grad_norm": 0.3560150149469517, "learning_rate": 3.1742590679039675e-07, "loss": 0.0263, "step": 8080 }, { "epoch": 3.589162780368643, "grad_norm": 0.2727495861254254, "learning_rate": 3.167465534795888e-07, "loss": 0.0142, "step": 8081 }, { "epoch": 3.5896069287141907, "grad_norm": 0.3723555834846713, "learning_rate": 3.1606790414132784e-07, "loss": 0.0204, "step": 8082 }, { "epoch": 3.590051077059738, "grad_norm": 0.3414027162319298, "learning_rate": 3.153899588776266e-07, "loss": 0.0229, "step": 8083 }, { "epoch": 3.5904952254052853, "grad_norm": 0.42338168106939433, "learning_rate": 3.147127177903936e-07, "loss": 0.0232, "step": 8084 }, { "epoch": 3.5909393737508326, "grad_norm": 0.41145971331036213, "learning_rate": 3.1403618098142683e-07, "loss": 0.0196, "step": 8085 }, { "epoch": 3.5913835220963803, "grad_norm": 0.4400619534919001, "learning_rate": 3.133603485524217e-07, "loss": 0.0305, "step": 8086 }, { "epoch": 3.5918276704419276, "grad_norm": 0.3445253112185815, "learning_rate": 3.126852206049702e-07, "loss": 0.0184, "step": 8087 }, { "epoch": 3.592271818787475, "grad_norm": 0.4780681637104892, "learning_rate": 3.1201079724055284e-07, "loss": 0.0217, "step": 8088 }, { "epoch": 3.5927159671330227, "grad_norm": 0.3835860710733381, "learning_rate": 3.113370785605474e-07, "loss": 0.0177, "step": 8089 }, { "epoch": 3.59316011547857, "grad_norm": 0.36022249138379353, "learning_rate": 3.106640646662268e-07, "loss": 0.0229, "step": 8090 }, { "epoch": 3.5936042638241172, "grad_norm": 0.3513194805512465, "learning_rate": 3.099917556587534e-07, "loss": 0.0183, "step": 8091 }, { "epoch": 3.5940484121696645, "grad_norm": 0.44275629565228397, "learning_rate": 3.0932015163918973e-07, "loss": 0.0295, "step": 8092 }, { "epoch": 3.594492560515212, "grad_norm": 0.4202067622758764, "learning_rate": 3.0864925270848725e-07, "loss": 0.022, "step": 8093 }, { "epoch": 3.5949367088607596, "grad_norm": 0.35964770608777313, "learning_rate": 3.079790589674947e-07, "loss": 0.019, "step": 8094 }, { "epoch": 3.595380857206307, "grad_norm": 0.33143293773682697, "learning_rate": 3.073095705169532e-07, "loss": 0.0174, "step": 8095 }, { "epoch": 3.5958250055518546, "grad_norm": 0.35370082577225564, "learning_rate": 3.066407874574978e-07, "loss": 0.0245, "step": 8096 }, { "epoch": 3.596269153897402, "grad_norm": 0.46712833979581136, "learning_rate": 3.05972709889657e-07, "loss": 0.0248, "step": 8097 }, { "epoch": 3.596713302242949, "grad_norm": 0.38025100976241183, "learning_rate": 3.0530533791385765e-07, "loss": 0.0244, "step": 8098 }, { "epoch": 3.5971574505884965, "grad_norm": 0.342220120184616, "learning_rate": 3.0463867163041396e-07, "loss": 0.0155, "step": 8099 }, { "epoch": 3.5976015989340437, "grad_norm": 0.36933627653900136, "learning_rate": 3.0397271113953796e-07, "loss": 0.0209, "step": 8100 }, { "epoch": 3.5980457472795915, "grad_norm": 0.3882041575523787, "learning_rate": 3.0330745654133576e-07, "loss": 0.0289, "step": 8101 }, { "epoch": 3.5984898956251388, "grad_norm": 0.39071440323579304, "learning_rate": 3.026429079358051e-07, "loss": 0.0225, "step": 8102 }, { "epoch": 3.598934043970686, "grad_norm": 0.5224321778667499, "learning_rate": 3.0197906542283996e-07, "loss": 0.0366, "step": 8103 }, { "epoch": 3.599378192316234, "grad_norm": 0.463268142143758, "learning_rate": 3.013159291022261e-07, "loss": 0.0254, "step": 8104 }, { "epoch": 3.599822340661781, "grad_norm": 0.4758792312691063, "learning_rate": 3.006534990736448e-07, "loss": 0.0177, "step": 8105 }, { "epoch": 3.6002664890073284, "grad_norm": 0.3801891704764608, "learning_rate": 2.99991775436671e-07, "loss": 0.0285, "step": 8106 }, { "epoch": 3.6007106373528757, "grad_norm": 0.47832635752425606, "learning_rate": 2.993307582907728e-07, "loss": 0.0393, "step": 8107 }, { "epoch": 3.6011547856984234, "grad_norm": 0.4174887456808177, "learning_rate": 2.9867044773531083e-07, "loss": 0.0304, "step": 8108 }, { "epoch": 3.6015989340439707, "grad_norm": 0.4160783686882648, "learning_rate": 2.9801084386954337e-07, "loss": 0.0224, "step": 8109 }, { "epoch": 3.602043082389518, "grad_norm": 0.3728837062303793, "learning_rate": 2.9735194679261835e-07, "loss": 0.0212, "step": 8110 }, { "epoch": 3.6024872307350657, "grad_norm": 0.7022594125847725, "learning_rate": 2.966937566035799e-07, "loss": 0.0301, "step": 8111 }, { "epoch": 3.602931379080613, "grad_norm": 0.4052906698509884, "learning_rate": 2.9603627340136553e-07, "loss": 0.0293, "step": 8112 }, { "epoch": 3.6033755274261603, "grad_norm": 0.3921299918104816, "learning_rate": 2.953794972848051e-07, "loss": 0.025, "step": 8113 }, { "epoch": 3.6038196757717076, "grad_norm": 0.406918068376974, "learning_rate": 2.947234283526229e-07, "loss": 0.0278, "step": 8114 }, { "epoch": 3.6042638241172553, "grad_norm": 0.4175532634627423, "learning_rate": 2.940680667034396e-07, "loss": 0.0318, "step": 8115 }, { "epoch": 3.6047079724628026, "grad_norm": 0.43540375334253767, "learning_rate": 2.934134124357646e-07, "loss": 0.0268, "step": 8116 }, { "epoch": 3.60515212080835, "grad_norm": 0.4718399188563787, "learning_rate": 2.927594656480054e-07, "loss": 0.0319, "step": 8117 }, { "epoch": 3.6055962691538976, "grad_norm": 0.4746986163908478, "learning_rate": 2.921062264384605e-07, "loss": 0.0239, "step": 8118 }, { "epoch": 3.606040417499445, "grad_norm": 0.43178883447784155, "learning_rate": 2.914536949053226e-07, "loss": 0.0252, "step": 8119 }, { "epoch": 3.606484565844992, "grad_norm": 0.44193392412448756, "learning_rate": 2.908018711466787e-07, "loss": 0.0293, "step": 8120 }, { "epoch": 3.6069287141905395, "grad_norm": 0.361976350777308, "learning_rate": 2.901507552605087e-07, "loss": 0.0155, "step": 8121 }, { "epoch": 3.607372862536087, "grad_norm": 0.4277695581236822, "learning_rate": 2.895003473446861e-07, "loss": 0.0286, "step": 8122 }, { "epoch": 3.6078170108816345, "grad_norm": 0.4459278029555017, "learning_rate": 2.8885064749697987e-07, "loss": 0.028, "step": 8123 }, { "epoch": 3.608261159227182, "grad_norm": 0.566646999620044, "learning_rate": 2.882016558150491e-07, "loss": 0.0364, "step": 8124 }, { "epoch": 3.6087053075727296, "grad_norm": 0.4313671970565694, "learning_rate": 2.87553372396448e-07, "loss": 0.031, "step": 8125 }, { "epoch": 3.609149455918277, "grad_norm": 0.40292145142350155, "learning_rate": 2.869057973386269e-07, "loss": 0.0272, "step": 8126 }, { "epoch": 3.609593604263824, "grad_norm": 0.41179464266273536, "learning_rate": 2.8625893073892577e-07, "loss": 0.0208, "step": 8127 }, { "epoch": 3.6100377526093714, "grad_norm": 0.4344939010412826, "learning_rate": 2.85612772694579e-07, "loss": 0.0246, "step": 8128 }, { "epoch": 3.6104819009549187, "grad_norm": 0.4503615177393324, "learning_rate": 2.8496732330271726e-07, "loss": 0.0318, "step": 8129 }, { "epoch": 3.6109260493004665, "grad_norm": 0.49743560082851446, "learning_rate": 2.8432258266036016e-07, "loss": 0.0242, "step": 8130 }, { "epoch": 3.6113701976460137, "grad_norm": 0.418557316697792, "learning_rate": 2.8367855086442353e-07, "loss": 0.0279, "step": 8131 }, { "epoch": 3.611814345991561, "grad_norm": 0.3450331986237824, "learning_rate": 2.830352280117188e-07, "loss": 0.022, "step": 8132 }, { "epoch": 3.6122584943371088, "grad_norm": 0.44492026164748794, "learning_rate": 2.8239261419894526e-07, "loss": 0.0323, "step": 8133 }, { "epoch": 3.612702642682656, "grad_norm": 0.39841298833512895, "learning_rate": 2.8175070952270014e-07, "loss": 0.0285, "step": 8134 }, { "epoch": 3.6131467910282034, "grad_norm": 0.43301418955026805, "learning_rate": 2.811095140794734e-07, "loss": 0.0274, "step": 8135 }, { "epoch": 3.6135909393737506, "grad_norm": 0.5096476202086357, "learning_rate": 2.804690279656458e-07, "loss": 0.051, "step": 8136 }, { "epoch": 3.6140350877192984, "grad_norm": 0.3293367472926153, "learning_rate": 2.7982925127749416e-07, "loss": 0.0227, "step": 8137 }, { "epoch": 3.6144792360648457, "grad_norm": 0.37776627801259316, "learning_rate": 2.791901841111877e-07, "loss": 0.0254, "step": 8138 }, { "epoch": 3.614923384410393, "grad_norm": 0.40922593503138716, "learning_rate": 2.78551826562789e-07, "loss": 0.045, "step": 8139 }, { "epoch": 3.6153675327559407, "grad_norm": 0.42201659981374173, "learning_rate": 2.779141787282547e-07, "loss": 0.0251, "step": 8140 }, { "epoch": 3.615811681101488, "grad_norm": 0.4053495089948873, "learning_rate": 2.7727724070343296e-07, "loss": 0.023, "step": 8141 }, { "epoch": 3.6162558294470353, "grad_norm": 0.4556874573880963, "learning_rate": 2.7664101258406626e-07, "loss": 0.0227, "step": 8142 }, { "epoch": 3.6166999777925826, "grad_norm": 0.3450822468016936, "learning_rate": 2.7600549446579306e-07, "loss": 0.018, "step": 8143 }, { "epoch": 3.6171441261381303, "grad_norm": 0.35001380471776244, "learning_rate": 2.753706864441391e-07, "loss": 0.0232, "step": 8144 }, { "epoch": 3.6175882744836776, "grad_norm": 0.364015851517707, "learning_rate": 2.7473658861452923e-07, "loss": 0.0185, "step": 8145 }, { "epoch": 3.618032422829225, "grad_norm": 0.4713893596705623, "learning_rate": 2.741032010722788e-07, "loss": 0.0337, "step": 8146 }, { "epoch": 3.6184765711747726, "grad_norm": 0.3440041200074425, "learning_rate": 2.734705239125951e-07, "loss": 0.0197, "step": 8147 }, { "epoch": 3.61892071952032, "grad_norm": 0.41811195093373094, "learning_rate": 2.728385572305814e-07, "loss": 0.0278, "step": 8148 }, { "epoch": 3.619364867865867, "grad_norm": 0.339333589639759, "learning_rate": 2.7220730112123337e-07, "loss": 0.0187, "step": 8149 }, { "epoch": 3.6198090162114145, "grad_norm": 0.7462572452071613, "learning_rate": 2.715767556794391e-07, "loss": 0.0425, "step": 8150 }, { "epoch": 3.620253164556962, "grad_norm": 0.5298727144349012, "learning_rate": 2.7094692099997986e-07, "loss": 0.0308, "step": 8151 }, { "epoch": 3.6206973129025095, "grad_norm": 0.411102053477688, "learning_rate": 2.7031779717753223e-07, "loss": 0.0249, "step": 8152 }, { "epoch": 3.621141461248057, "grad_norm": 0.4354411151955604, "learning_rate": 2.696893843066617e-07, "loss": 0.0307, "step": 8153 }, { "epoch": 3.6215856095936045, "grad_norm": 0.3792115584655087, "learning_rate": 2.6906168248183095e-07, "loss": 0.0248, "step": 8154 }, { "epoch": 3.622029757939152, "grad_norm": 0.3457882220586487, "learning_rate": 2.68434691797394e-07, "loss": 0.0197, "step": 8155 }, { "epoch": 3.622473906284699, "grad_norm": 0.5361251704803487, "learning_rate": 2.6780841234759826e-07, "loss": 0.0382, "step": 8156 }, { "epoch": 3.6229180546302464, "grad_norm": 0.45121861235515354, "learning_rate": 2.6718284422658447e-07, "loss": 0.0226, "step": 8157 }, { "epoch": 3.6233622029757937, "grad_norm": 0.3726443210546779, "learning_rate": 2.665579875283847e-07, "loss": 0.0223, "step": 8158 }, { "epoch": 3.6238063513213414, "grad_norm": 0.39308466518304386, "learning_rate": 2.6593384234692597e-07, "loss": 0.0259, "step": 8159 }, { "epoch": 3.6242504996668887, "grad_norm": 0.3787552610454571, "learning_rate": 2.6531040877602997e-07, "loss": 0.0194, "step": 8160 }, { "epoch": 3.624694648012436, "grad_norm": 0.41262964342629177, "learning_rate": 2.646876869094073e-07, "loss": 0.0197, "step": 8161 }, { "epoch": 3.6251387963579838, "grad_norm": 0.34296275500584317, "learning_rate": 2.640656768406641e-07, "loss": 0.0215, "step": 8162 }, { "epoch": 3.625582944703531, "grad_norm": 0.37447434249834716, "learning_rate": 2.634443786632995e-07, "loss": 0.0251, "step": 8163 }, { "epoch": 3.6260270930490783, "grad_norm": 0.44671265535557425, "learning_rate": 2.628237924707044e-07, "loss": 0.0251, "step": 8164 }, { "epoch": 3.6264712413946256, "grad_norm": 0.39092522082367537, "learning_rate": 2.622039183561642e-07, "loss": 0.0245, "step": 8165 }, { "epoch": 3.6269153897401734, "grad_norm": 0.3971038919445469, "learning_rate": 2.6158475641285544e-07, "loss": 0.0297, "step": 8166 }, { "epoch": 3.6273595380857206, "grad_norm": 0.7481153045495028, "learning_rate": 2.609663067338497e-07, "loss": 0.0379, "step": 8167 }, { "epoch": 3.627803686431268, "grad_norm": 0.3705770719942717, "learning_rate": 2.6034856941211104e-07, "loss": 0.0238, "step": 8168 }, { "epoch": 3.6282478347768157, "grad_norm": 0.42989268956281285, "learning_rate": 2.597315445404941e-07, "loss": 0.0264, "step": 8169 }, { "epoch": 3.628691983122363, "grad_norm": 0.3871931353713456, "learning_rate": 2.5911523221174963e-07, "loss": 0.024, "step": 8170 }, { "epoch": 3.6291361314679103, "grad_norm": 0.5499381201238311, "learning_rate": 2.584996325185185e-07, "loss": 0.036, "step": 8171 }, { "epoch": 3.6295802798134575, "grad_norm": 0.42718459557037247, "learning_rate": 2.5788474555333675e-07, "loss": 0.0284, "step": 8172 }, { "epoch": 3.630024428159005, "grad_norm": 0.45150263461678425, "learning_rate": 2.5727057140863266e-07, "loss": 0.0424, "step": 8173 }, { "epoch": 3.6304685765045526, "grad_norm": 0.4604751454319313, "learning_rate": 2.566571101767268e-07, "loss": 0.0229, "step": 8174 }, { "epoch": 3.6309127248501, "grad_norm": 0.40368285144270716, "learning_rate": 2.5604436194983204e-07, "loss": 0.0241, "step": 8175 }, { "epoch": 3.6313568731956476, "grad_norm": 0.35454391047270123, "learning_rate": 2.554323268200559e-07, "loss": 0.0194, "step": 8176 }, { "epoch": 3.631801021541195, "grad_norm": 0.4841769515138133, "learning_rate": 2.548210048793964e-07, "loss": 0.0253, "step": 8177 }, { "epoch": 3.632245169886742, "grad_norm": 0.34838665066054253, "learning_rate": 2.5421039621974677e-07, "loss": 0.0219, "step": 8178 }, { "epoch": 3.6326893182322895, "grad_norm": 0.40000850001012683, "learning_rate": 2.5360050093289123e-07, "loss": 0.0281, "step": 8179 }, { "epoch": 3.6331334665778368, "grad_norm": 0.3910550154100924, "learning_rate": 2.529913191105088e-07, "loss": 0.0223, "step": 8180 }, { "epoch": 3.6335776149233845, "grad_norm": 0.48803942739172496, "learning_rate": 2.523828508441672e-07, "loss": 0.025, "step": 8181 }, { "epoch": 3.634021763268932, "grad_norm": 0.5754863723730677, "learning_rate": 2.5177509622533183e-07, "loss": 0.0406, "step": 8182 }, { "epoch": 3.634465911614479, "grad_norm": 0.4286654639108965, "learning_rate": 2.511680553453572e-07, "loss": 0.0216, "step": 8183 }, { "epoch": 3.634910059960027, "grad_norm": 0.35950293556792007, "learning_rate": 2.5056172829549254e-07, "loss": 0.0331, "step": 8184 }, { "epoch": 3.635354208305574, "grad_norm": 0.40214970790890675, "learning_rate": 2.4995611516688003e-07, "loss": 0.0233, "step": 8185 }, { "epoch": 3.6357983566511214, "grad_norm": 0.36548024593908557, "learning_rate": 2.4935121605055125e-07, "loss": 0.0261, "step": 8186 }, { "epoch": 3.6362425049966687, "grad_norm": 0.5386358859158857, "learning_rate": 2.487470310374346e-07, "loss": 0.0215, "step": 8187 }, { "epoch": 3.6366866533422164, "grad_norm": 0.38408368355582256, "learning_rate": 2.481435602183485e-07, "loss": 0.0227, "step": 8188 }, { "epoch": 3.6371308016877637, "grad_norm": 0.3671311848219369, "learning_rate": 2.475408036840055e-07, "loss": 0.0203, "step": 8189 }, { "epoch": 3.637574950033311, "grad_norm": 0.46572660180603004, "learning_rate": 2.469387615250096e-07, "loss": 0.0274, "step": 8190 }, { "epoch": 3.6380190983788587, "grad_norm": 0.4911123345635677, "learning_rate": 2.4633743383185917e-07, "loss": 0.0252, "step": 8191 }, { "epoch": 3.638463246724406, "grad_norm": 0.3574418971645557, "learning_rate": 2.4573682069494234e-07, "loss": 0.0205, "step": 8192 }, { "epoch": 3.6389073950699533, "grad_norm": 0.4154556222921757, "learning_rate": 2.451369222045419e-07, "loss": 0.0248, "step": 8193 }, { "epoch": 3.6393515434155006, "grad_norm": 0.3834951658389965, "learning_rate": 2.445377384508335e-07, "loss": 0.0228, "step": 8194 }, { "epoch": 3.6397956917610483, "grad_norm": 0.3958357851929866, "learning_rate": 2.4393926952388405e-07, "loss": 0.0195, "step": 8195 }, { "epoch": 3.6402398401065956, "grad_norm": 0.481708316236042, "learning_rate": 2.433415155136543e-07, "loss": 0.0245, "step": 8196 }, { "epoch": 3.640683988452143, "grad_norm": 0.45974860157380204, "learning_rate": 2.427444765099951e-07, "loss": 0.0228, "step": 8197 }, { "epoch": 3.6411281367976907, "grad_norm": 0.4731819261447121, "learning_rate": 2.4214815260265367e-07, "loss": 0.0498, "step": 8198 }, { "epoch": 3.641572285143238, "grad_norm": 0.40140095766392536, "learning_rate": 2.4155254388126605e-07, "loss": 0.0309, "step": 8199 }, { "epoch": 3.6420164334887852, "grad_norm": 0.34222283638473855, "learning_rate": 2.4095765043536335e-07, "loss": 0.0196, "step": 8200 }, { "epoch": 3.6424605818343325, "grad_norm": 0.4008139911989502, "learning_rate": 2.403634723543674e-07, "loss": 0.0163, "step": 8201 }, { "epoch": 3.64290473017988, "grad_norm": 0.4485823633640726, "learning_rate": 2.3977000972759454e-07, "loss": 0.0247, "step": 8202 }, { "epoch": 3.6433488785254275, "grad_norm": 0.32021601023059565, "learning_rate": 2.391772626442507e-07, "loss": 0.0162, "step": 8203 }, { "epoch": 3.643793026870975, "grad_norm": 0.43897160516812744, "learning_rate": 2.385852311934367e-07, "loss": 0.0192, "step": 8204 }, { "epoch": 3.6442371752165226, "grad_norm": 0.35383635594441887, "learning_rate": 2.379939154641442e-07, "loss": 0.0224, "step": 8205 }, { "epoch": 3.64468132356207, "grad_norm": 0.4656360521009951, "learning_rate": 2.3740331554525875e-07, "loss": 0.0281, "step": 8206 }, { "epoch": 3.645125471907617, "grad_norm": 0.3752654330901412, "learning_rate": 2.3681343152555768e-07, "loss": 0.021, "step": 8207 }, { "epoch": 3.6455696202531644, "grad_norm": 0.43380693965119627, "learning_rate": 2.3622426349371064e-07, "loss": 0.0233, "step": 8208 }, { "epoch": 3.6460137685987117, "grad_norm": 0.3468026081865424, "learning_rate": 2.3563581153827897e-07, "loss": 0.0208, "step": 8209 }, { "epoch": 3.6464579169442595, "grad_norm": 0.3963696177580075, "learning_rate": 2.3504807574771638e-07, "loss": 0.0234, "step": 8210 }, { "epoch": 3.6469020652898068, "grad_norm": 0.3493278085805506, "learning_rate": 2.3446105621037108e-07, "loss": 0.018, "step": 8211 }, { "epoch": 3.647346213635354, "grad_norm": 0.448389498368257, "learning_rate": 2.3387475301448138e-07, "loss": 0.0262, "step": 8212 }, { "epoch": 3.647790361980902, "grad_norm": 0.4639744403941518, "learning_rate": 2.33289166248179e-07, "loss": 0.0299, "step": 8213 }, { "epoch": 3.648234510326449, "grad_norm": 0.34846046299317557, "learning_rate": 2.327042959994863e-07, "loss": 0.0197, "step": 8214 }, { "epoch": 3.6486786586719964, "grad_norm": 0.35001828631869614, "learning_rate": 2.3212014235632074e-07, "loss": 0.0184, "step": 8215 }, { "epoch": 3.6491228070175437, "grad_norm": 0.3539191157147139, "learning_rate": 2.3153670540648932e-07, "loss": 0.0202, "step": 8216 }, { "epoch": 3.6495669553630914, "grad_norm": 0.39166688802077126, "learning_rate": 2.3095398523769353e-07, "loss": 0.0237, "step": 8217 }, { "epoch": 3.6500111037086387, "grad_norm": 0.41163896619179474, "learning_rate": 2.3037198193752553e-07, "loss": 0.0254, "step": 8218 }, { "epoch": 3.650455252054186, "grad_norm": 0.43685004953046314, "learning_rate": 2.2979069559347088e-07, "loss": 0.0237, "step": 8219 }, { "epoch": 3.6508994003997337, "grad_norm": 0.5309766077496497, "learning_rate": 2.292101262929064e-07, "loss": 0.0292, "step": 8220 }, { "epoch": 3.651343548745281, "grad_norm": 0.41452507271756617, "learning_rate": 2.2863027412310056e-07, "loss": 0.0292, "step": 8221 }, { "epoch": 3.6517876970908283, "grad_norm": 0.44136007566907587, "learning_rate": 2.2805113917121647e-07, "loss": 0.0284, "step": 8222 }, { "epoch": 3.6522318454363756, "grad_norm": 0.3476979126592021, "learning_rate": 2.274727215243072e-07, "loss": 0.0197, "step": 8223 }, { "epoch": 3.6526759937819233, "grad_norm": 0.38817929599514056, "learning_rate": 2.2689502126931938e-07, "loss": 0.0224, "step": 8224 }, { "epoch": 3.6531201421274706, "grad_norm": 0.4746782142221425, "learning_rate": 2.2631803849309076e-07, "loss": 0.0352, "step": 8225 }, { "epoch": 3.653564290473018, "grad_norm": 0.40968095773887653, "learning_rate": 2.2574177328235137e-07, "loss": 0.0276, "step": 8226 }, { "epoch": 3.6540084388185656, "grad_norm": 0.41401682518721233, "learning_rate": 2.2516622572372416e-07, "loss": 0.0283, "step": 8227 }, { "epoch": 3.654452587164113, "grad_norm": 0.4117127298389683, "learning_rate": 2.2459139590372325e-07, "loss": 0.0228, "step": 8228 }, { "epoch": 3.65489673550966, "grad_norm": 0.4311069896164078, "learning_rate": 2.240172839087551e-07, "loss": 0.0281, "step": 8229 }, { "epoch": 3.6553408838552075, "grad_norm": 0.4327836235259352, "learning_rate": 2.2344388982512012e-07, "loss": 0.0228, "step": 8230 }, { "epoch": 3.655785032200755, "grad_norm": 0.4573606402349546, "learning_rate": 2.2287121373900712e-07, "loss": 0.0319, "step": 8231 }, { "epoch": 3.6562291805463025, "grad_norm": 0.4345901205357684, "learning_rate": 2.2229925573650001e-07, "loss": 0.0269, "step": 8232 }, { "epoch": 3.65667332889185, "grad_norm": 0.3825914316334593, "learning_rate": 2.2172801590357395e-07, "loss": 0.0282, "step": 8233 }, { "epoch": 3.6571174772373976, "grad_norm": 0.31366037192320456, "learning_rate": 2.2115749432609524e-07, "loss": 0.0167, "step": 8234 }, { "epoch": 3.657561625582945, "grad_norm": 0.3788275375636668, "learning_rate": 2.205876910898236e-07, "loss": 0.0306, "step": 8235 }, { "epoch": 3.658005773928492, "grad_norm": 0.38419523963619157, "learning_rate": 2.2001860628041106e-07, "loss": 0.0299, "step": 8236 }, { "epoch": 3.6584499222740394, "grad_norm": 0.4550088476883181, "learning_rate": 2.1945023998339865e-07, "loss": 0.0347, "step": 8237 }, { "epoch": 3.6588940706195867, "grad_norm": 0.597248639500098, "learning_rate": 2.1888259228422248e-07, "loss": 0.0194, "step": 8238 }, { "epoch": 3.6593382189651344, "grad_norm": 0.5188207400851287, "learning_rate": 2.1831566326820986e-07, "loss": 0.0276, "step": 8239 }, { "epoch": 3.6597823673106817, "grad_norm": 0.32831002856910196, "learning_rate": 2.177494530205798e-07, "loss": 0.0245, "step": 8240 }, { "epoch": 3.660226515656229, "grad_norm": 0.3836254147289484, "learning_rate": 2.1718396162644319e-07, "loss": 0.0212, "step": 8241 }, { "epoch": 3.6606706640017768, "grad_norm": 0.4548723138202921, "learning_rate": 2.1661918917080304e-07, "loss": 0.0253, "step": 8242 }, { "epoch": 3.661114812347324, "grad_norm": 0.3684605663304441, "learning_rate": 2.1605513573855375e-07, "loss": 0.0178, "step": 8243 }, { "epoch": 3.6615589606928713, "grad_norm": 0.3733701839690256, "learning_rate": 2.1549180141448356e-07, "loss": 0.0194, "step": 8244 }, { "epoch": 3.6620031090384186, "grad_norm": 0.4105795750947663, "learning_rate": 2.1492918628326864e-07, "loss": 0.0148, "step": 8245 }, { "epoch": 3.6624472573839664, "grad_norm": 0.41761384153375, "learning_rate": 2.143672904294819e-07, "loss": 0.0293, "step": 8246 }, { "epoch": 3.6628914057295137, "grad_norm": 0.3728653241577783, "learning_rate": 2.1380611393758576e-07, "loss": 0.0223, "step": 8247 }, { "epoch": 3.663335554075061, "grad_norm": 0.3955291650313776, "learning_rate": 2.1324565689193332e-07, "loss": 0.025, "step": 8248 }, { "epoch": 3.6637797024206087, "grad_norm": 0.5976258676447095, "learning_rate": 2.1268591937677164e-07, "loss": 0.0328, "step": 8249 }, { "epoch": 3.664223850766156, "grad_norm": 0.7950146359972061, "learning_rate": 2.1212690147623894e-07, "loss": 0.0295, "step": 8250 }, { "epoch": 3.6646679991117033, "grad_norm": 0.5530704513836777, "learning_rate": 2.1156860327436302e-07, "loss": 0.0302, "step": 8251 }, { "epoch": 3.6651121474572506, "grad_norm": 0.29807912775738393, "learning_rate": 2.1101102485506842e-07, "loss": 0.0178, "step": 8252 }, { "epoch": 3.6655562958027983, "grad_norm": 0.4517710106710957, "learning_rate": 2.1045416630216808e-07, "loss": 0.0242, "step": 8253 }, { "epoch": 3.6660004441483456, "grad_norm": 0.4254157446074955, "learning_rate": 2.0989802769936563e-07, "loss": 0.0268, "step": 8254 }, { "epoch": 3.666444592493893, "grad_norm": 0.3693301648290188, "learning_rate": 2.0934260913025973e-07, "loss": 0.0266, "step": 8255 }, { "epoch": 3.6668887408394406, "grad_norm": 0.4631155691374193, "learning_rate": 2.0878791067833805e-07, "loss": 0.0293, "step": 8256 }, { "epoch": 3.667332889184988, "grad_norm": 0.3868441750288522, "learning_rate": 2.0823393242698275e-07, "loss": 0.0261, "step": 8257 }, { "epoch": 3.667777037530535, "grad_norm": 0.430361544922243, "learning_rate": 2.0768067445946506e-07, "loss": 0.0247, "step": 8258 }, { "epoch": 3.6682211858760825, "grad_norm": 0.4116233802052011, "learning_rate": 2.0712813685894894e-07, "loss": 0.0315, "step": 8259 }, { "epoch": 3.6686653342216298, "grad_norm": 0.3788192226900461, "learning_rate": 2.0657631970849078e-07, "loss": 0.0229, "step": 8260 }, { "epoch": 3.6691094825671775, "grad_norm": 0.37557385002997257, "learning_rate": 2.0602522309103813e-07, "loss": 0.0236, "step": 8261 }, { "epoch": 3.669553630912725, "grad_norm": 0.3805674438167674, "learning_rate": 2.054748470894291e-07, "loss": 0.0252, "step": 8262 }, { "epoch": 3.6699977792582725, "grad_norm": 0.3690281782315337, "learning_rate": 2.0492519178639536e-07, "loss": 0.0161, "step": 8263 }, { "epoch": 3.67044192760382, "grad_norm": 0.7116450161368049, "learning_rate": 2.0437625726456024e-07, "loss": 0.0292, "step": 8264 }, { "epoch": 3.670886075949367, "grad_norm": 0.5939366820148858, "learning_rate": 2.0382804360643603e-07, "loss": 0.0382, "step": 8265 }, { "epoch": 3.6713302242949144, "grad_norm": 0.4269858268062977, "learning_rate": 2.0328055089443023e-07, "loss": 0.0236, "step": 8266 }, { "epoch": 3.6717743726404617, "grad_norm": 0.43163831702859096, "learning_rate": 2.027337792108397e-07, "loss": 0.0244, "step": 8267 }, { "epoch": 3.6722185209860094, "grad_norm": 0.5025109334901467, "learning_rate": 2.0218772863785263e-07, "loss": 0.0296, "step": 8268 }, { "epoch": 3.6726626693315567, "grad_norm": 0.6343391256263229, "learning_rate": 2.016423992575517e-07, "loss": 0.0248, "step": 8269 }, { "epoch": 3.673106817677104, "grad_norm": 0.4000275239829487, "learning_rate": 2.0109779115190742e-07, "loss": 0.0198, "step": 8270 }, { "epoch": 3.6735509660226517, "grad_norm": 0.3303791647606292, "learning_rate": 2.0055390440278376e-07, "loss": 0.0258, "step": 8271 }, { "epoch": 3.673995114368199, "grad_norm": 0.36510639486980745, "learning_rate": 2.0001073909193702e-07, "loss": 0.0173, "step": 8272 }, { "epoch": 3.6744392627137463, "grad_norm": 0.4445660217933993, "learning_rate": 1.9946829530101408e-07, "loss": 0.0279, "step": 8273 }, { "epoch": 3.6748834110592936, "grad_norm": 0.3323295635321309, "learning_rate": 1.989265731115525e-07, "loss": 0.0211, "step": 8274 }, { "epoch": 3.6753275594048413, "grad_norm": 0.4082377088884518, "learning_rate": 1.983855726049838e-07, "loss": 0.0281, "step": 8275 }, { "epoch": 3.6757717077503886, "grad_norm": 0.5538219755733254, "learning_rate": 1.9784529386262798e-07, "loss": 0.0375, "step": 8276 }, { "epoch": 3.676215856095936, "grad_norm": 0.4054313962604177, "learning_rate": 1.9730573696569888e-07, "loss": 0.0221, "step": 8277 }, { "epoch": 3.6766600044414837, "grad_norm": 0.338502488617252, "learning_rate": 1.9676690199530169e-07, "loss": 0.0231, "step": 8278 }, { "epoch": 3.677104152787031, "grad_norm": 0.4016411596672554, "learning_rate": 1.9622878903243104e-07, "loss": 0.024, "step": 8279 }, { "epoch": 3.6775483011325782, "grad_norm": 0.4045075471165877, "learning_rate": 1.95691398157975e-07, "loss": 0.0179, "step": 8280 }, { "epoch": 3.6779924494781255, "grad_norm": 0.40898342731485937, "learning_rate": 1.9515472945271396e-07, "loss": 0.0337, "step": 8281 }, { "epoch": 3.6784365978236733, "grad_norm": 0.3742680249776215, "learning_rate": 1.946187829973162e-07, "loss": 0.0181, "step": 8282 }, { "epoch": 3.6788807461692206, "grad_norm": 0.38742125913942227, "learning_rate": 1.9408355887234443e-07, "loss": 0.0267, "step": 8283 }, { "epoch": 3.679324894514768, "grad_norm": 0.31803662853945275, "learning_rate": 1.9354905715825323e-07, "loss": 0.0106, "step": 8284 }, { "epoch": 3.6797690428603156, "grad_norm": 0.3842318920408417, "learning_rate": 1.9301527793538445e-07, "loss": 0.0208, "step": 8285 }, { "epoch": 3.680213191205863, "grad_norm": 0.34972059951342765, "learning_rate": 1.9248222128397663e-07, "loss": 0.0209, "step": 8286 }, { "epoch": 3.68065733955141, "grad_norm": 0.4945754865360583, "learning_rate": 1.9194988728415632e-07, "loss": 0.0285, "step": 8287 }, { "epoch": 3.6811014878969575, "grad_norm": 0.4665722830251576, "learning_rate": 1.9141827601594221e-07, "loss": 0.0228, "step": 8288 }, { "epoch": 3.6815456362425047, "grad_norm": 0.33231118424653433, "learning_rate": 1.908873875592454e-07, "loss": 0.0215, "step": 8289 }, { "epoch": 3.6819897845880525, "grad_norm": 0.41202362800291603, "learning_rate": 1.9035722199386542e-07, "loss": 0.0219, "step": 8290 }, { "epoch": 3.6824339329335998, "grad_norm": 0.4209126047122881, "learning_rate": 1.8982777939949736e-07, "loss": 0.0356, "step": 8291 }, { "epoch": 3.6828780812791475, "grad_norm": 0.35749851244891667, "learning_rate": 1.8929905985572484e-07, "loss": 0.0194, "step": 8292 }, { "epoch": 3.683322229624695, "grad_norm": 0.38704831620698, "learning_rate": 1.8877106344202312e-07, "loss": 0.0329, "step": 8293 }, { "epoch": 3.683766377970242, "grad_norm": 0.526720346124437, "learning_rate": 1.8824379023775874e-07, "loss": 0.0357, "step": 8294 }, { "epoch": 3.6842105263157894, "grad_norm": 0.42113199922195677, "learning_rate": 1.877172403221905e-07, "loss": 0.0315, "step": 8295 }, { "epoch": 3.6846546746613367, "grad_norm": 0.3078574734152821, "learning_rate": 1.871914137744668e-07, "loss": 0.0165, "step": 8296 }, { "epoch": 3.6850988230068844, "grad_norm": 0.4123152188226272, "learning_rate": 1.866663106736294e-07, "loss": 0.0401, "step": 8297 }, { "epoch": 3.6855429713524317, "grad_norm": 0.3318217390451302, "learning_rate": 1.8614193109860955e-07, "loss": 0.0228, "step": 8298 }, { "epoch": 3.685987119697979, "grad_norm": 0.4377759250920508, "learning_rate": 1.8561827512823095e-07, "loss": 0.0271, "step": 8299 }, { "epoch": 3.6864312680435267, "grad_norm": 0.8207219751826093, "learning_rate": 1.8509534284120721e-07, "loss": 0.0344, "step": 8300 }, { "epoch": 3.686875416389074, "grad_norm": 0.8375758066775824, "learning_rate": 1.84573134316145e-07, "loss": 0.0367, "step": 8301 }, { "epoch": 3.6873195647346213, "grad_norm": 0.36428841642403553, "learning_rate": 1.840516496315392e-07, "loss": 0.0217, "step": 8302 }, { "epoch": 3.6877637130801686, "grad_norm": 0.5076188683241221, "learning_rate": 1.8353088886578053e-07, "loss": 0.0294, "step": 8303 }, { "epoch": 3.6882078614257163, "grad_norm": 0.6798347292105588, "learning_rate": 1.830108520971463e-07, "loss": 0.0405, "step": 8304 }, { "epoch": 3.6886520097712636, "grad_norm": 0.3568052667311966, "learning_rate": 1.8249153940380738e-07, "loss": 0.0279, "step": 8305 }, { "epoch": 3.689096158116811, "grad_norm": 0.4023381862191607, "learning_rate": 1.8197295086382515e-07, "loss": 0.0271, "step": 8306 }, { "epoch": 3.6895403064623586, "grad_norm": 0.3720480087573231, "learning_rate": 1.8145508655515177e-07, "loss": 0.0199, "step": 8307 }, { "epoch": 3.689984454807906, "grad_norm": 0.4144873947838932, "learning_rate": 1.8093794655563214e-07, "loss": 0.0215, "step": 8308 }, { "epoch": 3.6904286031534532, "grad_norm": 0.39511282629465616, "learning_rate": 1.804215309430013e-07, "loss": 0.0175, "step": 8309 }, { "epoch": 3.6908727514990005, "grad_norm": 0.5527099360678772, "learning_rate": 1.799058397948844e-07, "loss": 0.0381, "step": 8310 }, { "epoch": 3.691316899844548, "grad_norm": 0.46007077628363413, "learning_rate": 1.7939087318879833e-07, "loss": 0.0333, "step": 8311 }, { "epoch": 3.6917610481900955, "grad_norm": 0.43813513461840425, "learning_rate": 1.788766312021528e-07, "loss": 0.0298, "step": 8312 }, { "epoch": 3.692205196535643, "grad_norm": 0.4101350408780889, "learning_rate": 1.7836311391224494e-07, "loss": 0.0259, "step": 8313 }, { "epoch": 3.6926493448811906, "grad_norm": 0.2992536206219798, "learning_rate": 1.7785032139626734e-07, "loss": 0.0185, "step": 8314 }, { "epoch": 3.693093493226738, "grad_norm": 0.365121204268502, "learning_rate": 1.7733825373129954e-07, "loss": 0.0216, "step": 8315 }, { "epoch": 3.693537641572285, "grad_norm": 0.416511886265595, "learning_rate": 1.7682691099431548e-07, "loss": 0.0372, "step": 8316 }, { "epoch": 3.6939817899178324, "grad_norm": 0.3845252122577115, "learning_rate": 1.763162932621787e-07, "loss": 0.0257, "step": 8317 }, { "epoch": 3.6944259382633797, "grad_norm": 0.4233868381729098, "learning_rate": 1.7580640061164223e-07, "loss": 0.0294, "step": 8318 }, { "epoch": 3.6948700866089275, "grad_norm": 0.462463687883106, "learning_rate": 1.7529723311935198e-07, "loss": 0.0364, "step": 8319 }, { "epoch": 3.6953142349544748, "grad_norm": 0.43549161369245626, "learning_rate": 1.7478879086184564e-07, "loss": 0.0263, "step": 8320 }, { "epoch": 3.695758383300022, "grad_norm": 0.39333187210295867, "learning_rate": 1.742810739155504e-07, "loss": 0.0246, "step": 8321 }, { "epoch": 3.6962025316455698, "grad_norm": 0.35014124457318635, "learning_rate": 1.737740823567835e-07, "loss": 0.0218, "step": 8322 }, { "epoch": 3.696646679991117, "grad_norm": 0.42978622961517393, "learning_rate": 1.7326781626175627e-07, "loss": 0.0257, "step": 8323 }, { "epoch": 3.6970908283366644, "grad_norm": 0.4118470726335261, "learning_rate": 1.727622757065678e-07, "loss": 0.0261, "step": 8324 }, { "epoch": 3.6975349766822116, "grad_norm": 0.471980787223015, "learning_rate": 1.7225746076720894e-07, "loss": 0.0226, "step": 8325 }, { "epoch": 3.6979791250277594, "grad_norm": 0.4565879663220365, "learning_rate": 1.717533715195635e-07, "loss": 0.0248, "step": 8326 }, { "epoch": 3.6984232733733067, "grad_norm": 0.3976292965837061, "learning_rate": 1.712500080394036e-07, "loss": 0.0324, "step": 8327 }, { "epoch": 3.698867421718854, "grad_norm": 0.4111827692476157, "learning_rate": 1.7074737040239375e-07, "loss": 0.0248, "step": 8328 }, { "epoch": 3.6993115700644017, "grad_norm": 0.36585319212766676, "learning_rate": 1.7024545868408903e-07, "loss": 0.0209, "step": 8329 }, { "epoch": 3.699755718409949, "grad_norm": 0.47005920699211934, "learning_rate": 1.6974427295993412e-07, "loss": 0.0203, "step": 8330 }, { "epoch": 3.7001998667554963, "grad_norm": 0.36932538349452104, "learning_rate": 1.6924381330526817e-07, "loss": 0.0212, "step": 8331 }, { "epoch": 3.7006440151010436, "grad_norm": 0.446723580160887, "learning_rate": 1.6874407979531604e-07, "loss": 0.0247, "step": 8332 }, { "epoch": 3.7010881634465913, "grad_norm": 0.445687716290222, "learning_rate": 1.682450725051976e-07, "loss": 0.0251, "step": 8333 }, { "epoch": 3.7015323117921386, "grad_norm": 0.3855963805554051, "learning_rate": 1.677467915099229e-07, "loss": 0.0161, "step": 8334 }, { "epoch": 3.701976460137686, "grad_norm": 0.39452015734075696, "learning_rate": 1.6724923688439033e-07, "loss": 0.0238, "step": 8335 }, { "epoch": 3.7024206084832336, "grad_norm": 0.45003118141173504, "learning_rate": 1.667524087033906e-07, "loss": 0.0264, "step": 8336 }, { "epoch": 3.702864756828781, "grad_norm": 0.42598757489225364, "learning_rate": 1.6625630704160788e-07, "loss": 0.0336, "step": 8337 }, { "epoch": 3.703308905174328, "grad_norm": 0.38079855634441273, "learning_rate": 1.6576093197361253e-07, "loss": 0.0286, "step": 8338 }, { "epoch": 3.7037530535198755, "grad_norm": 0.3983365488271041, "learning_rate": 1.652662835738683e-07, "loss": 0.0205, "step": 8339 }, { "epoch": 3.704197201865423, "grad_norm": 0.29963760935621175, "learning_rate": 1.6477236191673018e-07, "loss": 0.0189, "step": 8340 }, { "epoch": 3.7046413502109705, "grad_norm": 0.771557737936414, "learning_rate": 1.6427916707644153e-07, "loss": 0.0319, "step": 8341 }, { "epoch": 3.705085498556518, "grad_norm": 0.4894556058318927, "learning_rate": 1.6378669912713862e-07, "loss": 0.0316, "step": 8342 }, { "epoch": 3.7055296469020655, "grad_norm": 0.3795606721666376, "learning_rate": 1.6329495814284778e-07, "loss": 0.0196, "step": 8343 }, { "epoch": 3.705973795247613, "grad_norm": 0.3778626203357572, "learning_rate": 1.62803944197486e-07, "loss": 0.0266, "step": 8344 }, { "epoch": 3.70641794359316, "grad_norm": 0.4426801167498142, "learning_rate": 1.6231365736486093e-07, "loss": 0.0288, "step": 8345 }, { "epoch": 3.7068620919387074, "grad_norm": 0.35264417606088994, "learning_rate": 1.6182409771867137e-07, "loss": 0.0189, "step": 8346 }, { "epoch": 3.7073062402842547, "grad_norm": 0.38892860903541526, "learning_rate": 1.6133526533250566e-07, "loss": 0.0318, "step": 8347 }, { "epoch": 3.7077503886298024, "grad_norm": 0.3867272658310112, "learning_rate": 1.6084716027984503e-07, "loss": 0.0246, "step": 8348 }, { "epoch": 3.7081945369753497, "grad_norm": 0.6221413345427776, "learning_rate": 1.6035978263405804e-07, "loss": 0.0333, "step": 8349 }, { "epoch": 3.708638685320897, "grad_norm": 0.5154361170774239, "learning_rate": 1.5987313246840718e-07, "loss": 0.0342, "step": 8350 }, { "epoch": 3.7090828336664448, "grad_norm": 0.3524314183478963, "learning_rate": 1.593872098560445e-07, "loss": 0.0211, "step": 8351 }, { "epoch": 3.709526982011992, "grad_norm": 0.38476894824992575, "learning_rate": 1.58902014870011e-07, "loss": 0.0211, "step": 8352 }, { "epoch": 3.7099711303575393, "grad_norm": 0.4245656298307411, "learning_rate": 1.5841754758324058e-07, "loss": 0.0209, "step": 8353 }, { "epoch": 3.7104152787030866, "grad_norm": 0.3820971674731869, "learning_rate": 1.579338080685572e-07, "loss": 0.0221, "step": 8354 }, { "epoch": 3.7108594270486344, "grad_norm": 0.37340680941918303, "learning_rate": 1.5745079639867488e-07, "loss": 0.0297, "step": 8355 }, { "epoch": 3.7113035753941817, "grad_norm": 0.29722253097229934, "learning_rate": 1.5696851264619785e-07, "loss": 0.0178, "step": 8356 }, { "epoch": 3.711747723739729, "grad_norm": 0.4252952177272931, "learning_rate": 1.5648695688362304e-07, "loss": 0.0257, "step": 8357 }, { "epoch": 3.7121918720852767, "grad_norm": 0.49778617697788585, "learning_rate": 1.560061291833348e-07, "loss": 0.0377, "step": 8358 }, { "epoch": 3.712636020430824, "grad_norm": 0.45908962801821235, "learning_rate": 1.5552602961761033e-07, "loss": 0.0236, "step": 8359 }, { "epoch": 3.7130801687763713, "grad_norm": 0.35236790788170025, "learning_rate": 1.5504665825861687e-07, "loss": 0.018, "step": 8360 }, { "epoch": 3.7135243171219185, "grad_norm": 0.516154033831261, "learning_rate": 1.5456801517841236e-07, "loss": 0.0338, "step": 8361 }, { "epoch": 3.7139684654674663, "grad_norm": 0.2906873978074194, "learning_rate": 1.540901004489448e-07, "loss": 0.0257, "step": 8362 }, { "epoch": 3.7144126138130136, "grad_norm": 0.363688530548789, "learning_rate": 1.5361291414205226e-07, "loss": 0.0175, "step": 8363 }, { "epoch": 3.714856762158561, "grad_norm": 0.4720314053715725, "learning_rate": 1.5313645632946407e-07, "loss": 0.0406, "step": 8364 }, { "epoch": 3.7153009105041086, "grad_norm": 0.36832914963768065, "learning_rate": 1.5266072708280177e-07, "loss": 0.0232, "step": 8365 }, { "epoch": 3.715745058849656, "grad_norm": 0.40219479844398387, "learning_rate": 1.5218572647357265e-07, "loss": 0.0325, "step": 8366 }, { "epoch": 3.716189207195203, "grad_norm": 0.3686119581029194, "learning_rate": 1.517114545731796e-07, "loss": 0.019, "step": 8367 }, { "epoch": 3.7166333555407505, "grad_norm": 0.4442003891466178, "learning_rate": 1.5123791145291332e-07, "loss": 0.0275, "step": 8368 }, { "epoch": 3.7170775038862978, "grad_norm": 0.36490090535494746, "learning_rate": 1.5076509718395416e-07, "loss": 0.0155, "step": 8369 }, { "epoch": 3.7175216522318455, "grad_norm": 0.4009221914723727, "learning_rate": 1.502930118373752e-07, "loss": 0.0223, "step": 8370 }, { "epoch": 3.717965800577393, "grad_norm": 0.43768126067136603, "learning_rate": 1.4982165548413862e-07, "loss": 0.0192, "step": 8371 }, { "epoch": 3.7184099489229405, "grad_norm": 0.5261889966594541, "learning_rate": 1.4935102819509717e-07, "loss": 0.0353, "step": 8372 }, { "epoch": 3.718854097268488, "grad_norm": 0.34594837099051023, "learning_rate": 1.488811300409948e-07, "loss": 0.0188, "step": 8373 }, { "epoch": 3.719298245614035, "grad_norm": 0.5250244707730272, "learning_rate": 1.4841196109246448e-07, "loss": 0.0355, "step": 8374 }, { "epoch": 3.7197423939595824, "grad_norm": 0.3336300196833185, "learning_rate": 1.4794352142003088e-07, "loss": 0.0199, "step": 8375 }, { "epoch": 3.7201865423051297, "grad_norm": 0.37689522491294797, "learning_rate": 1.4747581109410713e-07, "loss": 0.0174, "step": 8376 }, { "epoch": 3.7206306906506774, "grad_norm": 0.48406735678739193, "learning_rate": 1.4700883018499979e-07, "loss": 0.025, "step": 8377 }, { "epoch": 3.7210748389962247, "grad_norm": 0.45821951845296044, "learning_rate": 1.4654257876290267e-07, "loss": 0.0392, "step": 8378 }, { "epoch": 3.721518987341772, "grad_norm": 0.43841923148082784, "learning_rate": 1.4607705689790197e-07, "loss": 0.0213, "step": 8379 }, { "epoch": 3.7219631356873197, "grad_norm": 0.38180624149856224, "learning_rate": 1.4561226465997337e-07, "loss": 0.0244, "step": 8380 }, { "epoch": 3.722407284032867, "grad_norm": 0.38238991365264746, "learning_rate": 1.4514820211898263e-07, "loss": 0.0235, "step": 8381 }, { "epoch": 3.7228514323784143, "grad_norm": 0.4147995047117634, "learning_rate": 1.4468486934468728e-07, "loss": 0.0244, "step": 8382 }, { "epoch": 3.7232955807239616, "grad_norm": 0.37825138010304116, "learning_rate": 1.442222664067333e-07, "loss": 0.0204, "step": 8383 }, { "epoch": 3.7237397290695093, "grad_norm": 0.43713370136688856, "learning_rate": 1.437603933746573e-07, "loss": 0.0283, "step": 8384 }, { "epoch": 3.7241838774150566, "grad_norm": 0.3959829514177559, "learning_rate": 1.4329925031788815e-07, "loss": 0.024, "step": 8385 }, { "epoch": 3.724628025760604, "grad_norm": 0.36795641365580717, "learning_rate": 1.4283883730574212e-07, "loss": 0.0242, "step": 8386 }, { "epoch": 3.7250721741061517, "grad_norm": 0.33923664126749403, "learning_rate": 1.4237915440742768e-07, "loss": 0.0259, "step": 8387 }, { "epoch": 3.725516322451699, "grad_norm": 0.43212191542717354, "learning_rate": 1.4192020169204292e-07, "loss": 0.0205, "step": 8388 }, { "epoch": 3.7259604707972462, "grad_norm": 0.5186925399242536, "learning_rate": 1.4146197922857597e-07, "loss": 0.032, "step": 8389 }, { "epoch": 3.7264046191427935, "grad_norm": 0.3400385668731632, "learning_rate": 1.410044870859062e-07, "loss": 0.0224, "step": 8390 }, { "epoch": 3.7268487674883413, "grad_norm": 0.4227136704820241, "learning_rate": 1.4054772533280137e-07, "loss": 0.0217, "step": 8391 }, { "epoch": 3.7272929158338886, "grad_norm": 0.4011138415574072, "learning_rate": 1.4009169403792154e-07, "loss": 0.025, "step": 8392 }, { "epoch": 3.727737064179436, "grad_norm": 0.3003293847098222, "learning_rate": 1.396363932698147e-07, "loss": 0.0174, "step": 8393 }, { "epoch": 3.7281812125249836, "grad_norm": 0.4465399668144551, "learning_rate": 1.3918182309692164e-07, "loss": 0.0361, "step": 8394 }, { "epoch": 3.728625360870531, "grad_norm": 0.3571377988241023, "learning_rate": 1.3872798358757155e-07, "loss": 0.0186, "step": 8395 }, { "epoch": 3.729069509216078, "grad_norm": 0.3618570693279145, "learning_rate": 1.3827487480998437e-07, "loss": 0.0204, "step": 8396 }, { "epoch": 3.7295136575616255, "grad_norm": 0.4415246137854366, "learning_rate": 1.3782249683226946e-07, "loss": 0.0215, "step": 8397 }, { "epoch": 3.7299578059071727, "grad_norm": 0.3988226147588891, "learning_rate": 1.373708497224263e-07, "loss": 0.025, "step": 8398 }, { "epoch": 3.7304019542527205, "grad_norm": 0.3588668524798711, "learning_rate": 1.3691993354834733e-07, "loss": 0.017, "step": 8399 }, { "epoch": 3.7308461025982678, "grad_norm": 0.4766338898375096, "learning_rate": 1.3646974837781102e-07, "loss": 0.0246, "step": 8400 }, { "epoch": 3.7312902509438155, "grad_norm": 0.4346113992522893, "learning_rate": 1.3602029427848885e-07, "loss": 0.0224, "step": 8401 }, { "epoch": 3.731734399289363, "grad_norm": 0.4308704837627075, "learning_rate": 1.355715713179412e-07, "loss": 0.0228, "step": 8402 }, { "epoch": 3.73217854763491, "grad_norm": 0.3138469968218578, "learning_rate": 1.35123579563618e-07, "loss": 0.0211, "step": 8403 }, { "epoch": 3.7326226959804574, "grad_norm": 0.3603696166551371, "learning_rate": 1.346763190828604e-07, "loss": 0.0202, "step": 8404 }, { "epoch": 3.7330668443260047, "grad_norm": 0.472833085205998, "learning_rate": 1.3422978994290014e-07, "loss": 0.032, "step": 8405 }, { "epoch": 3.7335109926715524, "grad_norm": 0.5373078797879084, "learning_rate": 1.3378399221085691e-07, "loss": 0.0331, "step": 8406 }, { "epoch": 3.7339551410170997, "grad_norm": 0.32376977842916665, "learning_rate": 1.3333892595374265e-07, "loss": 0.0193, "step": 8407 }, { "epoch": 3.734399289362647, "grad_norm": 0.5384681977899171, "learning_rate": 1.3289459123845772e-07, "loss": 0.0439, "step": 8408 }, { "epoch": 3.7348434377081947, "grad_norm": 0.360367652411676, "learning_rate": 1.3245098813179315e-07, "loss": 0.0259, "step": 8409 }, { "epoch": 3.735287586053742, "grad_norm": 0.34043662992938706, "learning_rate": 1.3200811670043057e-07, "loss": 0.0156, "step": 8410 }, { "epoch": 3.7357317343992893, "grad_norm": 0.46845973568867966, "learning_rate": 1.3156597701094065e-07, "loss": 0.0223, "step": 8411 }, { "epoch": 3.7361758827448366, "grad_norm": 0.4170724326037999, "learning_rate": 1.3112456912978467e-07, "loss": 0.0183, "step": 8412 }, { "epoch": 3.7366200310903843, "grad_norm": 0.6062425923257145, "learning_rate": 1.3068389312331398e-07, "loss": 0.0326, "step": 8413 }, { "epoch": 3.7370641794359316, "grad_norm": 0.32063888344376806, "learning_rate": 1.3024394905776893e-07, "loss": 0.0155, "step": 8414 }, { "epoch": 3.737508327781479, "grad_norm": 0.4839068579509274, "learning_rate": 1.298047369992811e-07, "loss": 0.0285, "step": 8415 }, { "epoch": 3.7379524761270266, "grad_norm": 0.4686148975397421, "learning_rate": 1.2936625701387152e-07, "loss": 0.0291, "step": 8416 }, { "epoch": 3.738396624472574, "grad_norm": 0.40247010635972097, "learning_rate": 1.289285091674508e-07, "loss": 0.0248, "step": 8417 }, { "epoch": 3.738840772818121, "grad_norm": 0.3788928148883821, "learning_rate": 1.2849149352582135e-07, "loss": 0.0277, "step": 8418 }, { "epoch": 3.7392849211636685, "grad_norm": 0.45807438580819443, "learning_rate": 1.280552101546717e-07, "loss": 0.0311, "step": 8419 }, { "epoch": 3.7397290695092162, "grad_norm": 0.513273904365022, "learning_rate": 1.2761965911958385e-07, "loss": 0.0245, "step": 8420 }, { "epoch": 3.7401732178547635, "grad_norm": 0.39246829057407184, "learning_rate": 1.2718484048602876e-07, "loss": 0.0231, "step": 8421 }, { "epoch": 3.740617366200311, "grad_norm": 0.41649329925637596, "learning_rate": 1.267507543193669e-07, "loss": 0.0199, "step": 8422 }, { "epoch": 3.7410615145458586, "grad_norm": 0.42862616530797626, "learning_rate": 1.2631740068484888e-07, "loss": 0.0307, "step": 8423 }, { "epoch": 3.741505662891406, "grad_norm": 0.35644680831072795, "learning_rate": 1.258847796476148e-07, "loss": 0.0241, "step": 8424 }, { "epoch": 3.741949811236953, "grad_norm": 0.32721758016106417, "learning_rate": 1.2545289127269488e-07, "loss": 0.0235, "step": 8425 }, { "epoch": 3.7423939595825004, "grad_norm": 0.40082921797531834, "learning_rate": 1.2502173562500995e-07, "loss": 0.021, "step": 8426 }, { "epoch": 3.7428381079280477, "grad_norm": 0.39626883986681716, "learning_rate": 1.2459131276936876e-07, "loss": 0.0242, "step": 8427 }, { "epoch": 3.7432822562735955, "grad_norm": 0.5073039426598518, "learning_rate": 1.241616227704723e-07, "loss": 0.0278, "step": 8428 }, { "epoch": 3.7437264046191427, "grad_norm": 0.3603567472819875, "learning_rate": 1.2373266569290997e-07, "loss": 0.032, "step": 8429 }, { "epoch": 3.7441705529646905, "grad_norm": 0.4679950484890861, "learning_rate": 1.2330444160116196e-07, "loss": 0.0208, "step": 8430 }, { "epoch": 3.7446147013102378, "grad_norm": 0.468095341335435, "learning_rate": 1.2287695055959615e-07, "loss": 0.0223, "step": 8431 }, { "epoch": 3.745058849655785, "grad_norm": 0.4819559753911326, "learning_rate": 1.2245019263247283e-07, "loss": 0.0262, "step": 8432 }, { "epoch": 3.7455029980013324, "grad_norm": 0.4954170402461803, "learning_rate": 1.2202416788394067e-07, "loss": 0.0333, "step": 8433 }, { "epoch": 3.7459471463468796, "grad_norm": 0.6174858065052228, "learning_rate": 1.215988763780379e-07, "loss": 0.0313, "step": 8434 }, { "epoch": 3.7463912946924274, "grad_norm": 0.42561144433663495, "learning_rate": 1.2117431817869453e-07, "loss": 0.0277, "step": 8435 }, { "epoch": 3.7468354430379747, "grad_norm": 0.3709807838740829, "learning_rate": 1.207504933497272e-07, "loss": 0.0243, "step": 8436 }, { "epoch": 3.747279591383522, "grad_norm": 0.3000902649747397, "learning_rate": 1.2032740195484448e-07, "loss": 0.0147, "step": 8437 }, { "epoch": 3.7477237397290697, "grad_norm": 0.5581005324860493, "learning_rate": 1.1990504405764492e-07, "loss": 0.0302, "step": 8438 }, { "epoch": 3.748167888074617, "grad_norm": 0.5633312132178465, "learning_rate": 1.1948341972161492e-07, "loss": 0.0309, "step": 8439 }, { "epoch": 3.7486120364201643, "grad_norm": 0.36629488791856035, "learning_rate": 1.1906252901013271e-07, "loss": 0.0238, "step": 8440 }, { "epoch": 3.7490561847657116, "grad_norm": 0.49491041444267353, "learning_rate": 1.1864237198646544e-07, "loss": 0.0298, "step": 8441 }, { "epoch": 3.7495003331112593, "grad_norm": 0.564678297749189, "learning_rate": 1.1822294871376928e-07, "loss": 0.0305, "step": 8442 }, { "epoch": 3.7499444814568066, "grad_norm": 0.40055447684951356, "learning_rate": 1.1780425925509043e-07, "loss": 0.0272, "step": 8443 }, { "epoch": 3.750388629802354, "grad_norm": 0.470018676790432, "learning_rate": 1.1738630367336579e-07, "loss": 0.032, "step": 8444 }, { "epoch": 3.7508327781479016, "grad_norm": 0.4384932264169123, "learning_rate": 1.1696908203142066e-07, "loss": 0.0256, "step": 8445 }, { "epoch": 3.751276926493449, "grad_norm": 0.4653823127550869, "learning_rate": 1.1655259439197042e-07, "loss": 0.03, "step": 8446 }, { "epoch": 3.751721074838996, "grad_norm": 0.41947700796617987, "learning_rate": 1.1613684081762111e-07, "loss": 0.0236, "step": 8447 }, { "epoch": 3.7521652231845435, "grad_norm": 0.38536838427097736, "learning_rate": 1.1572182137086662e-07, "loss": 0.0242, "step": 8448 }, { "epoch": 3.7526093715300908, "grad_norm": 0.341291576535971, "learning_rate": 1.1530753611409151e-07, "loss": 0.0229, "step": 8449 }, { "epoch": 3.7530535198756385, "grad_norm": 0.3792987155782225, "learning_rate": 1.1489398510957039e-07, "loss": 0.0237, "step": 8450 }, { "epoch": 3.753497668221186, "grad_norm": 0.37854108977812184, "learning_rate": 1.1448116841946688e-07, "loss": 0.0287, "step": 8451 }, { "epoch": 3.7539418165667335, "grad_norm": 0.46875753276752063, "learning_rate": 1.1406908610583467e-07, "loss": 0.0311, "step": 8452 }, { "epoch": 3.754385964912281, "grad_norm": 0.4064331207182262, "learning_rate": 1.1365773823061532e-07, "loss": 0.0283, "step": 8453 }, { "epoch": 3.754830113257828, "grad_norm": 0.4002763856777381, "learning_rate": 1.1324712485564271e-07, "loss": 0.024, "step": 8454 }, { "epoch": 3.7552742616033754, "grad_norm": 0.45827533835175305, "learning_rate": 1.1283724604263857e-07, "loss": 0.0246, "step": 8455 }, { "epoch": 3.7557184099489227, "grad_norm": 0.39394686171582316, "learning_rate": 1.1242810185321473e-07, "loss": 0.03, "step": 8456 }, { "epoch": 3.7561625582944704, "grad_norm": 0.42301627464562236, "learning_rate": 1.1201969234887256e-07, "loss": 0.0321, "step": 8457 }, { "epoch": 3.7566067066400177, "grad_norm": 0.5331607328416476, "learning_rate": 1.1161201759100349e-07, "loss": 0.0279, "step": 8458 }, { "epoch": 3.757050854985565, "grad_norm": 0.44317622469926854, "learning_rate": 1.1120507764088684e-07, "loss": 0.0449, "step": 8459 }, { "epoch": 3.7574950033311127, "grad_norm": 0.3625167956575573, "learning_rate": 1.1079887255969257e-07, "loss": 0.0194, "step": 8460 }, { "epoch": 3.75793915167666, "grad_norm": 0.38235614382527433, "learning_rate": 1.1039340240848129e-07, "loss": 0.0286, "step": 8461 }, { "epoch": 3.7583833000222073, "grad_norm": 0.4781732717681245, "learning_rate": 1.0998866724820145e-07, "loss": 0.0275, "step": 8462 }, { "epoch": 3.7588274483677546, "grad_norm": 0.36277030824808554, "learning_rate": 1.0958466713969218e-07, "loss": 0.02, "step": 8463 }, { "epoch": 3.7592715967133024, "grad_norm": 0.4310996548128226, "learning_rate": 1.09181402143681e-07, "loss": 0.0271, "step": 8464 }, { "epoch": 3.7597157450588496, "grad_norm": 0.44245437066064086, "learning_rate": 1.0877887232078499e-07, "loss": 0.0221, "step": 8465 }, { "epoch": 3.760159893404397, "grad_norm": 0.45765152233325074, "learning_rate": 1.0837707773151185e-07, "loss": 0.0229, "step": 8466 }, { "epoch": 3.7606040417499447, "grad_norm": 0.5026222951099458, "learning_rate": 1.0797601843625827e-07, "loss": 0.0253, "step": 8467 }, { "epoch": 3.761048190095492, "grad_norm": 0.5808566690208385, "learning_rate": 1.0757569449530991e-07, "loss": 0.0229, "step": 8468 }, { "epoch": 3.7614923384410393, "grad_norm": 0.36877164338905205, "learning_rate": 1.0717610596884309e-07, "loss": 0.0285, "step": 8469 }, { "epoch": 3.7619364867865865, "grad_norm": 0.3700848691513958, "learning_rate": 1.0677725291692143e-07, "loss": 0.0204, "step": 8470 }, { "epoch": 3.7623806351321343, "grad_norm": 0.3125009535356397, "learning_rate": 1.0637913539950029e-07, "loss": 0.0181, "step": 8471 }, { "epoch": 3.7628247834776816, "grad_norm": 0.4808509442055829, "learning_rate": 1.0598175347642293e-07, "loss": 0.028, "step": 8472 }, { "epoch": 3.763268931823229, "grad_norm": 0.4442187709817902, "learning_rate": 1.0558510720742265e-07, "loss": 0.0316, "step": 8473 }, { "epoch": 3.7637130801687766, "grad_norm": 0.5324409399106215, "learning_rate": 1.0518919665212235e-07, "loss": 0.0319, "step": 8474 }, { "epoch": 3.764157228514324, "grad_norm": 0.42314697759763437, "learning_rate": 1.0479402187003496e-07, "loss": 0.0222, "step": 8475 }, { "epoch": 3.764601376859871, "grad_norm": 0.378666602483765, "learning_rate": 1.0439958292056074e-07, "loss": 0.029, "step": 8476 }, { "epoch": 3.7650455252054185, "grad_norm": 0.5003419916114011, "learning_rate": 1.040058798629906e-07, "loss": 0.0259, "step": 8477 }, { "epoch": 3.7654896735509658, "grad_norm": 0.5187802895889871, "learning_rate": 1.0361291275650498e-07, "loss": 0.0412, "step": 8478 }, { "epoch": 3.7659338218965135, "grad_norm": 0.41358816649715524, "learning_rate": 1.0322068166017386e-07, "loss": 0.0325, "step": 8479 }, { "epoch": 3.766377970242061, "grad_norm": 0.3554104231788337, "learning_rate": 1.0282918663295616e-07, "loss": 0.0266, "step": 8480 }, { "epoch": 3.7668221185876085, "grad_norm": 0.433977374572574, "learning_rate": 1.0243842773369983e-07, "loss": 0.0262, "step": 8481 }, { "epoch": 3.767266266933156, "grad_norm": 0.3050089012609615, "learning_rate": 1.0204840502114288e-07, "loss": 0.016, "step": 8482 }, { "epoch": 3.767710415278703, "grad_norm": 0.35035155995481493, "learning_rate": 1.0165911855391286e-07, "loss": 0.022, "step": 8483 }, { "epoch": 3.7681545636242504, "grad_norm": 0.3852773325944005, "learning_rate": 1.0127056839052462e-07, "loss": 0.017, "step": 8484 }, { "epoch": 3.7685987119697977, "grad_norm": 0.43026798549784784, "learning_rate": 1.0088275458938535e-07, "loss": 0.0246, "step": 8485 }, { "epoch": 3.7690428603153454, "grad_norm": 0.49903822216529625, "learning_rate": 1.004956772087895e-07, "loss": 0.0278, "step": 8486 }, { "epoch": 3.7694870086608927, "grad_norm": 0.30382121537402157, "learning_rate": 1.0010933630692166e-07, "loss": 0.0168, "step": 8487 }, { "epoch": 3.76993115700644, "grad_norm": 0.41575029483477577, "learning_rate": 9.972373194185481e-08, "loss": 0.0273, "step": 8488 }, { "epoch": 3.7703753053519877, "grad_norm": 0.425652535729631, "learning_rate": 9.933886417155258e-08, "loss": 0.0258, "step": 8489 }, { "epoch": 3.770819453697535, "grad_norm": 0.4620847686387116, "learning_rate": 9.895473305386593e-08, "loss": 0.024, "step": 8490 }, { "epoch": 3.7712636020430823, "grad_norm": 0.4272116851275043, "learning_rate": 9.857133864653812e-08, "loss": 0.0246, "step": 8491 }, { "epoch": 3.7717077503886296, "grad_norm": 0.3869470039777947, "learning_rate": 9.818868100719803e-08, "loss": 0.0266, "step": 8492 }, { "epoch": 3.7721518987341773, "grad_norm": 0.38019015167069964, "learning_rate": 9.780676019336632e-08, "loss": 0.0235, "step": 8493 }, { "epoch": 3.7725960470797246, "grad_norm": 0.6659937799546494, "learning_rate": 9.742557626245264e-08, "loss": 0.0336, "step": 8494 }, { "epoch": 3.773040195425272, "grad_norm": 0.5490763929102886, "learning_rate": 9.704512927175502e-08, "loss": 0.0325, "step": 8495 }, { "epoch": 3.7734843437708196, "grad_norm": 0.5157099798547079, "learning_rate": 9.666541927846107e-08, "loss": 0.0408, "step": 8496 }, { "epoch": 3.773928492116367, "grad_norm": 0.6662930057806212, "learning_rate": 9.62864463396479e-08, "loss": 0.0181, "step": 8497 }, { "epoch": 3.7743726404619142, "grad_norm": 0.36908564910307085, "learning_rate": 9.590821051228105e-08, "loss": 0.0196, "step": 8498 }, { "epoch": 3.7748167888074615, "grad_norm": 0.34743147522098267, "learning_rate": 9.553071185321616e-08, "loss": 0.0177, "step": 8499 }, { "epoch": 3.7752609371530093, "grad_norm": 0.4868308303940559, "learning_rate": 9.515395041919839e-08, "loss": 0.0292, "step": 8500 }, { "epoch": 3.7757050854985565, "grad_norm": 0.4939860503734759, "learning_rate": 9.477792626685966e-08, "loss": 0.026, "step": 8501 }, { "epoch": 3.776149233844104, "grad_norm": 0.31320876766724953, "learning_rate": 9.440263945272365e-08, "loss": 0.018, "step": 8502 }, { "epoch": 3.7765933821896516, "grad_norm": 0.36925404461957756, "learning_rate": 9.402809003320357e-08, "loss": 0.022, "step": 8503 }, { "epoch": 3.777037530535199, "grad_norm": 0.5099202980127734, "learning_rate": 9.365427806459826e-08, "loss": 0.0219, "step": 8504 }, { "epoch": 3.777481678880746, "grad_norm": 0.4048961589102498, "learning_rate": 9.32812036031e-08, "loss": 0.02, "step": 8505 }, { "epoch": 3.7779258272262934, "grad_norm": 0.3828893218593422, "learning_rate": 9.290886670478727e-08, "loss": 0.0329, "step": 8506 }, { "epoch": 3.7783699755718407, "grad_norm": 0.47619859186660657, "learning_rate": 9.253726742562808e-08, "loss": 0.0339, "step": 8507 }, { "epoch": 3.7788141239173885, "grad_norm": 0.3817560157892672, "learning_rate": 9.216640582148218e-08, "loss": 0.0245, "step": 8508 }, { "epoch": 3.7792582722629358, "grad_norm": 0.44955998588034224, "learning_rate": 9.179628194809387e-08, "loss": 0.036, "step": 8509 }, { "epoch": 3.7797024206084835, "grad_norm": 0.39409371816193695, "learning_rate": 9.142689586110032e-08, "loss": 0.0243, "step": 8510 }, { "epoch": 3.780146568954031, "grad_norm": 0.3329695501761576, "learning_rate": 9.105824761602711e-08, "loss": 0.0177, "step": 8511 }, { "epoch": 3.780590717299578, "grad_norm": 0.41637622163142424, "learning_rate": 9.069033726828657e-08, "loss": 0.0243, "step": 8512 }, { "epoch": 3.7810348656451254, "grad_norm": 0.513878740141547, "learning_rate": 9.032316487318338e-08, "loss": 0.0422, "step": 8513 }, { "epoch": 3.7814790139906727, "grad_norm": 0.3779536127482641, "learning_rate": 8.995673048591002e-08, "loss": 0.0243, "step": 8514 }, { "epoch": 3.7819231623362204, "grad_norm": 0.40656693503772895, "learning_rate": 8.959103416154635e-08, "loss": 0.0204, "step": 8515 }, { "epoch": 3.7823673106817677, "grad_norm": 0.37125663642968887, "learning_rate": 8.922607595506339e-08, "loss": 0.0222, "step": 8516 }, { "epoch": 3.782811459027315, "grad_norm": 0.4060879405476042, "learning_rate": 8.886185592132113e-08, "loss": 0.0213, "step": 8517 }, { "epoch": 3.7832556073728627, "grad_norm": 0.4488358277812707, "learning_rate": 8.849837411506745e-08, "loss": 0.0344, "step": 8518 }, { "epoch": 3.78369975571841, "grad_norm": 0.4066455322610377, "learning_rate": 8.813563059093977e-08, "loss": 0.0242, "step": 8519 }, { "epoch": 3.7841439040639573, "grad_norm": 0.4551394122387141, "learning_rate": 8.777362540346501e-08, "loss": 0.0293, "step": 8520 }, { "epoch": 3.7845880524095046, "grad_norm": 0.42457221786596655, "learning_rate": 8.741235860705855e-08, "loss": 0.0259, "step": 8521 }, { "epoch": 3.7850322007550523, "grad_norm": 0.3043935300931366, "learning_rate": 8.70518302560247e-08, "loss": 0.0182, "step": 8522 }, { "epoch": 3.7854763491005996, "grad_norm": 0.4462173753124753, "learning_rate": 8.669204040455737e-08, "loss": 0.0265, "step": 8523 }, { "epoch": 3.785920497446147, "grad_norm": 0.41601964127930957, "learning_rate": 8.633298910673826e-08, "loss": 0.0288, "step": 8524 }, { "epoch": 3.7863646457916946, "grad_norm": 0.2813068890343098, "learning_rate": 8.597467641654034e-08, "loss": 0.0212, "step": 8525 }, { "epoch": 3.786808794137242, "grad_norm": 0.43623034836188324, "learning_rate": 8.561710238782272e-08, "loss": 0.0249, "step": 8526 }, { "epoch": 3.787252942482789, "grad_norm": 0.5751855788252586, "learning_rate": 8.526026707433577e-08, "loss": 0.0413, "step": 8527 }, { "epoch": 3.7876970908283365, "grad_norm": 0.3868182612344314, "learning_rate": 8.490417052971766e-08, "loss": 0.0281, "step": 8528 }, { "epoch": 3.7881412391738842, "grad_norm": 0.356372518499826, "learning_rate": 8.45488128074945e-08, "loss": 0.022, "step": 8529 }, { "epoch": 3.7885853875194315, "grad_norm": 0.32262084504669236, "learning_rate": 8.419419396108464e-08, "loss": 0.0168, "step": 8530 }, { "epoch": 3.789029535864979, "grad_norm": 0.4043798709101995, "learning_rate": 8.384031404379211e-08, "loss": 0.024, "step": 8531 }, { "epoch": 3.7894736842105265, "grad_norm": 0.41478103237109665, "learning_rate": 8.34871731088116e-08, "loss": 0.027, "step": 8532 }, { "epoch": 3.789917832556074, "grad_norm": 0.4196449951874737, "learning_rate": 8.313477120922563e-08, "loss": 0.0209, "step": 8533 }, { "epoch": 3.790361980901621, "grad_norm": 0.4317330818971433, "learning_rate": 8.278310839800685e-08, "loss": 0.0284, "step": 8534 }, { "epoch": 3.7908061292471684, "grad_norm": 0.44713201608661013, "learning_rate": 8.243218472801461e-08, "loss": 0.0438, "step": 8535 }, { "epoch": 3.7912502775927157, "grad_norm": 0.4371441171214935, "learning_rate": 8.208200025200119e-08, "loss": 0.0201, "step": 8536 }, { "epoch": 3.7916944259382634, "grad_norm": 0.5043700394965174, "learning_rate": 8.173255502260336e-08, "loss": 0.0285, "step": 8537 }, { "epoch": 3.7921385742838107, "grad_norm": 0.3893482313023411, "learning_rate": 8.138384909234964e-08, "loss": 0.0258, "step": 8538 }, { "epoch": 3.7925827226293585, "grad_norm": 0.3403931667558864, "learning_rate": 8.103588251365534e-08, "loss": 0.0199, "step": 8539 }, { "epoch": 3.7930268709749058, "grad_norm": 0.4249904672408077, "learning_rate": 8.068865533882752e-08, "loss": 0.0221, "step": 8540 }, { "epoch": 3.793471019320453, "grad_norm": 0.4067407706983934, "learning_rate": 8.034216762005831e-08, "loss": 0.0374, "step": 8541 }, { "epoch": 3.7939151676660003, "grad_norm": 0.41570749972762994, "learning_rate": 7.99964194094327e-08, "loss": 0.024, "step": 8542 }, { "epoch": 3.7943593160115476, "grad_norm": 0.48499039450946113, "learning_rate": 7.96514107589208e-08, "loss": 0.0341, "step": 8543 }, { "epoch": 3.7948034643570954, "grad_norm": 0.40373614464609714, "learning_rate": 7.93071417203839e-08, "loss": 0.0268, "step": 8544 }, { "epoch": 3.7952476127026427, "grad_norm": 0.35452268236632206, "learning_rate": 7.896361234557226e-08, "loss": 0.0192, "step": 8545 }, { "epoch": 3.79569176104819, "grad_norm": 0.4666753180876556, "learning_rate": 7.862082268612237e-08, "loss": 0.0342, "step": 8546 }, { "epoch": 3.7961359093937377, "grad_norm": 0.39409303578975985, "learning_rate": 7.8278772793563e-08, "loss": 0.0244, "step": 8547 }, { "epoch": 3.796580057739285, "grad_norm": 0.543942317931089, "learning_rate": 7.793746271930968e-08, "loss": 0.0232, "step": 8548 }, { "epoch": 3.7970242060848323, "grad_norm": 0.4331154507830711, "learning_rate": 7.759689251466695e-08, "loss": 0.0314, "step": 8549 }, { "epoch": 3.7974683544303796, "grad_norm": 0.4001739441704062, "learning_rate": 7.72570622308283e-08, "loss": 0.0201, "step": 8550 }, { "epoch": 3.7979125027759273, "grad_norm": 0.4891526654283636, "learning_rate": 7.691797191887618e-08, "loss": 0.027, "step": 8551 }, { "epoch": 3.7983566511214746, "grad_norm": 0.40920185175638346, "learning_rate": 7.657962162978038e-08, "loss": 0.0264, "step": 8552 }, { "epoch": 3.798800799467022, "grad_norm": 0.4710270896413282, "learning_rate": 7.624201141440301e-08, "loss": 0.0295, "step": 8553 }, { "epoch": 3.7992449478125696, "grad_norm": 0.3793884463205298, "learning_rate": 7.59051413234907e-08, "loss": 0.0162, "step": 8554 }, { "epoch": 3.799689096158117, "grad_norm": 0.2877108795069413, "learning_rate": 7.556901140768125e-08, "loss": 0.0134, "step": 8555 }, { "epoch": 3.800133244503664, "grad_norm": 0.3350085177589909, "learning_rate": 7.523362171750148e-08, "loss": 0.0139, "step": 8556 }, { "epoch": 3.8005773928492115, "grad_norm": 0.4349812650446206, "learning_rate": 7.489897230336496e-08, "loss": 0.0276, "step": 8557 }, { "epoch": 3.801021541194759, "grad_norm": 0.3673667656791951, "learning_rate": 7.456506321557533e-08, "loss": 0.0212, "step": 8558 }, { "epoch": 3.8014656895403065, "grad_norm": 0.4327339896844779, "learning_rate": 7.423189450432633e-08, "loss": 0.0234, "step": 8559 }, { "epoch": 3.801909837885854, "grad_norm": 0.3414463099110009, "learning_rate": 7.389946621969679e-08, "loss": 0.02, "step": 8560 }, { "epoch": 3.8023539862314015, "grad_norm": 0.35401161038931783, "learning_rate": 7.356777841165786e-08, "loss": 0.0236, "step": 8561 }, { "epoch": 3.802798134576949, "grad_norm": 0.3318458174565355, "learning_rate": 7.32368311300674e-08, "loss": 0.0228, "step": 8562 }, { "epoch": 3.803242282922496, "grad_norm": 0.4407338624172669, "learning_rate": 7.290662442467178e-08, "loss": 0.0243, "step": 8563 }, { "epoch": 3.8036864312680434, "grad_norm": 0.3688164392706434, "learning_rate": 7.257715834510737e-08, "loss": 0.0194, "step": 8564 }, { "epoch": 3.8041305796135907, "grad_norm": 0.4497260712320215, "learning_rate": 7.224843294089844e-08, "loss": 0.0287, "step": 8565 }, { "epoch": 3.8045747279591384, "grad_norm": 0.33236407987313205, "learning_rate": 7.192044826145772e-08, "loss": 0.0191, "step": 8566 }, { "epoch": 3.8050188763046857, "grad_norm": 0.37564613529503493, "learning_rate": 7.159320435608741e-08, "loss": 0.0161, "step": 8567 }, { "epoch": 3.8054630246502335, "grad_norm": 0.3522232268407622, "learning_rate": 7.126670127397705e-08, "loss": 0.0215, "step": 8568 }, { "epoch": 3.8059071729957807, "grad_norm": 0.429595537532537, "learning_rate": 7.094093906420629e-08, "loss": 0.0288, "step": 8569 }, { "epoch": 3.806351321341328, "grad_norm": 0.4178495453889158, "learning_rate": 7.061591777574261e-08, "loss": 0.0273, "step": 8570 }, { "epoch": 3.8067954696868753, "grad_norm": 0.34370415504576896, "learning_rate": 7.029163745744194e-08, "loss": 0.0198, "step": 8571 }, { "epoch": 3.8072396180324226, "grad_norm": 0.6738541750686097, "learning_rate": 6.996809815804917e-08, "loss": 0.0216, "step": 8572 }, { "epoch": 3.8076837663779703, "grad_norm": 0.3104036826233125, "learning_rate": 6.964529992619817e-08, "loss": 0.0158, "step": 8573 }, { "epoch": 3.8081279147235176, "grad_norm": 0.3875694214420725, "learning_rate": 6.932324281041014e-08, "loss": 0.0282, "step": 8574 }, { "epoch": 3.808572063069065, "grad_norm": 0.7370247616254277, "learning_rate": 6.900192685909635e-08, "loss": 0.0427, "step": 8575 }, { "epoch": 3.8090162114146127, "grad_norm": 0.3713027373129603, "learning_rate": 6.868135212055649e-08, "loss": 0.0273, "step": 8576 }, { "epoch": 3.80946035976016, "grad_norm": 0.4118005174789893, "learning_rate": 6.836151864297702e-08, "loss": 0.0256, "step": 8577 }, { "epoch": 3.8099045081057072, "grad_norm": 0.47276389861446366, "learning_rate": 6.80424264744356e-08, "loss": 0.0345, "step": 8578 }, { "epoch": 3.8103486564512545, "grad_norm": 0.49450924911427574, "learning_rate": 6.772407566289718e-08, "loss": 0.0299, "step": 8579 }, { "epoch": 3.8107928047968023, "grad_norm": 0.3914626594412097, "learning_rate": 6.740646625621461e-08, "loss": 0.0244, "step": 8580 }, { "epoch": 3.8112369531423496, "grad_norm": 0.3930999395745575, "learning_rate": 6.708959830213024e-08, "loss": 0.0359, "step": 8581 }, { "epoch": 3.811681101487897, "grad_norm": 0.49573525692180653, "learning_rate": 6.677347184827487e-08, "loss": 0.0319, "step": 8582 }, { "epoch": 3.8121252498334446, "grad_norm": 0.49861370512502395, "learning_rate": 6.645808694216715e-08, "loss": 0.0256, "step": 8583 }, { "epoch": 3.812569398178992, "grad_norm": 0.4401777666781036, "learning_rate": 6.614344363121583e-08, "loss": 0.0223, "step": 8584 }, { "epoch": 3.813013546524539, "grad_norm": 0.36977940669747483, "learning_rate": 6.582954196271641e-08, "loss": 0.0214, "step": 8585 }, { "epoch": 3.8134576948700865, "grad_norm": 0.4927485327079211, "learning_rate": 6.55163819838528e-08, "loss": 0.0287, "step": 8586 }, { "epoch": 3.8139018432156337, "grad_norm": 0.37546001627014713, "learning_rate": 6.520396374170013e-08, "loss": 0.0229, "step": 8587 }, { "epoch": 3.8143459915611815, "grad_norm": 0.4196889003684699, "learning_rate": 6.489228728321917e-08, "loss": 0.0276, "step": 8588 }, { "epoch": 3.8147901399067288, "grad_norm": 0.4820349858082287, "learning_rate": 6.458135265525967e-08, "loss": 0.0274, "step": 8589 }, { "epoch": 3.8152342882522765, "grad_norm": 0.3797443254174065, "learning_rate": 6.427115990456201e-08, "loss": 0.0237, "step": 8590 }, { "epoch": 3.815678436597824, "grad_norm": 0.4390761052826011, "learning_rate": 6.396170907775167e-08, "loss": 0.03, "step": 8591 }, { "epoch": 3.816122584943371, "grad_norm": 0.32997954286919473, "learning_rate": 6.365300022134479e-08, "loss": 0.0242, "step": 8592 }, { "epoch": 3.8165667332889184, "grad_norm": 0.45875132604087615, "learning_rate": 6.334503338174646e-08, "loss": 0.0269, "step": 8593 }, { "epoch": 3.8170108816344657, "grad_norm": 0.48148563808244804, "learning_rate": 6.303780860524855e-08, "loss": 0.0273, "step": 8594 }, { "epoch": 3.8174550299800134, "grad_norm": 0.4353447524714429, "learning_rate": 6.273132593803189e-08, "loss": 0.0195, "step": 8595 }, { "epoch": 3.8178991783255607, "grad_norm": 0.38567398492319127, "learning_rate": 6.242558542616739e-08, "loss": 0.0193, "step": 8596 }, { "epoch": 3.818343326671108, "grad_norm": 0.4173237164922603, "learning_rate": 6.212058711561165e-08, "loss": 0.0278, "step": 8597 }, { "epoch": 3.8187874750166557, "grad_norm": 0.3154210166282003, "learning_rate": 6.18163310522113e-08, "loss": 0.0189, "step": 8598 }, { "epoch": 3.819231623362203, "grad_norm": 0.5134503050749415, "learning_rate": 6.151281728170144e-08, "loss": 0.0289, "step": 8599 }, { "epoch": 3.8196757717077503, "grad_norm": 0.46640864477037924, "learning_rate": 6.121004584970558e-08, "loss": 0.0308, "step": 8600 }, { "epoch": 3.8201199200532976, "grad_norm": 0.36566694826665086, "learning_rate": 6.090801680173563e-08, "loss": 0.022, "step": 8601 }, { "epoch": 3.8205640683988453, "grad_norm": 0.39260388522181044, "learning_rate": 6.060673018319085e-08, "loss": 0.033, "step": 8602 }, { "epoch": 3.8210082167443926, "grad_norm": 0.32628501291346773, "learning_rate": 6.030618603935945e-08, "loss": 0.0195, "step": 8603 }, { "epoch": 3.82145236508994, "grad_norm": 0.5032358609244834, "learning_rate": 6.000638441542029e-08, "loss": 0.0367, "step": 8604 }, { "epoch": 3.8218965134354876, "grad_norm": 0.49029558220343267, "learning_rate": 5.970732535643675e-08, "loss": 0.0357, "step": 8605 }, { "epoch": 3.822340661781035, "grad_norm": 0.4041938487036102, "learning_rate": 5.94090089073629e-08, "loss": 0.0192, "step": 8606 }, { "epoch": 3.8227848101265822, "grad_norm": 0.35662580614791967, "learning_rate": 5.911143511304174e-08, "loss": 0.0223, "step": 8607 }, { "epoch": 3.8232289584721295, "grad_norm": 0.42334321858420404, "learning_rate": 5.8814604018202494e-08, "loss": 0.0333, "step": 8608 }, { "epoch": 3.8236731068176772, "grad_norm": 0.3454601103621082, "learning_rate": 5.851851566746392e-08, "loss": 0.0184, "step": 8609 }, { "epoch": 3.8241172551632245, "grad_norm": 0.39337857078220745, "learning_rate": 5.8223170105333734e-08, "loss": 0.0228, "step": 8610 }, { "epoch": 3.824561403508772, "grad_norm": 0.3888924660914583, "learning_rate": 5.792856737620756e-08, "loss": 0.0222, "step": 8611 }, { "epoch": 3.8250055518543196, "grad_norm": 0.3633339990171769, "learning_rate": 5.763470752436884e-08, "loss": 0.0178, "step": 8612 }, { "epoch": 3.825449700199867, "grad_norm": 0.5069652365071539, "learning_rate": 5.734159059398947e-08, "loss": 0.0311, "step": 8613 }, { "epoch": 3.825893848545414, "grad_norm": 0.36481775204860845, "learning_rate": 5.7049216629129764e-08, "loss": 0.0214, "step": 8614 }, { "epoch": 3.8263379968909614, "grad_norm": 0.4066225715814007, "learning_rate": 5.6757585673739014e-08, "loss": 0.0196, "step": 8615 }, { "epoch": 3.8267821452365087, "grad_norm": 0.38508300233088066, "learning_rate": 5.6466697771654365e-08, "loss": 0.0359, "step": 8616 }, { "epoch": 3.8272262935820565, "grad_norm": 0.3857136200508219, "learning_rate": 5.617655296660085e-08, "loss": 0.0295, "step": 8617 }, { "epoch": 3.8276704419276038, "grad_norm": 0.38361748458889544, "learning_rate": 5.5887151302192465e-08, "loss": 0.0225, "step": 8618 }, { "epoch": 3.8281145902731515, "grad_norm": 0.4136696628077852, "learning_rate": 5.5598492821931083e-08, "loss": 0.0259, "step": 8619 }, { "epoch": 3.8285587386186988, "grad_norm": 0.4157902458106096, "learning_rate": 5.531057756920644e-08, "loss": 0.0299, "step": 8620 }, { "epoch": 3.829002886964246, "grad_norm": 0.41824752196070275, "learning_rate": 5.502340558729835e-08, "loss": 0.0276, "step": 8621 }, { "epoch": 3.8294470353097934, "grad_norm": 0.3542732476447781, "learning_rate": 5.4736976919372295e-08, "loss": 0.0214, "step": 8622 }, { "epoch": 3.8298911836553406, "grad_norm": 0.47614032589088634, "learning_rate": 5.445129160848384e-08, "loss": 0.0271, "step": 8623 }, { "epoch": 3.8303353320008884, "grad_norm": 0.38508281181821147, "learning_rate": 5.416634969757695e-08, "loss": 0.0184, "step": 8624 }, { "epoch": 3.8307794803464357, "grad_norm": 0.3910260501124829, "learning_rate": 5.388215122948237e-08, "loss": 0.0259, "step": 8625 }, { "epoch": 3.831223628691983, "grad_norm": 0.31939094873163437, "learning_rate": 5.359869624692038e-08, "loss": 0.0197, "step": 8626 }, { "epoch": 3.8316677770375307, "grad_norm": 0.30534186293474436, "learning_rate": 5.331598479249911e-08, "loss": 0.0201, "step": 8627 }, { "epoch": 3.832111925383078, "grad_norm": 0.33277037895962464, "learning_rate": 5.303401690871457e-08, "loss": 0.0203, "step": 8628 }, { "epoch": 3.8325560737286253, "grad_norm": 0.46652287015112265, "learning_rate": 5.275279263795175e-08, "loss": 0.0282, "step": 8629 }, { "epoch": 3.8330002220741726, "grad_norm": 0.3848369302003942, "learning_rate": 5.2472312022483486e-08, "loss": 0.0258, "step": 8630 }, { "epoch": 3.8334443704197203, "grad_norm": 0.4448218074585927, "learning_rate": 5.2192575104469956e-08, "loss": 0.0254, "step": 8631 }, { "epoch": 3.8338885187652676, "grad_norm": 0.384660032948373, "learning_rate": 5.1913581925960853e-08, "loss": 0.0298, "step": 8632 }, { "epoch": 3.834332667110815, "grad_norm": 0.4204621984497172, "learning_rate": 5.16353325288943e-08, "loss": 0.0242, "step": 8633 }, { "epoch": 3.8347768154563626, "grad_norm": 0.46188483366801475, "learning_rate": 5.135782695509461e-08, "loss": 0.0231, "step": 8634 }, { "epoch": 3.83522096380191, "grad_norm": 0.31629549729913453, "learning_rate": 5.1081065246277314e-08, "loss": 0.0158, "step": 8635 }, { "epoch": 3.835665112147457, "grad_norm": 0.4353107611550551, "learning_rate": 5.0805047444042467e-08, "loss": 0.0287, "step": 8636 }, { "epoch": 3.8361092604930045, "grad_norm": 0.40299033179334315, "learning_rate": 5.0529773589881315e-08, "loss": 0.0195, "step": 8637 }, { "epoch": 3.8365534088385522, "grad_norm": 0.39085359021822447, "learning_rate": 5.0255243725171876e-08, "loss": 0.0279, "step": 8638 }, { "epoch": 3.8369975571840995, "grad_norm": 0.48519972906519826, "learning_rate": 4.998145789118114e-08, "loss": 0.0333, "step": 8639 }, { "epoch": 3.837441705529647, "grad_norm": 0.3517351658799901, "learning_rate": 4.970841612906285e-08, "loss": 0.0224, "step": 8640 }, { "epoch": 3.8378858538751945, "grad_norm": 0.4190846054803923, "learning_rate": 4.943611847986085e-08, "loss": 0.0234, "step": 8641 }, { "epoch": 3.838330002220742, "grad_norm": 0.35500926856677334, "learning_rate": 4.9164564984505723e-08, "loss": 0.0198, "step": 8642 }, { "epoch": 3.838774150566289, "grad_norm": 0.34433663873966086, "learning_rate": 4.889375568381594e-08, "loss": 0.0174, "step": 8643 }, { "epoch": 3.8392182989118364, "grad_norm": 0.38443528004632466, "learning_rate": 4.8623690618499474e-08, "loss": 0.0229, "step": 8644 }, { "epoch": 3.8396624472573837, "grad_norm": 0.6274521829246321, "learning_rate": 4.835436982915165e-08, "loss": 0.0344, "step": 8645 }, { "epoch": 3.8401065956029314, "grad_norm": 0.6266172275143739, "learning_rate": 4.808579335625563e-08, "loss": 0.0327, "step": 8646 }, { "epoch": 3.8405507439484787, "grad_norm": 0.34058549807195543, "learning_rate": 4.7817961240183567e-08, "loss": 0.0224, "step": 8647 }, { "epoch": 3.8409948922940265, "grad_norm": 0.4390835521149124, "learning_rate": 4.7550873521194364e-08, "loss": 0.0195, "step": 8648 }, { "epoch": 3.8414390406395738, "grad_norm": 0.3842688631078936, "learning_rate": 4.728453023943591e-08, "loss": 0.0231, "step": 8649 }, { "epoch": 3.841883188985121, "grad_norm": 0.4692045722646363, "learning_rate": 4.701893143494507e-08, "loss": 0.0324, "step": 8650 }, { "epoch": 3.8423273373306683, "grad_norm": 0.45747006524072764, "learning_rate": 4.675407714764491e-08, "loss": 0.0308, "step": 8651 }, { "epoch": 3.8427714856762156, "grad_norm": 0.3514460328352755, "learning_rate": 4.648996741734857e-08, "loss": 0.0207, "step": 8652 }, { "epoch": 3.8432156340217634, "grad_norm": 0.3912542737417533, "learning_rate": 4.622660228375486e-08, "loss": 0.0219, "step": 8653 }, { "epoch": 3.8436597823673107, "grad_norm": 0.3136962524397564, "learning_rate": 4.596398178645323e-08, "loss": 0.0178, "step": 8654 }, { "epoch": 3.844103930712858, "grad_norm": 0.35110936427287714, "learning_rate": 4.5702105964919305e-08, "loss": 0.0163, "step": 8655 }, { "epoch": 3.8445480790584057, "grad_norm": 0.3924180108841284, "learning_rate": 4.5440974858517174e-08, "loss": 0.0185, "step": 8656 }, { "epoch": 3.844992227403953, "grad_norm": 0.41446900223517336, "learning_rate": 4.5180588506500424e-08, "loss": 0.0294, "step": 8657 }, { "epoch": 3.8454363757495003, "grad_norm": 0.3695057701384476, "learning_rate": 4.492094694800886e-08, "loss": 0.0269, "step": 8658 }, { "epoch": 3.8458805240950475, "grad_norm": 0.3819021189729295, "learning_rate": 4.4662050222070707e-08, "loss": 0.0242, "step": 8659 }, { "epoch": 3.8463246724405953, "grad_norm": 0.4011303876812642, "learning_rate": 4.440389836760317e-08, "loss": 0.028, "step": 8660 }, { "epoch": 3.8467688207861426, "grad_norm": 0.42320733559927765, "learning_rate": 4.414649142341021e-08, "loss": 0.023, "step": 8661 }, { "epoch": 3.84721296913169, "grad_norm": 0.4205022645129385, "learning_rate": 4.388982942818476e-08, "loss": 0.0261, "step": 8662 }, { "epoch": 3.8476571174772376, "grad_norm": 0.3312900714579485, "learning_rate": 4.363391242050819e-08, "loss": 0.0191, "step": 8663 }, { "epoch": 3.848101265822785, "grad_norm": 0.40420539853558396, "learning_rate": 4.3378740438848045e-08, "loss": 0.0317, "step": 8664 }, { "epoch": 3.848545414168332, "grad_norm": 0.41550220010293626, "learning_rate": 4.312431352156143e-08, "loss": 0.0255, "step": 8665 }, { "epoch": 3.8489895625138795, "grad_norm": 0.34141480859605966, "learning_rate": 4.287063170689332e-08, "loss": 0.0219, "step": 8666 }, { "epoch": 3.849433710859427, "grad_norm": 0.38457339309127786, "learning_rate": 4.261769503297597e-08, "loss": 0.0241, "step": 8667 }, { "epoch": 3.8498778592049745, "grad_norm": 0.3865401623267944, "learning_rate": 4.236550353783009e-08, "loss": 0.0177, "step": 8668 }, { "epoch": 3.850322007550522, "grad_norm": 0.40767308229409815, "learning_rate": 4.211405725936535e-08, "loss": 0.0255, "step": 8669 }, { "epoch": 3.8507661558960695, "grad_norm": 0.3151055767858847, "learning_rate": 4.186335623537707e-08, "loss": 0.0233, "step": 8670 }, { "epoch": 3.851210304241617, "grad_norm": 0.3928367577202992, "learning_rate": 4.1613400503550114e-08, "loss": 0.0212, "step": 8671 }, { "epoch": 3.851654452587164, "grad_norm": 0.4221600614877391, "learning_rate": 4.13641901014572e-08, "loss": 0.034, "step": 8672 }, { "epoch": 3.8520986009327114, "grad_norm": 0.3485554952889989, "learning_rate": 4.1115725066559476e-08, "loss": 0.0189, "step": 8673 }, { "epoch": 3.8525427492782587, "grad_norm": 0.4184669816003682, "learning_rate": 4.086800543620484e-08, "loss": 0.0265, "step": 8674 }, { "epoch": 3.8529868976238064, "grad_norm": 0.40633919115050987, "learning_rate": 4.062103124763017e-08, "loss": 0.0178, "step": 8675 }, { "epoch": 3.8534310459693537, "grad_norm": 0.5255893189755798, "learning_rate": 4.0374802537959114e-08, "loss": 0.0255, "step": 8676 }, { "epoch": 3.8538751943149014, "grad_norm": 0.4396123179486424, "learning_rate": 4.012931934420483e-08, "loss": 0.0317, "step": 8677 }, { "epoch": 3.8543193426604487, "grad_norm": 0.3657385642630762, "learning_rate": 3.9884581703267254e-08, "loss": 0.022, "step": 8678 }, { "epoch": 3.854763491005996, "grad_norm": 0.34626474417977715, "learning_rate": 3.964058965193473e-08, "loss": 0.022, "step": 8679 }, { "epoch": 3.8552076393515433, "grad_norm": 0.38562426634481894, "learning_rate": 3.939734322688349e-08, "loss": 0.0338, "step": 8680 }, { "epoch": 3.8556517876970906, "grad_norm": 0.4035987782299242, "learning_rate": 3.9154842464677045e-08, "loss": 0.0236, "step": 8681 }, { "epoch": 3.8560959360426383, "grad_norm": 0.4159647047153014, "learning_rate": 3.8913087401767914e-08, "loss": 0.024, "step": 8682 }, { "epoch": 3.8565400843881856, "grad_norm": 0.5090881621984651, "learning_rate": 3.867207807449591e-08, "loss": 0.03, "step": 8683 }, { "epoch": 3.856984232733733, "grad_norm": 0.3724391004939136, "learning_rate": 3.843181451908928e-08, "loss": 0.0222, "step": 8684 }, { "epoch": 3.8574283810792807, "grad_norm": 0.3994107927570375, "learning_rate": 3.8192296771663026e-08, "loss": 0.0365, "step": 8685 }, { "epoch": 3.857872529424828, "grad_norm": 0.534388923736043, "learning_rate": 3.795352486822057e-08, "loss": 0.0286, "step": 8686 }, { "epoch": 3.8583166777703752, "grad_norm": 0.3902311138190663, "learning_rate": 3.7715498844653755e-08, "loss": 0.0332, "step": 8687 }, { "epoch": 3.8587608261159225, "grad_norm": 0.6121843349896643, "learning_rate": 3.7478218736742286e-08, "loss": 0.0354, "step": 8688 }, { "epoch": 3.8592049744614703, "grad_norm": 0.3370534483485381, "learning_rate": 3.724168458015265e-08, "loss": 0.0221, "step": 8689 }, { "epoch": 3.8596491228070176, "grad_norm": 0.450573063405829, "learning_rate": 3.700589641044083e-08, "loss": 0.0225, "step": 8690 }, { "epoch": 3.860093271152565, "grad_norm": 0.49278309629574163, "learning_rate": 3.677085426304905e-08, "loss": 0.029, "step": 8691 }, { "epoch": 3.8605374194981126, "grad_norm": 0.4055229539649227, "learning_rate": 3.6536558173308476e-08, "loss": 0.025, "step": 8692 }, { "epoch": 3.86098156784366, "grad_norm": 0.3771030172546445, "learning_rate": 3.630300817643762e-08, "loss": 0.0196, "step": 8693 }, { "epoch": 3.861425716189207, "grad_norm": 0.4460409041626926, "learning_rate": 3.607020430754338e-08, "loss": 0.0263, "step": 8694 }, { "epoch": 3.8618698645347544, "grad_norm": 0.36455946204318906, "learning_rate": 3.583814660161944e-08, "loss": 0.0254, "step": 8695 }, { "epoch": 3.862314012880302, "grad_norm": 0.45869268016899095, "learning_rate": 3.5606835093548456e-08, "loss": 0.0364, "step": 8696 }, { "epoch": 3.8627581612258495, "grad_norm": 0.3785302300726189, "learning_rate": 3.537626981810094e-08, "loss": 0.0203, "step": 8697 }, { "epoch": 3.8632023095713968, "grad_norm": 0.41102132869975794, "learning_rate": 3.514645080993362e-08, "loss": 0.0304, "step": 8698 }, { "epoch": 3.8636464579169445, "grad_norm": 0.3842024608267308, "learning_rate": 3.49173781035933e-08, "loss": 0.0217, "step": 8699 }, { "epoch": 3.864090606262492, "grad_norm": 0.5453458258267285, "learning_rate": 3.4689051733513e-08, "loss": 0.0423, "step": 8700 }, { "epoch": 3.864534754608039, "grad_norm": 0.4282646993236422, "learning_rate": 3.446147173401415e-08, "loss": 0.0283, "step": 8701 }, { "epoch": 3.8649789029535864, "grad_norm": 0.3536636171621975, "learning_rate": 3.4234638139306055e-08, "loss": 0.0239, "step": 8702 }, { "epoch": 3.8654230512991337, "grad_norm": 0.32067352971181584, "learning_rate": 3.4008550983484766e-08, "loss": 0.019, "step": 8703 }, { "epoch": 3.8658671996446814, "grad_norm": 0.42623278208292326, "learning_rate": 3.378321030053644e-08, "loss": 0.0235, "step": 8704 }, { "epoch": 3.8663113479902287, "grad_norm": 0.4469975288314765, "learning_rate": 3.355861612433231e-08, "loss": 0.0309, "step": 8705 }, { "epoch": 3.8667554963357764, "grad_norm": 0.4959636073318917, "learning_rate": 3.3334768488633706e-08, "loss": 0.0284, "step": 8706 }, { "epoch": 3.8671996446813237, "grad_norm": 0.503087622712729, "learning_rate": 3.31116674270876e-08, "loss": 0.0256, "step": 8707 }, { "epoch": 3.867643793026871, "grad_norm": 0.36447403400661404, "learning_rate": 3.2889312973231616e-08, "loss": 0.0187, "step": 8708 }, { "epoch": 3.8680879413724183, "grad_norm": 0.39131073302761393, "learning_rate": 3.266770516048734e-08, "loss": 0.0227, "step": 8709 }, { "epoch": 3.8685320897179656, "grad_norm": 0.39335266998944496, "learning_rate": 3.2446844022167576e-08, "loss": 0.0302, "step": 8710 }, { "epoch": 3.8689762380635133, "grad_norm": 0.3587677625718898, "learning_rate": 3.2226729591471326e-08, "loss": 0.0196, "step": 8711 }, { "epoch": 3.8694203864090606, "grad_norm": 0.360223764784636, "learning_rate": 3.2007361901485455e-08, "loss": 0.0202, "step": 8712 }, { "epoch": 3.869864534754608, "grad_norm": 0.45416360753066903, "learning_rate": 3.1788740985184144e-08, "loss": 0.0296, "step": 8713 }, { "epoch": 3.8703086831001556, "grad_norm": 0.4540255704066019, "learning_rate": 3.1570866875430536e-08, "loss": 0.03, "step": 8714 }, { "epoch": 3.870752831445703, "grad_norm": 0.39674557183309156, "learning_rate": 3.135373960497401e-08, "loss": 0.0246, "step": 8715 }, { "epoch": 3.87119697979125, "grad_norm": 0.4615034431538654, "learning_rate": 3.113735920645344e-08, "loss": 0.0209, "step": 8716 }, { "epoch": 3.8716411281367975, "grad_norm": 0.40129516372174695, "learning_rate": 3.092172571239338e-08, "loss": 0.0239, "step": 8717 }, { "epoch": 3.8720852764823452, "grad_norm": 0.3795932466717671, "learning_rate": 3.070683915520845e-08, "loss": 0.0275, "step": 8718 }, { "epoch": 3.8725294248278925, "grad_norm": 0.5210559447705239, "learning_rate": 3.049269956719891e-08, "loss": 0.0261, "step": 8719 }, { "epoch": 3.87297357317344, "grad_norm": 0.4437613175714331, "learning_rate": 3.0279306980554034e-08, "loss": 0.0295, "step": 8720 }, { "epoch": 3.8734177215189876, "grad_norm": 0.3896516822182538, "learning_rate": 3.006666142734982e-08, "loss": 0.0246, "step": 8721 }, { "epoch": 3.873861869864535, "grad_norm": 0.43115789241542724, "learning_rate": 2.9854762939551254e-08, "loss": 0.0314, "step": 8722 }, { "epoch": 3.874306018210082, "grad_norm": 0.34436582503878505, "learning_rate": 2.9643611549008967e-08, "loss": 0.0219, "step": 8723 }, { "epoch": 3.8747501665556294, "grad_norm": 0.430010546023126, "learning_rate": 2.9433207287464238e-08, "loss": 0.0267, "step": 8724 }, { "epoch": 3.8751943149011767, "grad_norm": 0.28855368882245647, "learning_rate": 2.9223550186543435e-08, "loss": 0.0137, "step": 8725 }, { "epoch": 3.8756384632467245, "grad_norm": 0.3534494715975773, "learning_rate": 2.9014640277761353e-08, "loss": 0.0274, "step": 8726 }, { "epoch": 3.8760826115922717, "grad_norm": 0.3093760532300058, "learning_rate": 2.8806477592521755e-08, "loss": 0.015, "step": 8727 }, { "epoch": 3.8765267599378195, "grad_norm": 0.6497895297842237, "learning_rate": 2.8599062162114056e-08, "loss": 0.037, "step": 8728 }, { "epoch": 3.8769709082833668, "grad_norm": 0.5014893102635929, "learning_rate": 2.8392394017716095e-08, "loss": 0.0344, "step": 8729 }, { "epoch": 3.877415056628914, "grad_norm": 0.3936192272437833, "learning_rate": 2.8186473190395246e-08, "loss": 0.0303, "step": 8730 }, { "epoch": 3.8778592049744613, "grad_norm": 0.4010118354626057, "learning_rate": 2.798129971110286e-08, "loss": 0.0307, "step": 8731 }, { "epoch": 3.8783033533200086, "grad_norm": 0.3784503093468852, "learning_rate": 2.7776873610681486e-08, "loss": 0.02, "step": 8732 }, { "epoch": 3.8787475016655564, "grad_norm": 0.4230263125616594, "learning_rate": 2.7573194919859325e-08, "loss": 0.0224, "step": 8733 }, { "epoch": 3.8791916500111037, "grad_norm": 0.4140738675854264, "learning_rate": 2.737026366925244e-08, "loss": 0.0258, "step": 8734 }, { "epoch": 3.879635798356651, "grad_norm": 0.42420423811982005, "learning_rate": 2.716807988936532e-08, "loss": 0.022, "step": 8735 }, { "epoch": 3.8800799467021987, "grad_norm": 0.3980720494346127, "learning_rate": 2.696664361058976e-08, "loss": 0.0269, "step": 8736 }, { "epoch": 3.880524095047746, "grad_norm": 0.3946254166839832, "learning_rate": 2.6765954863204323e-08, "loss": 0.0288, "step": 8737 }, { "epoch": 3.8809682433932933, "grad_norm": 0.3788227216815116, "learning_rate": 2.6566013677376545e-08, "loss": 0.0238, "step": 8738 }, { "epoch": 3.8814123917388406, "grad_norm": 0.34261939630726157, "learning_rate": 2.6366820083160715e-08, "loss": 0.0145, "step": 8739 }, { "epoch": 3.8818565400843883, "grad_norm": 0.42749605488948866, "learning_rate": 2.6168374110498995e-08, "loss": 0.0257, "step": 8740 }, { "epoch": 3.8823006884299356, "grad_norm": 0.4042334016798025, "learning_rate": 2.5970675789220855e-08, "loss": 0.0186, "step": 8741 }, { "epoch": 3.882744836775483, "grad_norm": 0.39831255423745965, "learning_rate": 2.577372514904475e-08, "loss": 0.0209, "step": 8742 }, { "epoch": 3.8831889851210306, "grad_norm": 0.42360207246789, "learning_rate": 2.5577522219575324e-08, "loss": 0.0275, "step": 8743 }, { "epoch": 3.883633133466578, "grad_norm": 0.48707455874974, "learning_rate": 2.5382067030304546e-08, "loss": 0.0282, "step": 8744 }, { "epoch": 3.884077281812125, "grad_norm": 0.4723400265193081, "learning_rate": 2.5187359610612805e-08, "loss": 0.0382, "step": 8745 }, { "epoch": 3.8845214301576725, "grad_norm": 0.6222325293211652, "learning_rate": 2.499339998976835e-08, "loss": 0.0342, "step": 8746 }, { "epoch": 3.88496557850322, "grad_norm": 0.35612361294226674, "learning_rate": 2.4800188196926757e-08, "loss": 0.0183, "step": 8747 }, { "epoch": 3.8854097268487675, "grad_norm": 0.42945931063185067, "learning_rate": 2.4607724261130893e-08, "loss": 0.0249, "step": 8748 }, { "epoch": 3.885853875194315, "grad_norm": 0.3845905954635827, "learning_rate": 2.441600821131096e-08, "loss": 0.0219, "step": 8749 }, { "epoch": 3.8862980235398625, "grad_norm": 0.3836148595852599, "learning_rate": 2.422504007628501e-08, "loss": 0.0207, "step": 8750 }, { "epoch": 3.88674217188541, "grad_norm": 0.3404491277084874, "learning_rate": 2.4034819884759532e-08, "loss": 0.0198, "step": 8751 }, { "epoch": 3.887186320230957, "grad_norm": 0.3619930083510835, "learning_rate": 2.3845347665327202e-08, "loss": 0.0191, "step": 8752 }, { "epoch": 3.8876304685765044, "grad_norm": 0.4286874704038195, "learning_rate": 2.3656623446469684e-08, "loss": 0.0299, "step": 8753 }, { "epoch": 3.8880746169220517, "grad_norm": 0.3418186033416821, "learning_rate": 2.3468647256554845e-08, "loss": 0.0201, "step": 8754 }, { "epoch": 3.8885187652675994, "grad_norm": 0.4273211578376309, "learning_rate": 2.3281419123838966e-08, "loss": 0.0254, "step": 8755 }, { "epoch": 3.8889629136131467, "grad_norm": 0.46117222518267137, "learning_rate": 2.3094939076465095e-08, "loss": 0.0235, "step": 8756 }, { "epoch": 3.8894070619586945, "grad_norm": 0.30961316233144026, "learning_rate": 2.2909207142464695e-08, "loss": 0.0178, "step": 8757 }, { "epoch": 3.8898512103042417, "grad_norm": 0.4313089227592287, "learning_rate": 2.2724223349756547e-08, "loss": 0.0334, "step": 8758 }, { "epoch": 3.890295358649789, "grad_norm": 0.45892558729550764, "learning_rate": 2.253998772614674e-08, "loss": 0.0358, "step": 8759 }, { "epoch": 3.8907395069953363, "grad_norm": 0.4320146677307238, "learning_rate": 2.235650029932923e-08, "loss": 0.0275, "step": 8760 }, { "epoch": 3.8911836553408836, "grad_norm": 0.47469416867571373, "learning_rate": 2.2173761096884737e-08, "loss": 0.0313, "step": 8761 }, { "epoch": 3.8916278036864314, "grad_norm": 0.38809247080958487, "learning_rate": 2.1991770146282953e-08, "loss": 0.0254, "step": 8762 }, { "epoch": 3.8920719520319786, "grad_norm": 0.36883556562596115, "learning_rate": 2.181052747487922e-08, "loss": 0.0223, "step": 8763 }, { "epoch": 3.892516100377526, "grad_norm": 0.4140272360800753, "learning_rate": 2.1630033109918403e-08, "loss": 0.0305, "step": 8764 }, { "epoch": 3.8929602487230737, "grad_norm": 0.4360012174993987, "learning_rate": 2.1450287078531028e-08, "loss": 0.0247, "step": 8765 }, { "epoch": 3.893404397068621, "grad_norm": 0.46749106481514396, "learning_rate": 2.127128940773604e-08, "loss": 0.0336, "step": 8766 }, { "epoch": 3.8938485454141682, "grad_norm": 0.48628664662774723, "learning_rate": 2.1093040124440246e-08, "loss": 0.0364, "step": 8767 }, { "epoch": 3.8942926937597155, "grad_norm": 0.4655918973858269, "learning_rate": 2.091553925543721e-08, "loss": 0.029, "step": 8768 }, { "epoch": 3.8947368421052633, "grad_norm": 0.4385695873892939, "learning_rate": 2.073878682740893e-08, "loss": 0.0332, "step": 8769 }, { "epoch": 3.8951809904508106, "grad_norm": 0.42920683680763916, "learning_rate": 2.056278286692359e-08, "loss": 0.0234, "step": 8770 }, { "epoch": 3.895625138796358, "grad_norm": 0.44077294633964265, "learning_rate": 2.0387527400437812e-08, "loss": 0.0171, "step": 8771 }, { "epoch": 3.8960692871419056, "grad_norm": 0.4448703366882169, "learning_rate": 2.0213020454295517e-08, "loss": 0.0241, "step": 8772 }, { "epoch": 3.896513435487453, "grad_norm": 0.3962409747625572, "learning_rate": 2.003926205472795e-08, "loss": 0.0303, "step": 8773 }, { "epoch": 3.896957583833, "grad_norm": 0.45182968264253826, "learning_rate": 1.986625222785421e-08, "loss": 0.0517, "step": 8774 }, { "epoch": 3.8974017321785475, "grad_norm": 0.3768583215614216, "learning_rate": 1.9693990999680167e-08, "loss": 0.0243, "step": 8775 }, { "epoch": 3.897845880524095, "grad_norm": 0.37480588059892506, "learning_rate": 1.952247839610011e-08, "loss": 0.0232, "step": 8776 }, { "epoch": 3.8982900288696425, "grad_norm": 0.43502480214398537, "learning_rate": 1.9351714442895077e-08, "loss": 0.0243, "step": 8777 }, { "epoch": 3.8987341772151898, "grad_norm": 0.33331146266153816, "learning_rate": 1.918169916573398e-08, "loss": 0.0223, "step": 8778 }, { "epoch": 3.8991783255607375, "grad_norm": 0.3283297894096133, "learning_rate": 1.9012432590172493e-08, "loss": 0.0182, "step": 8779 }, { "epoch": 3.899622473906285, "grad_norm": 0.4613996841914982, "learning_rate": 1.8843914741654146e-08, "loss": 0.0264, "step": 8780 }, { "epoch": 3.900066622251832, "grad_norm": 0.3728645160784641, "learning_rate": 1.8676145645511456e-08, "loss": 0.0219, "step": 8781 }, { "epoch": 3.9005107705973794, "grad_norm": 0.367364355087082, "learning_rate": 1.850912532696092e-08, "loss": 0.0169, "step": 8782 }, { "epoch": 3.9009549189429267, "grad_norm": 0.43782054130287257, "learning_rate": 1.8342853811110227e-08, "loss": 0.0244, "step": 8783 }, { "epoch": 3.9013990672884744, "grad_norm": 0.46621513501531153, "learning_rate": 1.817733112295217e-08, "loss": 0.0237, "step": 8784 }, { "epoch": 3.9018432156340217, "grad_norm": 0.3948393185866851, "learning_rate": 1.8012557287367394e-08, "loss": 0.0197, "step": 8785 }, { "epoch": 3.9022873639795694, "grad_norm": 0.5251739552887689, "learning_rate": 1.7848532329124978e-08, "loss": 0.0376, "step": 8786 }, { "epoch": 3.9027315123251167, "grad_norm": 0.3262674897809267, "learning_rate": 1.7685256272879646e-08, "loss": 0.0189, "step": 8787 }, { "epoch": 3.903175660670664, "grad_norm": 0.32400288553912415, "learning_rate": 1.7522729143174545e-08, "loss": 0.0185, "step": 8788 }, { "epoch": 3.9036198090162113, "grad_norm": 0.36601512988828766, "learning_rate": 1.7360950964441236e-08, "loss": 0.0196, "step": 8789 }, { "epoch": 3.9040639573617586, "grad_norm": 0.5081663386558348, "learning_rate": 1.7199921760997494e-08, "loss": 0.0312, "step": 8790 }, { "epoch": 3.9045081057073063, "grad_norm": 0.717337078541349, "learning_rate": 1.7039641557048402e-08, "loss": 0.035, "step": 8791 }, { "epoch": 3.9049522540528536, "grad_norm": 0.44298957493772145, "learning_rate": 1.6880110376686353e-08, "loss": 0.0368, "step": 8792 }, { "epoch": 3.905396402398401, "grad_norm": 0.34392561304252184, "learning_rate": 1.672132824389272e-08, "loss": 0.0178, "step": 8793 }, { "epoch": 3.9058405507439486, "grad_norm": 0.42295010360189583, "learning_rate": 1.6563295182534524e-08, "loss": 0.0251, "step": 8794 }, { "epoch": 3.906284699089496, "grad_norm": 0.3790549559366509, "learning_rate": 1.6406011216366647e-08, "loss": 0.0235, "step": 8795 }, { "epoch": 3.9067288474350432, "grad_norm": 0.41052130644928436, "learning_rate": 1.6249476369031845e-08, "loss": 0.0186, "step": 8796 }, { "epoch": 3.9071729957805905, "grad_norm": 0.4367038611052701, "learning_rate": 1.6093690664059635e-08, "loss": 0.0339, "step": 8797 }, { "epoch": 3.9076171441261383, "grad_norm": 0.37822957010835845, "learning_rate": 1.5938654124867394e-08, "loss": 0.0215, "step": 8798 }, { "epoch": 3.9080612924716855, "grad_norm": 0.3875394125371318, "learning_rate": 1.5784366774760362e-08, "loss": 0.0217, "step": 8799 }, { "epoch": 3.908505440817233, "grad_norm": 0.4043786874203682, "learning_rate": 1.563082863692944e-08, "loss": 0.0236, "step": 8800 }, { "epoch": 3.9089495891627806, "grad_norm": 0.4592008857137564, "learning_rate": 1.5478039734455053e-08, "loss": 0.0229, "step": 8801 }, { "epoch": 3.909393737508328, "grad_norm": 0.29589640326829375, "learning_rate": 1.5326000090303272e-08, "loss": 0.0162, "step": 8802 }, { "epoch": 3.909837885853875, "grad_norm": 0.42576417588302456, "learning_rate": 1.5174709727328595e-08, "loss": 0.0359, "step": 8803 }, { "epoch": 3.9102820341994224, "grad_norm": 0.4093180434069703, "learning_rate": 1.5024168668272275e-08, "loss": 0.0264, "step": 8804 }, { "epoch": 3.91072618254497, "grad_norm": 0.46276973652514886, "learning_rate": 1.4874376935763434e-08, "loss": 0.0289, "step": 8805 }, { "epoch": 3.9111703308905175, "grad_norm": 0.4688883489908628, "learning_rate": 1.4725334552318504e-08, "loss": 0.0181, "step": 8806 }, { "epoch": 3.9116144792360648, "grad_norm": 0.3039807302776599, "learning_rate": 1.4577041540340676e-08, "loss": 0.015, "step": 8807 }, { "epoch": 3.9120586275816125, "grad_norm": 0.3741536582807147, "learning_rate": 1.442949792212045e-08, "loss": 0.0235, "step": 8808 }, { "epoch": 3.91250277592716, "grad_norm": 0.5513005564638584, "learning_rate": 1.428270371983731e-08, "loss": 0.0238, "step": 8809 }, { "epoch": 3.912946924272707, "grad_norm": 0.37263633834750676, "learning_rate": 1.4136658955556381e-08, "loss": 0.0249, "step": 8810 }, { "epoch": 3.9133910726182544, "grad_norm": 0.36432340872278923, "learning_rate": 1.3991363651230106e-08, "loss": 0.0207, "step": 8811 }, { "epoch": 3.9138352209638017, "grad_norm": 0.49111074770917923, "learning_rate": 1.38468178286999e-08, "loss": 0.0296, "step": 8812 }, { "epoch": 3.9142793693093494, "grad_norm": 0.36763184203730237, "learning_rate": 1.3703021509692827e-08, "loss": 0.0262, "step": 8813 }, { "epoch": 3.9147235176548967, "grad_norm": 0.4095976827275787, "learning_rate": 1.3559974715823266e-08, "loss": 0.0297, "step": 8814 }, { "epoch": 3.9151676660004444, "grad_norm": 0.4248033682883231, "learning_rate": 1.3417677468595125e-08, "loss": 0.025, "step": 8815 }, { "epoch": 3.9156118143459917, "grad_norm": 0.4638888518779016, "learning_rate": 1.3276129789397407e-08, "loss": 0.0225, "step": 8816 }, { "epoch": 3.916055962691539, "grad_norm": 0.5594291449958435, "learning_rate": 1.3135331699506426e-08, "loss": 0.0285, "step": 8817 }, { "epoch": 3.9165001110370863, "grad_norm": 0.5046033957125147, "learning_rate": 1.2995283220087473e-08, "loss": 0.0329, "step": 8818 }, { "epoch": 3.9169442593826336, "grad_norm": 0.4673687167446593, "learning_rate": 1.2855984372191488e-08, "loss": 0.0256, "step": 8819 }, { "epoch": 3.9173884077281813, "grad_norm": 0.37484792767681735, "learning_rate": 1.2717435176758386e-08, "loss": 0.0271, "step": 8820 }, { "epoch": 3.9178325560737286, "grad_norm": 0.5235629211435013, "learning_rate": 1.2579635654613176e-08, "loss": 0.0353, "step": 8821 }, { "epoch": 3.918276704419276, "grad_norm": 0.37938171555742584, "learning_rate": 1.24425858264704e-08, "loss": 0.0236, "step": 8822 }, { "epoch": 3.9187208527648236, "grad_norm": 0.31241390505019695, "learning_rate": 1.2306285712931354e-08, "loss": 0.0175, "step": 8823 }, { "epoch": 3.919165001110371, "grad_norm": 0.3890372627975678, "learning_rate": 1.2170735334482986e-08, "loss": 0.0323, "step": 8824 }, { "epoch": 3.919609149455918, "grad_norm": 0.5060461363110739, "learning_rate": 1.2035934711501773e-08, "loss": 0.0213, "step": 8825 }, { "epoch": 3.9200532978014655, "grad_norm": 0.4210215040945434, "learning_rate": 1.1901883864250396e-08, "loss": 0.0201, "step": 8826 }, { "epoch": 3.9204974461470132, "grad_norm": 0.3987058322406445, "learning_rate": 1.1768582812878848e-08, "loss": 0.0265, "step": 8827 }, { "epoch": 3.9209415944925605, "grad_norm": 0.3908166480296464, "learning_rate": 1.1636031577424434e-08, "loss": 0.0204, "step": 8828 }, { "epoch": 3.921385742838108, "grad_norm": 0.4484182047406765, "learning_rate": 1.150423017781177e-08, "loss": 0.0262, "step": 8829 }, { "epoch": 3.9218298911836555, "grad_norm": 0.39560743278914223, "learning_rate": 1.1373178633853344e-08, "loss": 0.0301, "step": 8830 }, { "epoch": 3.922274039529203, "grad_norm": 0.44427517578613046, "learning_rate": 1.124287696524784e-08, "loss": 0.033, "step": 8831 }, { "epoch": 3.92271818787475, "grad_norm": 0.48713519459485133, "learning_rate": 1.111332519158237e-08, "loss": 0.0293, "step": 8832 }, { "epoch": 3.9231623362202974, "grad_norm": 0.38920760124214965, "learning_rate": 1.0984523332330244e-08, "loss": 0.0313, "step": 8833 }, { "epoch": 3.923606484565845, "grad_norm": 0.39034349333664137, "learning_rate": 1.0856471406852642e-08, "loss": 0.0227, "step": 8834 }, { "epoch": 3.9240506329113924, "grad_norm": 0.5491500358691043, "learning_rate": 1.0729169434398613e-08, "loss": 0.046, "step": 8835 }, { "epoch": 3.9244947812569397, "grad_norm": 0.38151095276050573, "learning_rate": 1.0602617434102846e-08, "loss": 0.0248, "step": 8836 }, { "epoch": 3.9249389296024875, "grad_norm": 0.4149228267505079, "learning_rate": 1.0476815424989018e-08, "loss": 0.0212, "step": 8837 }, { "epoch": 3.9253830779480348, "grad_norm": 0.4455456792947203, "learning_rate": 1.0351763425966999e-08, "loss": 0.0276, "step": 8838 }, { "epoch": 3.925827226293582, "grad_norm": 0.38613196310803816, "learning_rate": 1.022746145583453e-08, "loss": 0.0224, "step": 8839 }, { "epoch": 3.9262713746391293, "grad_norm": 0.4192361066154168, "learning_rate": 1.0103909533275557e-08, "loss": 0.0299, "step": 8840 }, { "epoch": 3.9267155229846766, "grad_norm": 0.6267500325799473, "learning_rate": 9.981107676862444e-09, "loss": 0.0255, "step": 8841 }, { "epoch": 3.9271596713302244, "grad_norm": 0.43799023094491873, "learning_rate": 9.859055905054871e-09, "loss": 0.0318, "step": 8842 }, { "epoch": 3.9276038196757717, "grad_norm": 0.40925366502801463, "learning_rate": 9.737754236198716e-09, "loss": 0.0237, "step": 8843 }, { "epoch": 3.9280479680213194, "grad_norm": 0.34421339563935327, "learning_rate": 9.617202688527727e-09, "loss": 0.02, "step": 8844 }, { "epoch": 3.9284921163668667, "grad_norm": 0.36452089076221383, "learning_rate": 9.497401280162966e-09, "loss": 0.0272, "step": 8845 }, { "epoch": 3.928936264712414, "grad_norm": 0.37821446185307267, "learning_rate": 9.378350029112248e-09, "loss": 0.0187, "step": 8846 }, { "epoch": 3.9293804130579613, "grad_norm": 0.4769435842108248, "learning_rate": 9.260048953271817e-09, "loss": 0.0304, "step": 8847 }, { "epoch": 3.9298245614035086, "grad_norm": 0.42416826190385143, "learning_rate": 9.142498070424111e-09, "loss": 0.0252, "step": 8848 }, { "epoch": 3.9302687097490563, "grad_norm": 0.4780719986413718, "learning_rate": 9.02569739823833e-09, "loss": 0.0243, "step": 8849 }, { "epoch": 3.9307128580946036, "grad_norm": 0.5064893232422892, "learning_rate": 8.9096469542721e-09, "loss": 0.029, "step": 8850 }, { "epoch": 3.931157006440151, "grad_norm": 0.472928807421496, "learning_rate": 8.794346755969795e-09, "loss": 0.0234, "step": 8851 }, { "epoch": 3.9316011547856986, "grad_norm": 0.40013223823456623, "learning_rate": 8.679796820663111e-09, "loss": 0.0344, "step": 8852 }, { "epoch": 3.932045303131246, "grad_norm": 0.40854096697741643, "learning_rate": 8.565997165570494e-09, "loss": 0.0317, "step": 8853 }, { "epoch": 3.932489451476793, "grad_norm": 0.6774113697293435, "learning_rate": 8.452947807798261e-09, "loss": 0.0265, "step": 8854 }, { "epoch": 3.9329335998223405, "grad_norm": 0.47756331700457766, "learning_rate": 8.340648764339487e-09, "loss": 0.0245, "step": 8855 }, { "epoch": 3.933377748167888, "grad_norm": 0.408015599055906, "learning_rate": 8.229100052074557e-09, "loss": 0.0266, "step": 8856 }, { "epoch": 3.9338218965134355, "grad_norm": 0.3723159509813085, "learning_rate": 8.118301687771169e-09, "loss": 0.0244, "step": 8857 }, { "epoch": 3.934266044858983, "grad_norm": 0.44846859119809684, "learning_rate": 8.008253688084888e-09, "loss": 0.0199, "step": 8858 }, { "epoch": 3.9347101932045305, "grad_norm": 0.3532724419502754, "learning_rate": 7.898956069556375e-09, "loss": 0.0167, "step": 8859 }, { "epoch": 3.935154341550078, "grad_norm": 0.38377139505288993, "learning_rate": 7.790408848616371e-09, "loss": 0.025, "step": 8860 }, { "epoch": 3.935598489895625, "grad_norm": 0.47706384193903933, "learning_rate": 7.682612041580161e-09, "loss": 0.0298, "step": 8861 }, { "epoch": 3.9360426382411724, "grad_norm": 0.47799572814835295, "learning_rate": 7.575565664652562e-09, "loss": 0.0232, "step": 8862 }, { "epoch": 3.9364867865867197, "grad_norm": 0.42569464791407546, "learning_rate": 7.469269733923478e-09, "loss": 0.0284, "step": 8863 }, { "epoch": 3.9369309349322674, "grad_norm": 0.49128556583785543, "learning_rate": 7.363724265371796e-09, "loss": 0.0322, "step": 8864 }, { "epoch": 3.9373750832778147, "grad_norm": 0.4426824222148885, "learning_rate": 7.258929274862048e-09, "loss": 0.0244, "step": 8865 }, { "epoch": 3.9378192316233624, "grad_norm": 0.4220498782198706, "learning_rate": 7.154884778147187e-09, "loss": 0.0381, "step": 8866 }, { "epoch": 3.9382633799689097, "grad_norm": 0.40573724780385634, "learning_rate": 7.051590790866925e-09, "loss": 0.0208, "step": 8867 }, { "epoch": 3.938707528314457, "grad_norm": 0.36192582224558784, "learning_rate": 6.949047328547731e-09, "loss": 0.0186, "step": 8868 }, { "epoch": 3.9391516766600043, "grad_norm": 0.44081181805446706, "learning_rate": 6.847254406603943e-09, "loss": 0.0305, "step": 8869 }, { "epoch": 3.9395958250055516, "grad_norm": 0.47409619445972684, "learning_rate": 6.746212040336653e-09, "loss": 0.0276, "step": 8870 }, { "epoch": 3.9400399733510993, "grad_norm": 0.3907822464278477, "learning_rate": 6.645920244934267e-09, "loss": 0.0265, "step": 8871 }, { "epoch": 3.9404841216966466, "grad_norm": 0.3877502095250313, "learning_rate": 6.546379035472505e-09, "loss": 0.0196, "step": 8872 }, { "epoch": 3.9409282700421944, "grad_norm": 0.36359890877779866, "learning_rate": 6.447588426913287e-09, "loss": 0.0207, "step": 8873 }, { "epoch": 3.9413724183877417, "grad_norm": 0.4046244907134214, "learning_rate": 6.349548434108066e-09, "loss": 0.0245, "step": 8874 }, { "epoch": 3.941816566733289, "grad_norm": 0.30688221794365883, "learning_rate": 6.252259071792277e-09, "loss": 0.0252, "step": 8875 }, { "epoch": 3.9422607150788362, "grad_norm": 0.3706992283845886, "learning_rate": 6.155720354590888e-09, "loss": 0.0179, "step": 8876 }, { "epoch": 3.9427048634243835, "grad_norm": 0.3606911516685271, "learning_rate": 6.059932297015625e-09, "loss": 0.0155, "step": 8877 }, { "epoch": 3.9431490117699313, "grad_norm": 0.3448866184147424, "learning_rate": 5.964894913464969e-09, "loss": 0.0238, "step": 8878 }, { "epoch": 3.9435931601154786, "grad_norm": 0.42743435978316485, "learning_rate": 5.8706082182241605e-09, "loss": 0.019, "step": 8879 }, { "epoch": 3.944037308461026, "grad_norm": 0.38374298902655224, "learning_rate": 5.777072225466307e-09, "loss": 0.0232, "step": 8880 }, { "epoch": 3.9444814568065736, "grad_norm": 0.3737396272194393, "learning_rate": 5.684286949251272e-09, "loss": 0.0218, "step": 8881 }, { "epoch": 3.944925605152121, "grad_norm": 0.3717422178901769, "learning_rate": 5.592252403526788e-09, "loss": 0.018, "step": 8882 }, { "epoch": 3.945369753497668, "grad_norm": 0.4530028394270273, "learning_rate": 5.500968602126788e-09, "loss": 0.0311, "step": 8883 }, { "epoch": 3.9458139018432155, "grad_norm": 0.34242197949439085, "learning_rate": 5.410435558773075e-09, "loss": 0.0229, "step": 8884 }, { "epoch": 3.946258050188763, "grad_norm": 0.39126976030692906, "learning_rate": 5.3206532870742065e-09, "loss": 0.0253, "step": 8885 }, { "epoch": 3.9467021985343105, "grad_norm": 0.43259511059270805, "learning_rate": 5.231621800525499e-09, "loss": 0.0222, "step": 8886 }, { "epoch": 3.9471463468798578, "grad_norm": 0.5231951089873627, "learning_rate": 5.143341112510691e-09, "loss": 0.0432, "step": 8887 }, { "epoch": 3.9475904952254055, "grad_norm": 0.4699803462882175, "learning_rate": 5.055811236299724e-09, "loss": 0.0242, "step": 8888 }, { "epoch": 3.948034643570953, "grad_norm": 0.4043702599111712, "learning_rate": 4.969032185049294e-09, "loss": 0.0367, "step": 8889 }, { "epoch": 3.9484787919165, "grad_norm": 0.41944855374085116, "learning_rate": 4.883003971803968e-09, "loss": 0.0236, "step": 8890 }, { "epoch": 3.9489229402620474, "grad_norm": 0.40457657146672565, "learning_rate": 4.797726609495623e-09, "loss": 0.0188, "step": 8891 }, { "epoch": 3.9493670886075947, "grad_norm": 0.4052288455507649, "learning_rate": 4.7132001109423396e-09, "loss": 0.0266, "step": 8892 }, { "epoch": 3.9498112369531424, "grad_norm": 0.5765436543403538, "learning_rate": 4.629424488850065e-09, "loss": 0.0362, "step": 8893 }, { "epoch": 3.9502553852986897, "grad_norm": 0.4676308344783614, "learning_rate": 4.546399755812059e-09, "loss": 0.0249, "step": 8894 }, { "epoch": 3.9506995336442374, "grad_norm": 0.28808412670077804, "learning_rate": 4.4641259243077825e-09, "loss": 0.0137, "step": 8895 }, { "epoch": 3.9511436819897847, "grad_norm": 0.47058698890390266, "learning_rate": 4.382603006705121e-09, "loss": 0.0319, "step": 8896 }, { "epoch": 3.951587830335332, "grad_norm": 0.41572017055736116, "learning_rate": 4.301831015257607e-09, "loss": 0.0221, "step": 8897 }, { "epoch": 3.9520319786808793, "grad_norm": 0.39457081647577197, "learning_rate": 4.221809962107193e-09, "loss": 0.0248, "step": 8898 }, { "epoch": 3.9524761270264266, "grad_norm": 0.5337090298271753, "learning_rate": 4.142539859282035e-09, "loss": 0.036, "step": 8899 }, { "epoch": 3.9529202753719743, "grad_norm": 0.4292452210463301, "learning_rate": 4.064020718698158e-09, "loss": 0.0276, "step": 8900 }, { "epoch": 3.9533644237175216, "grad_norm": 0.35441583703246865, "learning_rate": 3.986252552157788e-09, "loss": 0.0228, "step": 8901 }, { "epoch": 3.953808572063069, "grad_norm": 0.4031250072105655, "learning_rate": 3.909235371351017e-09, "loss": 0.028, "step": 8902 }, { "epoch": 3.9542527204086166, "grad_norm": 0.32842995453695917, "learning_rate": 3.832969187855251e-09, "loss": 0.0244, "step": 8903 }, { "epoch": 3.954696868754164, "grad_norm": 0.4192351275506548, "learning_rate": 3.757454013134099e-09, "loss": 0.0215, "step": 8904 }, { "epoch": 3.955141017099711, "grad_norm": 0.4517856702968858, "learning_rate": 3.682689858539035e-09, "loss": 0.0308, "step": 8905 }, { "epoch": 3.9555851654452585, "grad_norm": 0.4000805272591098, "learning_rate": 3.608676735308292e-09, "loss": 0.0257, "step": 8906 }, { "epoch": 3.9560293137908062, "grad_norm": 0.39861614075630786, "learning_rate": 3.5354146545668597e-09, "loss": 0.0268, "step": 8907 }, { "epoch": 3.9564734621363535, "grad_norm": 0.3761331007710691, "learning_rate": 3.462903627328151e-09, "loss": 0.0212, "step": 8908 }, { "epoch": 3.956917610481901, "grad_norm": 0.552448221226994, "learning_rate": 3.3911436644912256e-09, "loss": 0.0272, "step": 8909 }, { "epoch": 3.9573617588274486, "grad_norm": 0.40174202534071046, "learning_rate": 3.3201347768430093e-09, "loss": 0.0289, "step": 8910 }, { "epoch": 3.957805907172996, "grad_norm": 0.3916127999245288, "learning_rate": 3.249876975057187e-09, "loss": 0.0251, "step": 8911 }, { "epoch": 3.958250055518543, "grad_norm": 0.40161063795102814, "learning_rate": 3.1803702696947547e-09, "loss": 0.0201, "step": 8912 }, { "epoch": 3.9586942038640904, "grad_norm": 0.4175398836504673, "learning_rate": 3.111614671204022e-09, "loss": 0.0236, "step": 8913 }, { "epoch": 3.959138352209638, "grad_norm": 0.5695134220357339, "learning_rate": 3.043610189919499e-09, "loss": 0.0289, "step": 8914 }, { "epoch": 3.9595825005551855, "grad_norm": 0.47624104829660213, "learning_rate": 2.97635683606412e-09, "loss": 0.0319, "step": 8915 }, { "epoch": 3.9600266489007327, "grad_norm": 0.4611136348425102, "learning_rate": 2.909854619747021e-09, "loss": 0.0207, "step": 8916 }, { "epoch": 3.9604707972462805, "grad_norm": 0.49713796334737675, "learning_rate": 2.8441035509640947e-09, "loss": 0.0273, "step": 8917 }, { "epoch": 3.9609149455918278, "grad_norm": 0.5029210915242077, "learning_rate": 2.7791036395996563e-09, "loss": 0.0413, "step": 8918 }, { "epoch": 3.961359093937375, "grad_norm": 0.4809399615159702, "learning_rate": 2.7148548954236687e-09, "loss": 0.0326, "step": 8919 }, { "epoch": 3.9618032422829224, "grad_norm": 0.4107926474040777, "learning_rate": 2.6513573280939618e-09, "loss": 0.0243, "step": 8920 }, { "epoch": 3.9622473906284696, "grad_norm": 0.39038109738475957, "learning_rate": 2.5886109471551233e-09, "loss": 0.0251, "step": 8921 }, { "epoch": 3.9626915389740174, "grad_norm": 0.328940131759137, "learning_rate": 2.526615762039608e-09, "loss": 0.0163, "step": 8922 }, { "epoch": 3.9631356873195647, "grad_norm": 0.4003085651392341, "learning_rate": 2.465371782066073e-09, "loss": 0.0236, "step": 8923 }, { "epoch": 3.9635798356651124, "grad_norm": 0.4454242880030863, "learning_rate": 2.4048790164404866e-09, "loss": 0.0232, "step": 8924 }, { "epoch": 3.9640239840106597, "grad_norm": 0.34629533945323293, "learning_rate": 2.3451374742555764e-09, "loss": 0.0177, "step": 8925 }, { "epoch": 3.964468132356207, "grad_norm": 0.4133618406412272, "learning_rate": 2.2861471644919363e-09, "loss": 0.0191, "step": 8926 }, { "epoch": 3.9649122807017543, "grad_norm": 0.42649199370055857, "learning_rate": 2.2279080960163625e-09, "loss": 0.027, "step": 8927 }, { "epoch": 3.9653564290473016, "grad_norm": 0.3956675634414063, "learning_rate": 2.170420277584073e-09, "loss": 0.0263, "step": 8928 }, { "epoch": 3.9658005773928493, "grad_norm": 0.4346162397069076, "learning_rate": 2.1136837178353797e-09, "loss": 0.0258, "step": 8929 }, { "epoch": 3.9662447257383966, "grad_norm": 0.4508730863193407, "learning_rate": 2.05769842529957e-09, "loss": 0.0355, "step": 8930 }, { "epoch": 3.966688874083944, "grad_norm": 0.44677554596928787, "learning_rate": 2.002464408392135e-09, "loss": 0.0245, "step": 8931 }, { "epoch": 3.9671330224294916, "grad_norm": 0.3494473501914632, "learning_rate": 1.9479816754147672e-09, "loss": 0.0231, "step": 8932 }, { "epoch": 3.967577170775039, "grad_norm": 0.3733325402809653, "learning_rate": 1.894250234558137e-09, "loss": 0.0177, "step": 8933 }, { "epoch": 3.968021319120586, "grad_norm": 0.42732301014072877, "learning_rate": 1.8412700938985618e-09, "loss": 0.0245, "step": 8934 }, { "epoch": 3.9684654674661335, "grad_norm": 0.5850533107887496, "learning_rate": 1.7890412614002262e-09, "loss": 0.0173, "step": 8935 }, { "epoch": 3.9689096158116812, "grad_norm": 0.42327533255156274, "learning_rate": 1.7375637449135174e-09, "loss": 0.0217, "step": 8936 }, { "epoch": 3.9693537641572285, "grad_norm": 0.45340718199873464, "learning_rate": 1.68683755217669e-09, "loss": 0.0329, "step": 8937 }, { "epoch": 3.969797912502776, "grad_norm": 0.43589126986170634, "learning_rate": 1.6368626908147556e-09, "loss": 0.0294, "step": 8938 }, { "epoch": 3.9702420608483235, "grad_norm": 0.37544409700894205, "learning_rate": 1.5876391683400383e-09, "loss": 0.0356, "step": 8939 }, { "epoch": 3.970686209193871, "grad_norm": 0.3711146055897954, "learning_rate": 1.5391669921505093e-09, "loss": 0.0304, "step": 8940 }, { "epoch": 3.971130357539418, "grad_norm": 0.40437917019760156, "learning_rate": 1.4914461695336723e-09, "loss": 0.0186, "step": 8941 }, { "epoch": 3.9715745058849654, "grad_norm": 0.36049022928808616, "learning_rate": 1.4444767076626787e-09, "loss": 0.0221, "step": 8942 }, { "epoch": 3.972018654230513, "grad_norm": 0.49140822525474664, "learning_rate": 1.3982586135968813e-09, "loss": 0.0267, "step": 8943 }, { "epoch": 3.9724628025760604, "grad_norm": 0.5302909084020938, "learning_rate": 1.3527918942840556e-09, "loss": 0.041, "step": 8944 }, { "epoch": 3.9729069509216077, "grad_norm": 0.5476906455774254, "learning_rate": 1.3080765565592902e-09, "loss": 0.0273, "step": 8945 }, { "epoch": 3.9733510992671555, "grad_norm": 0.3333489521628196, "learning_rate": 1.2641126071433197e-09, "loss": 0.0268, "step": 8946 }, { "epoch": 3.9737952476127028, "grad_norm": 0.39064699714492984, "learning_rate": 1.2209000526447469e-09, "loss": 0.0204, "step": 8947 }, { "epoch": 3.97423939595825, "grad_norm": 0.5569247363465006, "learning_rate": 1.1784388995594864e-09, "loss": 0.0321, "step": 8948 }, { "epoch": 3.9746835443037973, "grad_norm": 0.4766119680419059, "learning_rate": 1.1367291542702107e-09, "loss": 0.0284, "step": 8949 }, { "epoch": 3.9751276926493446, "grad_norm": 0.5034524006899211, "learning_rate": 1.0957708230457942e-09, "loss": 0.0271, "step": 8950 }, { "epoch": 3.9755718409948924, "grad_norm": 0.3381689403401387, "learning_rate": 1.0555639120440887e-09, "loss": 0.0201, "step": 8951 }, { "epoch": 3.9760159893404396, "grad_norm": 0.46151159178049117, "learning_rate": 1.0161084273080378e-09, "loss": 0.0296, "step": 8952 }, { "epoch": 3.9764601376859874, "grad_norm": 0.4521959026876067, "learning_rate": 9.774043747690087e-10, "loss": 0.0311, "step": 8953 }, { "epoch": 3.9769042860315347, "grad_norm": 0.48398453476246417, "learning_rate": 9.394517602445697e-10, "loss": 0.0309, "step": 8954 }, { "epoch": 3.977348434377082, "grad_norm": 0.3856389581537802, "learning_rate": 9.022505894396017e-10, "loss": 0.0216, "step": 8955 }, { "epoch": 3.9777925827226293, "grad_norm": 0.3450747091674961, "learning_rate": 8.658008679462981e-10, "loss": 0.0157, "step": 8956 }, { "epoch": 3.9782367310681765, "grad_norm": 0.3798481275168741, "learning_rate": 8.301026012436098e-10, "loss": 0.0197, "step": 8957 }, { "epoch": 3.9786808794137243, "grad_norm": 0.4175802111530846, "learning_rate": 7.951557946972444e-10, "loss": 0.0254, "step": 8958 }, { "epoch": 3.9791250277592716, "grad_norm": 0.3948056525475575, "learning_rate": 7.609604535613324e-10, "loss": 0.0198, "step": 8959 }, { "epoch": 3.979569176104819, "grad_norm": 0.5080884160142413, "learning_rate": 7.275165829745412e-10, "loss": 0.0204, "step": 8960 }, { "epoch": 3.9800133244503666, "grad_norm": 0.3995310763869156, "learning_rate": 6.948241879650708e-10, "loss": 0.0296, "step": 8961 }, { "epoch": 3.980457472795914, "grad_norm": 0.37430290093081664, "learning_rate": 6.628832734467683e-10, "loss": 0.0238, "step": 8962 }, { "epoch": 3.980901621141461, "grad_norm": 0.43982737443124764, "learning_rate": 6.316938442213483e-10, "loss": 0.0209, "step": 8963 }, { "epoch": 3.9813457694870085, "grad_norm": 0.3775182755482849, "learning_rate": 6.012559049761723e-10, "loss": 0.025, "step": 8964 }, { "epoch": 3.981789917832556, "grad_norm": 0.39039801224690585, "learning_rate": 5.715694602875799e-10, "loss": 0.0207, "step": 8965 }, { "epoch": 3.9822340661781035, "grad_norm": 0.4199762038802494, "learning_rate": 5.426345146175571e-10, "loss": 0.0191, "step": 8966 }, { "epoch": 3.982678214523651, "grad_norm": 0.38333717972878917, "learning_rate": 5.144510723154028e-10, "loss": 0.0284, "step": 8967 }, { "epoch": 3.9831223628691985, "grad_norm": 0.4104443185350045, "learning_rate": 4.87019137617728e-10, "loss": 0.0189, "step": 8968 }, { "epoch": 3.983566511214746, "grad_norm": 0.5433078848500579, "learning_rate": 4.60338714647901e-10, "loss": 0.0302, "step": 8969 }, { "epoch": 3.984010659560293, "grad_norm": 0.3909125068421461, "learning_rate": 4.3440980741660254e-10, "loss": 0.0238, "step": 8970 }, { "epoch": 3.9844548079058404, "grad_norm": 0.41307839626778975, "learning_rate": 4.092324198212705e-10, "loss": 0.0235, "step": 8971 }, { "epoch": 3.984898956251388, "grad_norm": 0.321331298647483, "learning_rate": 3.848065556461e-10, "loss": 0.0236, "step": 8972 }, { "epoch": 3.9853431045969354, "grad_norm": 0.3312110185029968, "learning_rate": 3.6113221856370896e-10, "loss": 0.0219, "step": 8973 }, { "epoch": 3.9857872529424827, "grad_norm": 0.4192103414685381, "learning_rate": 3.382094121318069e-10, "loss": 0.0266, "step": 8974 }, { "epoch": 3.9862314012880304, "grad_norm": 0.35796707903705355, "learning_rate": 3.160381397965262e-10, "loss": 0.0177, "step": 8975 }, { "epoch": 3.9866755496335777, "grad_norm": 0.3908253098336717, "learning_rate": 2.946184048902012e-10, "loss": 0.0235, "step": 8976 }, { "epoch": 3.987119697979125, "grad_norm": 0.42457467356965617, "learning_rate": 2.7395021063303385e-10, "loss": 0.0397, "step": 8977 }, { "epoch": 3.9875638463246723, "grad_norm": 0.4884888876304642, "learning_rate": 2.540335601319832e-10, "loss": 0.0228, "step": 8978 }, { "epoch": 3.9880079946702196, "grad_norm": 0.45397167330183835, "learning_rate": 2.348684563802106e-10, "loss": 0.0215, "step": 8979 }, { "epoch": 3.9884521430157673, "grad_norm": 0.48150914332233963, "learning_rate": 2.1645490225929989e-10, "loss": 0.0207, "step": 8980 }, { "epoch": 3.9888962913613146, "grad_norm": 0.518549165106476, "learning_rate": 1.9879290053592682e-10, "loss": 0.0276, "step": 8981 }, { "epoch": 3.9893404397068624, "grad_norm": 0.3673159296164483, "learning_rate": 1.8188245386629998e-10, "loss": 0.0188, "step": 8982 }, { "epoch": 3.9897845880524097, "grad_norm": 0.34298096212254114, "learning_rate": 1.657235647917199e-10, "loss": 0.0169, "step": 8983 }, { "epoch": 3.990228736397957, "grad_norm": 0.4555639884295353, "learning_rate": 1.5031623574135456e-10, "loss": 0.0288, "step": 8984 }, { "epoch": 3.9906728847435042, "grad_norm": 0.3767875220229666, "learning_rate": 1.3566046903057405e-10, "loss": 0.0245, "step": 8985 }, { "epoch": 3.9911170330890515, "grad_norm": 0.3435105743106647, "learning_rate": 1.217562668631711e-10, "loss": 0.0192, "step": 8986 }, { "epoch": 3.9915611814345993, "grad_norm": 0.5264624768016674, "learning_rate": 1.0860363132914053e-10, "loss": 0.0215, "step": 8987 }, { "epoch": 3.9920053297801465, "grad_norm": 0.4654928703557639, "learning_rate": 9.620256440467934e-11, "loss": 0.0253, "step": 8988 }, { "epoch": 3.992449478125694, "grad_norm": 0.42855294671291005, "learning_rate": 8.455306795496221e-11, "loss": 0.0322, "step": 8989 }, { "epoch": 3.9928936264712416, "grad_norm": 0.33395128737336627, "learning_rate": 7.365514373081084e-11, "loss": 0.0159, "step": 8990 }, { "epoch": 3.993337774816789, "grad_norm": 0.7884624510448948, "learning_rate": 6.35087933698042e-11, "loss": 0.0429, "step": 8991 }, { "epoch": 3.993781923162336, "grad_norm": 0.4471065505434232, "learning_rate": 5.411401839738872e-11, "loss": 0.032, "step": 8992 }, { "epoch": 3.9942260715078834, "grad_norm": 0.3926302149338509, "learning_rate": 4.547082022632321e-11, "loss": 0.0269, "step": 8993 }, { "epoch": 3.994670219853431, "grad_norm": 0.35432194327597677, "learning_rate": 3.7579200155013483e-11, "loss": 0.024, "step": 8994 }, { "epoch": 3.9951143681989785, "grad_norm": 0.39044461483214893, "learning_rate": 3.043915937028796e-11, "loss": 0.025, "step": 8995 }, { "epoch": 3.9955585165445258, "grad_norm": 0.3981648535488869, "learning_rate": 2.4050698944622087e-11, "loss": 0.0221, "step": 8996 }, { "epoch": 3.9960026648900735, "grad_norm": 0.42669487275827683, "learning_rate": 1.8413819839468993e-11, "loss": 0.0231, "step": 8997 }, { "epoch": 3.996446813235621, "grad_norm": 0.33538547874165714, "learning_rate": 1.3528522901373741e-11, "loss": 0.0253, "step": 8998 }, { "epoch": 3.996890961581168, "grad_norm": 0.4935415446009399, "learning_rate": 9.394808864748861e-12, "loss": 0.0315, "step": 8999 }, { "epoch": 3.9973351099267154, "grad_norm": 0.37018534448771545, "learning_rate": 6.012678351319245e-12, "loss": 0.0243, "step": 9000 }, { "epoch": 3.9977792582722627, "grad_norm": 0.38372140221372397, "learning_rate": 3.3821318690119286e-12, "loss": 0.021, "step": 9001 }, { "epoch": 3.9982234066178104, "grad_norm": 0.3907968845917238, "learning_rate": 1.503169813621419e-12, "loss": 0.0222, "step": 9002 }, { "epoch": 3.9986675549633577, "grad_norm": 0.4131506866019968, "learning_rate": 3.757924676994762e-13, "loss": 0.0311, "step": 9003 }, { "epoch": 3.9991117033089054, "grad_norm": 0.47497534756496, "learning_rate": 0.0, "loss": 0.0269, "step": 9004 }, { "epoch": 3.9991117033089054, "eval_loss": 0.03634560480713844, "eval_runtime": 402.6359, "eval_samples_per_second": 37.667, "eval_steps_per_second": 1.177, "step": 9004 }, { "epoch": 3.9991117033089054, "step": 9004, "total_flos": 1010387024977920.0, "train_loss": 0.05092642861352228, "train_runtime": 98184.5209, "train_samples_per_second": 11.738, "train_steps_per_second": 0.092 } ], "logging_steps": 1, "max_steps": 9004, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1010387024977920.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }